In [7]:
import xml.etree.ElementTree as ET
import re
import requests
import html
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Load SpaCy model
nlp = spacy.load("en_core_web_sm")

def preprocess_text(content):
    content = html.unescape(content)
    content = re.sub(r'http\S+|www\S+', ' ', content)
    content = re.sub(r'<[^>]*>', ' ', content)
    content = re.sub(r'[^A-Za-z\s]', ' ', content)
    content = re.sub(r'\s+', ' ', content)
    content = content.strip()
    return content

def remove_stopwords(text):
    doc = nlp(text)
    tokens = [token.text.lower() for token in doc if token.is_alpha and not token.is_stop]
    return ' '.join(tokens)

def extract_keywords_tfidf(texts, top_n=20):
    vectorizer = TfidfVectorizer(max_features=top_n, ngram_range=(1, 2))
    tfidf_matrix = vectorizer.fit_transform(texts)
    feature_names = vectorizer.get_feature_names_out()
    sorted_features = sorted(zip(vectorizer.idf_, feature_names))
    keywords = [feature for _, feature in sorted_features[:top_n]]
    return keywords

def read_tags_and_articles(tags_file, num_articles_file):
    with open(tags_file, 'r', encoding='utf-8') as f:
        tags = [line.strip() for line in f.readlines()]

    with open(num_articles_file, 'r', encoding='utf-8') as f:
        num_articles = [int(line.strip().replace(',', '')) for line in f.readlines()]

    # Ensure that both files have the same number of lines
    assert len(tags) == len(num_articles), "Mismatch between number of tags and number of articles."

    # Create a DataFrame
    df = pd.DataFrame({
        'Tag': tags,
        'Number of Articles': num_articles
    })

    return df

def suggest_tags(combined_text, repository_df, top_n=10):
    try:
        if not combined_text or repository_df.empty:
            print("No combined text or repository tags to suggest tags from.")
            return []

        # Vectorize repository tags
        vectorizer = TfidfVectorizer().fit(repository_df['Tag'])
        tag_vectors = vectorizer.transform(repository_df['Tag'])
        combined_vector = vectorizer.transform([combined_text])

        cosine_sim = cosine_similarity(combined_vector, tag_vectors).flatten()
        
        # Get the top_n best matches based on similarity
        best_match_indices = cosine_sim.argsort()[::-1]
        best_matches = [repository_df.iloc[i]['Tag'] for i in best_match_indices if cosine_sim[i] > 0]

        return best_matches[:top_n]
    except Exception as e:
        print(f"Error in tag suggestion: {e}")
        return []

# Example usage
url = 'https://rss.oneindia.com/xml4apps/www.oneindia.com/latest.xml'
response = requests.get(url)
xml_content = response.content

try:
    root = ET.fromstring(xml_content)
except ET.ParseError:
    print("Error parsing XML content.")
    root = None

if root is not None:
    titles = []
    links = []
    summaries = []
    tags = []
    contents = []
    keywords_list = []
    suggested_tags_list = []

    # Define file paths
    tags_file = 'updated_tags.txt'
    num_articles_file = 'num_articles.txt'

    # Read tags and articles
    repository_df = read_tags_and_articles(tags_file, num_articles_file)

    for item in root.findall('.//Item'):
        title = item.find('Title').text
        link = item.find('Link').text
        summary = item.find('Summary').text
        tag = item.find('Tags').text

        try:
            linked_response = requests.get(link)
            linked_response.raise_for_status()  # Check if the request was successful
            linked_xml_content = linked_response.content

            try:
                linked_root = ET.fromstring(linked_xml_content)
            except ET.ParseError:
                print(f"Error parsing linked XML content from {link}")
                continue

            content = linked_root.find('.//Content').text if linked_root.find('.//Content') is not None else ''

            preprocessed_title = preprocess_text(title)
            preprocessed_summary = preprocess_text(summary)
            preprocessed_content = preprocess_text(content)

            combined_text = f"{preprocessed_title} {preprocessed_summary} {preprocessed_content}"
            cleaned_combined_text = remove_stopwords(combined_text)

            titles.append(title)
            links.append(link)
            summaries.append(preprocessed_summary)
            tags.append(tag)
            contents.append(preprocessed_content)

            # Extract keywords using TF-IDF
            tfidf_keywords = extract_keywords_tfidf([cleaned_combined_text], top_n=20)
            keywords_list.append(tfidf_keywords)

            # Suggest tags from repository tags based on similarity
            suggested_tags = suggest_tags(cleaned_combined_text, repository_df, top_n=10)
            suggested_tags_list.append(suggested_tags)
        except requests.RequestException as e:
            print(f"Error fetching linked content from {link}: {e}")

    for i in range(len(titles)):
        print('Title ==> ', titles[i])
        print('Link ==> ', links[i])
        print("Summary ==> ", summaries[i])
        print("Tags ==> ", tags[i])
        print('Extracted Keywords ==> ', keywords_list[i])
        print('Suggested Tags ==> ', suggested_tags_list[i])
        print("Content ==> ", contents[i])
        print('=============================================================================================================================')
else:
    print("No valid XML content found in the initial feed.")


Error fetching linked content from https://rss.oneindia.com/xml4apps/www.oneindia.com/sports/3894679.xml: 500 Server Error: Internal Server Error for url: https://rss.oneindia.com/xml4apps/www.oneindia.com/sports/3894679.xml
Title ==>  Cloudburst In Himachal Pradesh: 1 Dead, 32 Missing
Link ==>  https://rss.oneindia.com/xml4apps/www.oneindia.com/india/3894755.xml
Summary ==>  One person has died and at least are missing after cloudburst incidents in Himachal Pradesh on Thursday The incident occurred near a hydroelectric power project at Samej Khad in Shimla district s Rampur area A senior administrative officer told The Indian
Tags ==>  cloudburst, himachal pradesh
Extracted Keywords ==>  ['bjp', 'cloudburst', 'district', 'himachal', 'himachal pradesh', 'incident', 'minister', 'missing', 'missing cloudburst', 'nadda', 'person died', 'police', 'power', 'power project', 'pradesh', 'president', 'region', 'reports', 'rescue', 'thursday']
Suggested Tags ==>  ['cloudburst', 'himachal pradesh