## Installing necessary libraries

In [None]:
!pip install schedule

Collecting schedule
  Downloading schedule-1.2.2-py3-none-any.whl (12 kB)
Installing collected packages: schedule
Successfully installed schedule-1.2.2


In [None]:
!pip install APScheduler

Collecting APScheduler
  Downloading APScheduler-3.10.4-py3-none-any.whl (59 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/59.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.3/59.3 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: APScheduler
Successfully installed APScheduler-3.10.4


In [None]:
# Download required NLTK data
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

## Data Collection

In [49]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Function to fetch news from Dawn website
def fetch_news_dawn(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    articles = []
    for item in soup.find_all('article'):
        title = item.find('h2').text.strip() if item.find('h2') else 'No title'

        # Extracting date string
        date_tag = item.find('span', class_='timestamp--time')
        date_str = date_tag['title'] if date_tag and 'title' in date_tag.attrs else None

        # Convert date_str to datetime
        if date_str:
            date = pd.to_datetime(date_str)
        else:
            date = None

        content_tag = item.find('div', class_='story__excerpt')
        if content_tag:
            content = content_tag.text.strip()
        else:
            content = 'No content'

        # Only add articles with non-empty content
        if content != 'No content':
            articles.append({'title': title, 'content': content, 'date_str': date_str, 'date': date})

    return articles


## Preprocessing

In [50]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


# Function to preprocess text
def preprocess_text(text):
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'\W', ' ', text)  # Remove non-word characters
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces

    text = text.lower()
    stop_words = set(stopwords.words('english'))
    words = text.split()
    words = [word for word in words if word not in stop_words]

    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]

    return ' '.join(words)

## Classification


In [51]:
from transformers import pipeline, AutoTokenizer, AutoModel

# Initialize the classifier pipeline with a fine-tuned model
classifier = pipeline('text-classification', model='distilbert-base-uncased-finetuned-sst-2-english')

# Function to classify news based on content
def classify_news(text):
    return classifier(text)[0]['label']

## Adding Subcategories

In [52]:
# Function to determine subcategory based on cleaned content
def determine_subcategory(row):
    text = row['cleaned_content']

    # Sports subcategories
    if 'cricket' in text:
        return 'Cricket'
    elif 'football' in text or 'soccer' in text:
        return 'Football'
    elif 'hockey' in text:
        return 'Hockey'
    elif 'tennis' in text:
        return 'Tennis'
    elif 'basketball' in text:
        return 'Basketball'
    elif 'golf' in text:
        return 'Golf'
    elif 'rugby' in text:
        return 'Rugby'
    elif 'athletics' in text:
        return 'Athletics'

    # Business subcategories
    elif 'crypto' in text or 'cryptocurrency' in text or 'bitcoin' in text:
        return 'Crypto'
    elif 'stock' in text or 'market' in text or 'exchange' in text:
        return 'Stock Exchanges'
    elif 'finance' in text or 'economy' in text or 'bank' in text or 'investment' in text:
        return "Pakistan's Financial News"
    elif 'real estate' in text or 'property' in text:
        return 'Real Estate'
    elif 'trade' in text or 'commerce' in text:
        return 'Trade'
    elif 'oil' in text or 'gas' in text or 'energy' in text:
        return 'Energy'
    elif 'technology' in text or 'tech' in text or 'innovation' in text:
        return 'Technology'
    elif 'startup' in text or 'entrepreneur' in text:
        return 'Startups'

    # Culture subcategories
    elif 'music' in text or 'showbiz' in text or 'celebrity' in text or 'film' in text or 'movie' in text or 'theater' in text:
        return 'Music/Showbiz/Celebs'
    elif 'art' in text or 'gallery' in text or 'painting' in text or 'sculpture' in text:
        return 'Art'
    elif 'literature' in text or 'book' in text or 'poetry' in text or 'novel' in text or 'author' in text:
        return 'Literature'
    elif 'fashion' in text or 'style' in text or 'design' in text:
        return 'Fashion'
    elif 'food' in text or 'cuisine' in text or 'restaurant' in text:
        return 'Food'

    # Politics subcategories
    elif 'politic' in text or 'government' in text or 'election' in text or 'policy' in text:
        return 'Politics'
    elif 'corruption' in text or 'scandal' in text or 'bribery' in text:
        return 'Corruption'
    elif 'law' in text or 'court' in text or 'justice' in text:
        return 'Law'
    elif 'diplomacy' in text or 'foreign policy' in text:
        return 'Diplomacy'

    # International subcategories
    elif 'international' in text or 'world' in text or 'foreign' in text:
        return 'International'
    elif 'asia' in text or 'china' in text or 'india' in text or 'japan' in text or 'korea' in text:
        return 'Asia'
    elif 'europe' in text or 'european' in text or 'germany' in text or 'france' in text or 'uk' in text or 'england' in text:
        return 'Europe'
    elif 'america' in text or 'us' in text or 'usa' in text or 'canada' in text:
        return 'America'
    elif 'africa' in text or 'african' in text:
        return 'Africa'
    elif 'middle east' in text or 'arab' in text or 'iran' in text or 'saudi' in text:
        return 'Middle East'
    elif 'oceania' in text or 'australia' in text or 'new zealand' in text:
        return 'Oceania'

    # National subcategories
    elif 'pakistan' in text or 'pakistani' in text:
        return 'National'

    # Technology subcategories
    elif 'tech' in text or 'technology' in text or 'innovation' in text:
        return 'Technology'
    elif 'ai' in text or 'artificial intelligence' in text or 'machine learning' in text or 'data science' in text:
        return 'AI/ML/Data Science'
    elif 'software' in text or 'programming' in text or 'coding' in text:
        return 'Software Development'
    elif 'hardware' in text or 'computer' in text or 'electronics' in text:
        return 'Hardware'
    elif 'internet' in text or 'web' in text or 'online' in text:
        return 'Internet'
    elif 'social media' in text or 'facebook' in text or 'twitter' in text or 'instagram' in text:
        return 'Social Media'

    # Health subcategories
    elif 'health' in text or 'medicine' in text or 'hospital' in text or 'doctor' in text or 'nurse' in text:
        return 'Health'
    elif 'covid' in text or 'coronavirus' in text or 'pandemic' in text:
        return 'COVID-19'
    elif 'mental health' in text or 'psychology' in text or 'therapy' in text:
        return 'Mental Health'
    elif 'nutrition' in text or 'diet' in text or 'fitness' in text:
        return 'Nutrition/Fitness'

    # Education subcategories
    elif 'education' in text or 'school' in text or 'college' in text or 'university' in text or 'student' in text:
        return 'Education'
    elif 'scholarship' in text or 'grant' in text or 'fellowship' in text:
        return 'Scholarships/Grants'

    # Environment subcategories
    elif 'environment' in text or 'climate' in text or 'pollution' in text or 'conservation' in text:
        return 'Environment'
    elif 'wildlife' in text or 'animal' in text or 'biodiversity' in text:
        return 'Wildlife'
    elif 'sustainability' in text or 'renewable' in text or 'green' in text:
        return 'Sustainability'

    # Miscellaneous subcategories
    elif 'travel' in text or 'tourism' in text or 'vacation' in text:
        return 'Travel/Tourism'
    elif 'automobile' in text or 'car' in text or 'bike' in text:
        return 'Automobile'
    elif 'space' in text or 'nasa' in text or 'astronomy' in text:
        return 'Space/Astronomy'

    # Default to 'Other' if no specific subcategory found
    else:
        return 'Other'


## Similarity Detection

In [53]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import numpy as np

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
transformer_model = AutoModel.from_pretrained("bert-base-uncased")
# Function to create and train a Doc2Vec model
def train_doc2vec_model(news_df):
    tagged_data = [TaggedDocument(words=row['cleaned_content'].split(), tags=[str(i)]) for i, row in news_df.iterrows()]
    doc2vec_model = Doc2Vec(vector_size=50, alpha=0.025, min_alpha=0.025, min_count=1, dm=1)
    doc2vec_model.build_vocab(tagged_data)
    for epoch in range(100):
        doc2vec_model.train(tagged_data, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)
        doc2vec_model.alpha -= 0.0002
        doc2vec_model.min_alpha = doc2vec_model.alpha
    return doc2vec_model

# Function to find the most similar news using Doc2Vec
def find_most_similar_doc2vec(news_df, model):
    def get_most_similar(doc):
        inferred_vector = model.infer_vector(doc.split())
        sims = model.dv.most_similar([inferred_vector], topn=len(news_df))
        most_similar_idx = int(sims[0][0])
        return news_df.loc[most_similar_idx, 'content']

    news_df['most_similar_doc2vec'] = news_df['cleaned_content'].apply(get_most_similar)
    return news_df

# Function to compute embeddings using transformers
def compute_transformer_embeddings(texts):
    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True)
    outputs = transformer_model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1).detach().numpy()
    return embeddings

# Function to find the most similar news using transformers
def find_most_similar_transformers(news_df, embeddings):
    def get_most_similar(idx):
        cosine_similarities = np.dot(embeddings, embeddings[idx]) / (np.linalg.norm(embeddings, axis=1) * np.linalg.norm(embeddings[idx]))
        most_similar_idx = np.argmax(cosine_similarities)
        return news_df.loc[most_similar_idx, 'content']

    news_df['most_similar_transformers'] = [get_most_similar(i) for i in range(len(news_df))]
    return news_df

## Ranking

In [54]:
    # ranking function based on most recent date
    def rank_news(group):
        return group.sort_values(by='date', ascending=False)

## Scheduler and Load Balancing

In [55]:
import schedule
import time
# Function to perform the entire task
def scheduled_task():
    # Fetch news data from Dawn website
    url = 'https://www.dawn.com'
    news_data = fetch_news_dawn(url)

    # Create DataFrame from news data
    news_df = pd.DataFrame(news_data)

    # Remove rows with 'No content' in content column
    news_df = news_df[news_df['content'] != 'No content'].reset_index(drop=True)

    # Preprocess content
    news_df['cleaned_content'] = news_df['content'].apply(preprocess_text)

    # Classify news into categories
    news_df['category'] = news_df['cleaned_content'].apply(classify_news)

    # Determine subcategory
    news_df['subcategory'] = news_df.apply(determine_subcategory, axis=1)

    # Train Doc2Vec model and find most similar news
    doc2vec_model = train_doc2vec_model(news_df)
    news_df = find_most_similar_doc2vec(news_df, doc2vec_model)

    # Compute transformer embeddings and find most similar news
    embeddings = compute_transformer_embeddings(news_df['cleaned_content'].tolist())
    news_df = find_most_similar_transformers(news_df, embeddings)

    # Group by category, apply ranking function
    ranked_news = news_df.groupby(['category']).apply(rank_news).reset_index(drop=True)

    # Save ranked news to CSV
    ranked_news.to_csv('ranked_news_data.csv', index=False)

    # Save analysis of categories
    analysis = news_df['category'].value_counts().to_frame().reset_index()
    analysis.columns = ['category', 'count']
    analysis.to_csv('news_analysis.csv', index=False)

    # Print execution count for debugging
    print(f"Task executed at {time.ctime()}")

# Schedule the task to run every 10 seconds (for testing purposes)
schedule.every(10).seconds.do(scheduled_task)

Every 10 seconds do scheduled_task() (last run: [never], next run: 2024-07-09 08:58:13)

## Run The Scheduler

In [56]:
# Function to run the scheduler and stop it based on a condition
def run_scheduler():
    start_time = time.time()
    while True:
        schedule.run_pending()
        time.sleep(1)
        if time.time() > start_time + 30:  # Stop after 30 seconds for testing
            break

# Run the scheduler
run_scheduler()

Task executed at Tue Jul  9 08:58:20 2024
Task executed at Tue Jul  9 08:58:33 2024
Task executed at Tue Jul  9 08:58:37 2024
Task executed at Tue Jul  9 08:58:44 2024
Task executed at Tue Jul  9 08:58:47 2024
Task executed at Tue Jul  9 08:58:50 2024
Task executed at Tue Jul  9 08:59:00 2024
Task executed at Tue Jul  9 08:59:11 2024
Task executed at Tue Jul  9 08:59:20 2024
Task executed at Tue Jul  9 08:59:31 2024
Task executed at Tue Jul  9 08:59:41 2024
