## Installing necessary libraries

In [1]:
!pip install schedule

Collecting schedule
  Downloading schedule-1.2.2-py3-none-any.whl (12 kB)
Installing collected packages: schedule
Successfully installed schedule-1.2.2


In [2]:
import nltk
# Download required NLTK data
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

## Data Collection

In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def fetch_news_dawn(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    articles = []
    for item in soup.find_all('article'):
        title = item.find('h2').text.strip() if item.find('h2') else 'No title'

        # Extracting date string
        date_tag = item.find('span', class_='timestamp--time')
        date_str = date_tag['title'] if date_tag and 'title' in date_tag.attrs else None

        # Convert date_str to datetime
        if date_str:
            date = pd.to_datetime(date_str)
        else:
            date = None

        content_tag = item.find('div', class_='story__excerpt')
        if content_tag:
            content = content_tag.text.strip()
        else:
            content = 'No content'

        # Only add articles with non-empty content
        if content != 'No content':
            articles.append({'title': title, 'content': content, 'date_str': date_str, 'date': date})

    return articles


## Preprocessing

In [10]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re


# Function to preprocess text
def preprocess_text(text):
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'\W', ' ', text)  # Remove non-word characters
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces

    text = text.lower()
    stop_words = set(stopwords.words('english'))
    words = text.split()
    words = [word for word in words if word not in stop_words]

    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]

    return ' '.join(words)



## Classification


In [5]:
from transformers import pipeline, AutoTokenizer, AutoModel

# Initialize the classifier pipeline with a fine-tuned model
classifier = pipeline('text-classification', model='distilbert-base-uncased-finetuned-sst-2-english')

# Function to classify news based on content
def classify_news(text):
    return classifier(text)[0]['label']

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


## Similarity Detection

In [6]:
import numpy as np

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
transformer_model = AutoModel.from_pretrained("bert-base-uncased")

# Function to compute embeddings using transformers
def compute_transformer_embeddings(texts):
    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True)
    outputs = transformer_model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1).detach().numpy()
    return embeddings

# Function to find the most similar news using transformers
def find_most_similar_transformers(news_df, embeddings):
    def get_most_similar(idx):
        cosine_similarities = np.dot(embeddings, embeddings[idx]) / (np.linalg.norm(embeddings, axis=1) * np.linalg.norm(embeddings[idx]))
        most_similar_idx = np.argmax(cosine_similarities)
        return news_df.loc[most_similar_idx, 'content']

    news_df['most_similar_transformers'] = [get_most_similar(i) for i in range(len(news_df))]
    return news_df

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

## Ranking

In [7]:

# Ranking function based on most recent date
def rank_news(group):
    return group.sort_values(by='date', ascending=False)

## Scheduler and Load Balancing

In [11]:
import schedule
import time

# Function to perform the entire task
def scheduled_task():
    # Fetch news data from Dawn website
    url = 'https://www.dawn.com'
    news_data = fetch_news_dawn(url)

    # Create DataFrame from news data
    news_df = pd.DataFrame(news_data)

    # Remove rows with 'No content' in content column
    news_df = news_df[news_df['content'] != 'No content'].reset_index(drop=True)

    # Preprocess content
    news_df['cleaned_content'] = news_df['content'].apply(preprocess_text)

    # Classify news into categories
    news_df['category'] = news_df['cleaned_content'].apply(classify_news)

    # Compute transformer embeddings and find most similar news
    embeddings = compute_transformer_embeddings(news_df['cleaned_content'].tolist())
    news_df = find_most_similar_transformers(news_df, embeddings)

    # Group by category, apply ranking function
    ranked_news = news_df.groupby(['category']).apply(rank_news).reset_index(drop=True)

    # Save ranked news to CSV
    ranked_news.to_csv('ranked_news_data.csv', index=False)

    # Save analysis of categories
    analysis = news_df['category'].value_counts().to_frame().reset_index()
    analysis.columns = ['category', 'count']
    analysis.to_csv('news_analysis.csv', index=False)

    # Print execution count for debugging
    print(f"Task executed at {time.ctime()}")

# Schedule the task to run every 10 seconds (for testing purposes)
schedule.every(10).seconds.do(scheduled_task)


Every 10 seconds do scheduled_task() (last run: [never], next run: 2024-07-20 12:22:35)

## Run The Scheduler

In [12]:
# Function to run the scheduler and stop it based on a condition
def run_scheduler():
    start_time = time.time()
    while True:
        schedule.run_pending()
        time.sleep(1)
        if time.time() > start_time + 30:  # Stop after 30 seconds for testing
            break

# Run the scheduler
run_scheduler()

Task executed at Sat Jul 20 12:22:37 2024
Task executed at Sat Jul 20 12:22:42 2024
Task executed at Sat Jul 20 12:22:53 2024
Task executed at Sat Jul 20 12:23:00 2024
