In [1]:
import os

In [2]:
# Show the kernel's current working directory before moving to the project root.
%pwd

'c:\\Users\\ambig\\jupiter_notebook\\Projects\\Kitwe-Local-News-Aggregator-Omdena-\\research'

In [3]:
# Step up from research/ to the project root so the `src.*` imports and the
# relative config paths used by later cells resolve.
# Guarded on the directory name so that re-running this cell is a no-op instead
# of climbing one directory higher on every execution.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
# Confirm the working directory after the os.chdir call in the previous cell.
%pwd

'c:\\Users\\ambig\\jupiter_notebook\\Projects\\Kitwe-Local-News-Aggregator-Omdena-'

In [32]:
from dataclasses import dataclass
from typing import List
from pathlib import Path
from typing import Dict

@dataclass
class DataLabellingConfig:
    """Settings for the data-labelling stage.

    Instances are built by ConfigurationManager.get_data_labelling_config()
    from the `data_labelling` section of the loaded configuration.
    """

    # Directory for this stage; created by ConfigurationManager before use.
    root_dir: Path
    # CSV file that DataLabelling.load_data() reads.
    input_path: Path
    # CSV file that DataLabelling.save_predictions() writes.
    output_path: Path
    # Value of the `titles` config key.
    # NOTE(review): presumably the headline column name -- confirm against config.yaml.
    titles: str
    # Value of the `texts` config key.
    # NOTE(review): presumably the body-text column name -- confirm against config.yaml.
    texts: str

In [33]:
from src.newsaggregator.constants import *
from src.newsaggregator.utils.common import read_yaml , create_directories
from src.newsaggregator import logger

In [34]:
class ConfigurationManager:
    """Reads the project's YAML files and builds per-stage config objects."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH,
    ):
        """Load the config, params and schema files and create the artifacts root.

        Args:
            config_filepath: Path handed to `read_yaml` for the main configuration.
            params_filepath: Path handed to `read_yaml` for the parameters.
            schema_filepath: Path handed to `read_yaml` for the schema.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_labelling_config(self) -> DataLabellingConfig:
        """Build the DataLabellingConfig from the `data_labelling` config section.

        The stage's root directory is created before the config object is built.

        Returns:
            A DataLabellingConfig whose three path fields are wrapped in `Path`
            and whose `titles` / `texts` fields are passed through unchanged.
        """
        section = self.config.data_labelling
        create_directories([section.root_dir])

        # Wrap the three path-valued keys in Path, in declaration order.
        path_fields = {
            key: Path(section[key])
            for key in ('root_dir', 'input_path', 'output_path')
        }
        return DataLabellingConfig(
            **path_fields,
            titles=section['titles'],
            texts=section['texts'],
        )
    
    

In [39]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import pandas as pd
import torch


class DataLabelling:
    """Label news articles as Fake/Real and assign each one a topic.

    Two Hugging Face sequence-classification checkpoints are loaded when the
    object is constructed: one for fake-news detection and one for topic
    classification. `run()` chains load -> predict -> save.
    """

    # Checkpoint identifiers passed to `from_pretrained`.
    FAKE_NEWS_CHECKPOINT = "hamzab/roberta-fake-news-classification"
    TOPIC_CHECKPOINT = "dstefa/roberta-base_topic_classification_nyt_news"
    # Token budget applied when truncating input for both classifiers.
    MAX_TOKENS = 512

    def __init__(self, config: "DataLabellingConfig"):
        """Store the I/O paths and load both models onto the available device.

        Args:
            config: Object providing `input_path`, `output_path` and `titles`.
        """
        self.input_path = config.input_path
        self.output_path = config.output_path
        # NOTE(review): this value is replaced by the 'Headline' column contents
        # as soon as load_data() runs, and config.texts is never read -- confirm
        # whether the column names were meant to come from the config.
        self.titles = config.titles
        self.texts = []

        # Load the models and tokenizers
        self.fake_news_tokenizer = AutoTokenizer.from_pretrained(self.FAKE_NEWS_CHECKPOINT)
        self.fake_news_model = AutoModelForSequenceClassification.from_pretrained(self.FAKE_NEWS_CHECKPOINT)

        self.topic_tokenizer = AutoTokenizer.from_pretrained(self.TOPIC_CHECKPOINT)
        self.topic_model = AutoModelForSequenceClassification.from_pretrained(self.TOPIC_CHECKPOINT)

        # Use the GPU when one is available, otherwise fall back to the CPU.
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.fake_news_model.to(self.device)
        self.topic_model.to(self.device)

    @staticmethod
    def _as_text_list(column):
        """Return the column's values as strings, mapping missing values to ''."""
        return ["" if pd.isna(value) else str(value) for value in column]

    def load_data(self, sample_size=100, random_state=None):
        """Read the input CSV, optionally sample rows, and cache titles/texts.

        Args:
            sample_size: Maximum number of rows to sample. It is capped at the
                number of rows available, so small files no longer raise.
                Pass None to keep every row in file order.
            random_state: Optional seed forwarded to `DataFrame.sample` so the
                sampled subset can be reproduced.

        Returns:
            The (sampled) DataFrame. `self.titles` and `self.texts` hold the
            'Headline' and 'Description' values of the same rows, in the same
            order, with missing values replaced by empty strings.
        """
        df = pd.read_csv(self.input_path)
        if sample_size is not None:
            df = df.sample(n=min(sample_size, len(df)), random_state=random_state)
        # Missing cells are read as float NaN; convert them so neither
        # classifier receives a non-string or the literal text "nan".
        self.titles = self._as_text_list(df['Headline'])
        self.texts = self._as_text_list(df['Description'])
        return df

    def predict_fake_news(self, threshold=0.5):
        """Predict if news is Fake or Real.

        Args:
            threshold: An article is labelled 'Fake' when the softmax
                probability at index 0 is strictly greater than this value.

        Returns:
            A list of 'Fake' / 'Real' strings, one per (title, text) pair.
        """
        predictions = []
        for title, text in zip(self.titles, self.texts):
            input_str = f"<title>{title}<content>{text}<end>"
            encoding = self.fake_news_tokenizer.encode_plus(
                input_str,
                max_length=self.MAX_TOKENS,
                padding="max_length",
                truncation=True,
                return_tensors="pt"
            )

            # Move input tensors to the same device as the model.
            input_ids = encoding['input_ids'].to(self.device)
            attention_mask = encoding['attention_mask'].to(self.device)

            with torch.no_grad():
                outputs = self.fake_news_model(input_ids, attention_mask=attention_mask)

            # Index 0 of the class probabilities is treated as the 'Fake' score.
            probabilities = torch.nn.Softmax(dim=1)(outputs.logits)
            predictions.append('Fake' if probabilities[0][0].item() > threshold else 'Real')

        return predictions

    def predict_topic(self):
        """Predict the topic of each news article.

        Returns:
            A list with the predicted label for every entry of `self.texts`;
            an empty list when there is nothing to classify.
        """
        if not self.texts:
            return []
        classifier = pipeline("text-classification", model=self.topic_model, tokenizer=self.topic_tokenizer)
        # Truncate long articles to the same token budget as the fake-news
        # model so over-length descriptions are cut to fit instead of being
        # passed to the model at full length.
        topic_predictions = classifier(self.texts, truncation=True, max_length=self.MAX_TOKENS)
        return [pred['label'] for pred in topic_predictions]

    def save_predictions(self, df, fake_news_predictions, topic_predictions):
        """Save the predictions to the output CSV file.

        Adds 'Target' and 'Category' columns to `df` (in place) and writes the
        frame to `self.output_path` without the index.

        Args:
            df: DataFrame returned by `load_data`.
            fake_news_predictions: One 'Fake'/'Real' label per row of `df`.
            topic_predictions: One topic label per row of `df`.
        """
        # Ensure the output directory exists. dirname is '' for a bare file
        # name, and os.makedirs('') raises, so only create it when present.
        output_dir = os.path.dirname(self.output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        df['Target'] = fake_news_predictions
        df['Category'] = topic_predictions
        df.to_csv(self.output_path, index=False)
        print(f"Predictions saved to '{self.output_path}'.")

    def run(self):
        """Run the data labeling process: load, predict both labels, save."""
        df = self.load_data()
        fake_news_predictions = self.predict_fake_news()
        topic_predictions = self.predict_topic()
        self.save_predictions(df, fake_news_predictions, topic_predictions)



In [40]:
# Example usage: build the configuration, then run the whole labelling stage.
# NOTE: constructing DataLabelling loads two Hugging Face checkpoints and run()
# classifies every sampled row one at a time, so this cell may take a while.
if __name__ == "__main__":
    config = ConfigurationManager()
    data_labelling_config = config.get_data_labelling_config()
    data_labelling = DataLabelling(data_labelling_config)
    data_labelling.run()

[2024-10-29 18:30:14,338: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-10-29 18:30:14,386: INFO: common: yaml file: params.yaml loaded successfully]
[2024-10-29 18:30:14,403: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-10-29 18:30:14,413: INFO: common: created directory at: artifacts]
[2024-10-29 18:30:14,420: INFO: common: created directory at: artifacts/data_labeleling]


KeyboardInterrupt: 