In [3]:
import os

In [4]:
%pwd

'c:\\Users\\ambig\\jupiter_notebook\\Projects\\Kitwe-Local-News-Aggregator-Omdena-\\research'

In [5]:
# Move up to the project root so the `src.newsaggregator` imports in later
# cells resolve. Guarded on the presence of `src/` so that re-running this
# cell (or starting the kernel from the project root) does not climb an
# extra directory level.
if not os.path.isdir("src"):
    os.chdir("../")

In [6]:
%pwd

'c:\\Users\\ambig\\jupiter_notebook\\Projects\\Kitwe-Local-News-Aggregator-Omdena-'

In [78]:
from dataclasses import dataclass
from typing import List
from pathlib import Path
from typing import Dict

@dataclass
class DataCategorizingConfig:
    """Settings for the data-categorizing (labelling) stage.

    Built by ``ConfigurationManager.get_data_labelling_config`` and consumed
    by ``DataLabelling``.
    """
    # Working directory of this stage; ConfigurationManager creates it before
    # building this object.
    root_dir: Path
    # CSV file that DataLabelling reads with pandas; it must contain a
    # 'Description' column.
    input_path: Path
    # Destination CSV for the labelled DataFrame.
    output_path: Path
    # Maps each category name to the keywords whose occurrences in a
    # description are counted to pick that category.
    categories_keywords: Dict[str, List[str]]
   

In [79]:
from src.newsaggregator.constants import *
from src.newsaggregator.utils.common import read_yaml , create_directories
from src.newsaggregator import logger

In [80]:
import pandas as pd
import yaml
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier

In [81]:
class ConfigurationManager:
    """Read the project's YAML settings and build per-stage config objects.

    Constructing an instance reads the config, params and schema files with
    ``read_yaml``, logs the loaded configuration, and creates the directory
    named by the config's ``artifacts_root`` entry.
    """

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH,
    ):
        """Load the three YAML files and prepare the artifacts root.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        logger.info(f"Loaded configuration : {self.config} ")

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_labelling_config(self) -> DataCategorizingConfig:
        """Build the labelling-stage settings from the ``data_categorizer`` section.

        The stage's ``root_dir`` is created as a side effect.

        Returns:
            DataCategorizingConfig: the three paths wrapped in ``Path`` plus
            the ``categories_keywords`` entry passed through as loaded.
        """
        section = self.config.data_categorizer
        create_directories([section.root_dir])

        # The loaded section is read both by attribute (above) and by key
        # (below), exactly as the YAML loader's return object allows.
        stage_root = Path(section['root_dir'])
        source_csv = Path(section['input_path'])
        target_csv = Path(section['output_path'])
        keyword_map = section['categories_keywords']

        return DataCategorizingConfig(
            root_dir=stage_root,
            input_path=source_csv,
            output_path=target_csv,
            categories_keywords=keyword_map,
        )


In [88]:
class DataLabelling:
    """Give every news row exactly one category.

    Rows are first labelled by counting configured keywords in the
    'Description' text. Rows where no keyword matched are then labelled by a
    KNN classifier trained (on TF-IDF vectors) on the keyword-labelled rows.
    """

    # Label used for rows where no keyword matched. ``str.capitalize`` leaves
    # this exact spelling unchanged, so it is still comparable after
    # ``assign_single_categories`` capitalises the column.
    UNCATEGORIZED = 'Uncategorized'

    def __init__(self, config: "DataCategorizingConfig"):
        """Store paths and keyword map from ``config`` and load the input CSV.

        Args:
            config: Stage settings; ``input_path``, ``output_path`` and
                ``categories_keywords`` are read from it.
        """
        self.input_path = config.input_path
        self.output_path = config.output_path
        self.categories_keywords = config.categories_keywords

        # Load data
        self.data = self.load_data()
        self.vectorizer = TfidfVectorizer(max_features=5000)
        self.knn = KNeighborsClassifier(n_neighbors=5)

    def load_data(self):
        """Read the input CSV and normalise the 'Description' column to text.

        Missing descriptions become empty strings; without the ``fillna``
        they would be turned into the literal text "nan" by ``astype(str)``
        and fed to keyword matching and TF-IDF as if it were real content.

        Returns:
            The loaded DataFrame (also stored on ``self.data``).

        Raises:
            KeyError: if the CSV has no 'Description' column.
        """
        frame = pd.read_csv(self.input_path)
        frame['Description'] = frame['Description'].fillna('').astype(str)
        self.data = frame
        return frame

    def prioritize_category(self, description):
        """Return the category whose keywords occur most often in ``description``.

        Matching is case-insensitive on both sides and counts substring
        occurrences. On a tie the category listed first in
        ``categories_keywords`` wins. Empty or missing keywords are ignored.

        Args:
            description: Text of one news item.

        Returns:
            The winning category name as written in ``categories_keywords``,
            or ``'Uncategorized'`` when no keyword occurs at all.
        """
        text = str(description).lower()  # lower-case once, not per keyword
        keyword_count = {}
        for category, keywords in self.categories_keywords.items():
            count = 0
            for keyword in keywords or ():
                if keyword is None:
                    continue
                # Keywords must be lower-cased too, otherwise any keyword
                # containing a capital letter could never match ``text``.
                needle = str(keyword).lower()
                # "abc".count("") == 4, so an empty keyword would otherwise
                # dominate every description.
                if needle:
                    count += text.count(needle)
            if count > 0:
                keyword_count[category] = count
        if not keyword_count:
            return self.UNCATEGORIZED
        return max(keyword_count, key=keyword_count.get)

    def assign_single_categories(self):
        """Fill the 'categories' column using keyword prioritisation.

        Category names are capitalised (first letter upper-case, the rest
        lower-case) after assignment.

        Raises:
            ValueError: if ``categories_keywords`` is None.
        """
        if self.categories_keywords is None:
            raise ValueError("categories_keywords is None. Ensure config.yaml is loaded correctly.")

        # Assign to 'categories' column
        self.data['categories'] = self.data['Description'].apply(self.prioritize_category)
        # Capitalize the first letter of categories
        self.data['categories'] = self.data['categories'].str.capitalize()

    def train_knn_classifier(self):
        """Train KNN on keyword-labelled rows and label the remaining rows.

        Must run after ``assign_single_categories``. If fewer rows were
        keyword-labelled than the classifier's ``n_neighbors``, the
        classifier's ``n_neighbors`` is lowered to that row count (and stays
        lowered), because scikit-learn rejects a neighbour query larger than
        the fitted sample count.

        Raises:
            ValueError: if no row received a keyword-based category.
        """
        needs_label = self.data['categories'] == self.UNCATEGORIZED
        categorized_df = self.data[~needs_label]
        uncategorized_df = self.data[needs_label]

        # Ensure there is enough data to train
        if categorized_df.empty:
            raise ValueError("No categorized data available for training.")

        n_train = len(categorized_df)
        if self.knn.n_neighbors > n_train:
            self.knn.set_params(n_neighbors=n_train)

        # Convert text to TF-IDF vectors and train the classifier
        X_train_tfidf = self.vectorizer.fit_transform(categorized_df['Description'])
        self.knn.fit(X_train_tfidf, categorized_df['categories'])

        # Predict uncategorized entries
        if not uncategorized_df.empty:
            X_test_tfidf = self.vectorizer.transform(uncategorized_df['Description'])
            y_pred = self.knn.predict(X_test_tfidf)
            # Predictions are in the same row order as the boolean mask selects.
            self.data.loc[needs_label, 'categories'] = y_pred

    def save_output(self):
        """Write the labelled DataFrame to ``output_path`` as CSV (no index).

        When ``output_path`` is a filesystem path, its parent directory is
        created first so a missing folder does not fail the run at the very
        last step.
        """
        if isinstance(self.output_path, (str, Path)):
            Path(self.output_path).parent.mkdir(parents=True, exist_ok=True)
        self.data.to_csv(self.output_path, index=False)

    def get_categorized_data(self):
        """Return the DataFrame held by this instance."""
        return self.data

    def categorize(self):
        """Run keyword labelling, KNN fallback and saving, in that order.

        Returns:
            The labelled DataFrame.
        """
        self.assign_single_categories()  # must run first: creates 'categories'
        self.train_knn_classifier()
        self.save_output()
        return self.get_categorized_data()


In [89]:
# Example usage: run the labelling stage end to end.
if __name__ == "__main__":
    # Reads the config/params/schema YAML files and creates the artifacts root.
    config = ConfigurationManager()
    # Builds the stage settings from the `data_categorizer` config section.
    data_labelling_config = config.get_data_labelling_config()
    # The constructor loads the input CSV immediately.
    data_labelling = DataLabelling(data_labelling_config)
    # Keyword labelling, KNN fallback for unmatched rows, then CSV export.
    data_labelling.categorize()

[2024-10-30 08:42:59,175: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-10-30 08:42:59,176: INFO: common: yaml file: params.yaml loaded successfully]
[2024-10-30 08:42:59,179: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-10-30 08:42:59,181: INFO: 1361429372: Loaded configuration : {'artifacts_root': 'artifacts', 'data_ingestion': {'root_dir': 'artifacts/data_ingestion', 'News_websites': {'Copperbelt_Energy': 'https://cecinvestor.com/search/kitwe/feed/rss2/', 'ZNBC': 'https://znbc.co.zm/news/search/kitwe/feed/rss2/', 'News_Invasion_24': 'https://newsinvasion24.com/search/kitwe/feed/rss2/', 'Mwebantu': 'https://www.mwebantu.com/search/kitwe/feed/rss2/', 'Lusaka_Times': 'https://www.lusakatimes.com/search/kitwe/feed/rss2/', 'Kitwe_Online': 'https://kitweonline.com/search/kitwe/feed/rss2/', 'Daily_Revelation_Zambia': 'https://dailyrevelationzambia.com/search/kitwe/feed/rss2/', 'Zambia_Monitor': 'https://www.zambiamonitor.com/search/kitwe/feed/rss