# Ejercicio 1

In [20]:
import pandas as pd

class NaiveBayesClassifier:

    def train(self, df, alpha=1):
        grouped_classes = df.groupby('Nacionalidad')
        self.classes = df['Nacionalidad'].unique()
        self.attributes = df.columns[0: -1]
        self.class_probabilities = {}
        self.attribute_probabilities = {}
        total_len = len(df)

        for _class, grouped in grouped_classes:
            data_len = len(grouped)
            self.class_probabilities[_class] = (data_len + alpha) / (total_len + alpha * len(self.classes))
            self.attribute_probabilities[_class] = {}
            for attribute in df.columns[0:-1]:    # Skipping last column (Nacionalidad)
                self.attribute_probabilities[_class][attribute] = (grouped[attribute].sum() + alpha) / (data_len + alpha * len(self.attributes))
    
    def print_probabilities(self, values):
        prediction = {}
        for _class in self.classes:
            probability = self.class_probabilities[_class]
            for i,value in enumerate(values):
                if value == 1:
                    probability *= self.attribute_probabilities[_class][self.attributes[i]]
                else: 
                    probability *= 1 - self.attribute_probabilities[_class][self.attributes[i]] # 1 - P(A) = P(not A)
            prediction[_class] = probability
        for _class, value in prediction.items():
            print(_class, value / sum(prediction.values()))  # divide the probability by the sum of all probabilities to get the normalized probability


file = 'inputs/PreferenciasBritanicos.xlsx'
brits_df = pd.read_excel(file)
classifier = NaiveBayesClassifier()
classifier.train(brits_df)
# 1.B
classifier.print_probabilities([1, 0, 1, 1, 1])
print()
# 1.C
classifier.print_probabilities([0, 1, 1, 0, 1])

I 0.25260461382104393
E 0.7473953861789561

I 0.623508361311908
E 0.37649163868809205


# Ejercicio 2

## Paso 1: preprocesamiento del dataset
Primero necesitamos hacer un preprocesamiento de los datos. Para esto decidimos reducir el dataset a 4 categorías, preferencialmente que no guarden demasiada correlación entre ellas en cuanto a las palabras utilizadas y asegurarnos que el modelo logre diferenciarlas correctamente sin tener que lidiar con las otras categorías. Una vez que el objetivo sea cumplido agregaremos devuelta las categorías filtradas para utilizar el dataset completo.

Para el preprocesamiento de los titulos utilizaremos tecnicas utilizadas en NLP y las aplicaremos a los titulos, procedemos a la tokenización de las palabras en los titulos,  

In [19]:
import re
import nltk
import pandas as pd
from collections import defaultdict
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

class NewsClassifier:

    def __init__(self):
        self.train_df = None
        self.test_df = None
        self.vocabulary = None
        self.vocabulary_size = None
        self.word_freq_per_class = defaultdict(lambda: defaultdict(lambda: 0))
        self.total_tokens_per_class = defaultdict(int)
        self.class_frequency_map = None
        self.labels = None
        self.confusion_matrix = None

    def preprocess_text_spanish(self,text, use_stopwords, use_stemmer):
        # Convert to lowercase and remove punctuation
        text = re.sub(r'[^\w\s]', '', text.lower())
    
        # Tokenization
        tokens = text.split()
        if use_stopwords:
            # Remove stop words
            stop_words = set(stopwords.words("spanish"))
            tokens = [token for token in tokens if token not in stop_words]
    
        if use_stemmer:
            # Stemming (use SnowballStemmer for Spanish)
            stemmer = SnowballStemmer("spanish")
            tokens = [stemmer.stem(token) for token in tokens]
        return tokens

    def split_dataset(self, test_percentage, dataset_path, categories):
        file = dataset_path
        news_df = pd.read_excel(file)
        accepted_cats = categories
        filtered_df = news_df[news_df['categoria'].isin(accepted_cats)]
        if test_percentage != 0:
            self.train_df, self.test_df = train_test_split(filtered_df, test_size=test_percentage/100, random_state=42)
            self.labels = self.train_df['categoria']
        else:
            self.train_df = filtered_df
            self.labels = filtered_df['categoria']

    def train(self):
        # a cada titular le aplico la función de tokenización
        preprocessed_titles = [self.preprocess_text_spanish(title, True, True) for title in self.train_df['titular']]
        preprocessed_titles_strings = [" ".join(tokens) for tokens in preprocessed_titles]
        vectorizer = CountVectorizer()
        X = vectorizer.fit_transform(preprocessed_titles_strings)

        # Build the vocabulary
        self.vocabulary = vectorizer.get_feature_names_out()
        self.vocabulary_size = len(self.vocabulary)

        self.class_frequency_map = self.labels.value_counts(normalize=True).to_dict() #calcula la frecuencia relativa para cada clase

        # Count word occurrences in each class
        for i, doc in enumerate(X):
            tokens = preprocessed_titles[i]
            self.total_tokens_per_class[self.labels.iloc[i]] += len(tokens)
            for word_idx in doc.indices:
                word = self.vocabulary[word_idx]
                for token in tokens:
                    self.word_freq_per_class[token][self.labels.iloc[i]] += 1
        
                
    def predict_title(self, new_title):

        # Preprocess the new title
        new_title_tokens = self.preprocess_text_spanish(new_title, False, False)

        # Calculate conditional probabilities
        predicted_probabilities = {}
        for class_label in self.class_frequency_map.keys():
            class_frequency = self.class_frequency_map[class_label]
            prob_word_given_class = class_frequency
            for token in new_title_tokens:
                word_freq = self.word_freq_per_class[token][class_label]
                prob_word_given_class *= (word_freq+1) / (sum(self.word_freq_per_class[token].values()) + len(self.class_frequency_map.keys()))
            predicted_probabilities[class_label] =prob_word_given_class

        # Predict the category with the highest probability
        #print(predicted_probabilities)
        predicted_category = max(predicted_probabilities, key=predicted_probabilities.get)
        #print("Predicted Category:", predicted_category)
        return predicted_category

    def calculate_confusion_matrix_ROC(self,label):
        roc_matrix = {}
        roc_matrix[label] = {}
        roc_matrix['other'] = {}
        roc_matrix[label][label] =0
        roc_matrix[label]['other'] =0
        roc_matrix['other'][label] =0
        roc_matrix['other']['other'] =0
        for index, row in self.test_df.iterrows():
            expected_result = row['categoria']
            predicted_result = self.predict_title(row['titular'])
            if expected_result==predicted_result:
                if expected_result == label:
                    roc_matrix[label][label] += 1
                else:
                   roc_matrix['other']['other'] += 1 
            else:
                if expected_result == label:
                    roc_matrix[label]['other'] += 1
                else:
                    roc_matrix['other'][label] += 1
        return roc_matrix
    
    
    def calculate_confusion_matrix(self):
        label_ocurrences = {}
        self.confusion_matrix = {}
        for label in self.class_frequency_map.keys():
            label_ocurrences[label] = (self.test_df['categoria'] == label).sum()
            self.confusion_matrix[label] = {}
            for column_label in self.class_frequency_map.keys():
                self.confusion_matrix[label][column_label] = 0
        for index, row in self.test_df.iterrows():
            expected_result = row['categoria']
            predicted_result = self.predict_title(row['titular'])
            self.confusion_matrix[expected_result][predicted_result] += 1
    
    def change_confusion_matrix_to_freq(self):
        for row in self.confusion_matrix.keys():
            for column in self.confusion_matrix[row].keys():
                self.confusion_matrix[row][column] /= label_ocurrences[row]
                
    def print_confusion_matrix(self):
        for row in self.confusion_matrix.keys():
            for column in self.confusion_matrix[row].keys():
                print(row, column, self.confusion_matrix[row][column])
        for label in label_ocurrences.keys():
            print(label,":", label_ocurrences[label])
            
    def calculate_accuracy(self, matrix):
        asserted = 0
        total = 0
        for row in matrix.keys():
            for column in matrix.keys():
                if row == column:
                    asserted += matrix[row][column]
                total += matrix[row][column]
        return asserted / total
        
    def calculate_true_positives(self, category, matrix):
        false_negatives = 0
        true_positives = 0
        for column in matrix.keys():
            if column != category:
                false_negatives += matrix[category][column]
            else:
                true_positives += matrix[category][column]             
        return true_positives/(true_positives + false_negatives)
        
    def calculate_precision(self, category, matrix):
        true_positives = 0
        false_positives = 0
        for row in matrix.keys():
            if row != category:
                false_positives += matrix[row][category]
            else:
                true_positives += matrix[row][category]             
        return true_positives/(true_positives + false_positives)
        
    def calculate_false_positives(self, category, matrix):
        true_negatives = 0
        false_positives = 0
        for row in matrix.keys():
            if row != category:
                false_positives += matrix[row][category]
                
        for row in matrix.keys():
            for column in matrix.keys():
                if row == column and row != category:
                    true_negatives += matrix[row][column]
        return false_positives / (false_positives + true_negatives)
        
    def calculate_f1_score(self,category, matrix):
        precision = self.calculate_precision(category, matrix)
        recall = self.calculate_true_positives(category, matrix)
        return (2*precision*recall)/(precision+recall)
                

file = 'inputs/NoticiasArgentinas.xlsx'
categories = ['Nacional', 'Ciencia y Tecnologia', 'Deportes', 'Salud']
classifier = NewsClassifier()
classifier.split_dataset(20, file, categories)
classifier.train()
classifier.predict_title('La próxima semana habrá paro total de transporte')
roc_matrix = classifier.calculate_confusion_matrix_ROC('Nacional')
print("accuracy: ",classifier.calculate_accuracy(roc_matrix))
print("true_positives: ",classifier.calculate_true_positives('Nacional',roc_matrix))
print("precision: ",classifier.calculate_precision('Nacional', roc_matrix))
print("f1-score: ",classifier.calculate_f1_score('Nacional', roc_matrix))
print("false_positives: ",classifier.calculate_false_positives('Nacional', roc_matrix))

accuracy:  0.7362958157638664
true_positives:  0.6763540290620872
precision:  0.4740740740740741
f1-score:  0.5574305933587371
false_positives:  0.24419604471195186
