# Ejercicio 1

In [65]:
import pandas as pd

class NaiveBayesClassifier:

    def train(self, df, alpha=1):
        grouped_classes = df.groupby('Nacionalidad')
        self.classes = df['Nacionalidad'].unique()
        self.attributes = df.columns[0: -1]
        self.class_probabilities = {}
        self.attribute_probabilities = {}
        total_len = len(df)

        for _class, grouped in grouped_classes:
            data_len = len(grouped)
            self.class_probabilities[_class] = data_len/ total_len
            self.attribute_probabilities[_class] = {}
            for attribute in df.columns[0:-1]:    # Skipping last column (Nacionalidad)
                self.attribute_probabilities[_class][attribute] = (grouped[attribute].sum() + alpha) / (data_len + alpha * len(self.attributes))
    
    def print_probabilities(self, values):
        prediction = {}
        for _class in self.classes:
            probability = self.class_probabilities[_class]
            for i,value in enumerate(values):
                if value == 1:
                    probability *= self.attribute_probabilities[_class][self.attributes[i]]
                else: 
                    probability *= 1 - self.attribute_probabilities[_class][self.attributes[i]] # 1 - P(A) = P(not A)
            prediction[_class] = probability
        for _class, value in prediction.items():
            print(_class, value / sum(prediction.values()))  # divide the probability by the sum of all probabilities to get the normalized probability


file = 'inputs/PreferenciasBritanicos.xlsx'
brits_df = pd.read_excel(file)
classifier = NaiveBayesClassifier()
classifier.train(brits_df)
# 1.B
classifier.print_probabilities([1, 0, 1, 1, 1])
print()
# 1.C
classifier.print_probabilities([0, 1, 1, 0, 1])

I 0.24873167822311673
E 0.7512683217768833

I 0.6186558869646712
E 0.3813441130353287


# Ejercicio 2

## Paso 1: preprocesamiento del dataset
Primero necesitamos hacer un preprocesamiento de los datos. Para esto decidimos reducir el dataset a 4 categorías, preferencialmente que no guarden demasiada correlación entre ellas en cuanto a las palabras utilizadas y asegurarnos que el modelo logre diferenciarlas correctamente sin tener que lidiar con las otras categorías. Una vez que el objetivo sea cumplido agregaremos devuelta las categorías filtradas para utilizar el dataset completo.

Para el preprocesamiento de los titulos utilizaremos tecnicas utilizadas en NLP y las aplicaremos a los titulos, procedemos a la tokenización de las palabras en los titulos,  

In [17]:
import re
import nltk 
#nltk.download("stopwords")
#nltk.download("snowball_data")
import pandas as pd
from collections import defaultdict
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

class NewsClassifier:

    def __init__(self):
        self.train_df = None
        self.test_df = None
        self.vocabulary = set()
        self.vocabulary_size = None
        self.word_occurencies_per_class = {}
        self.total_tokens_per_class = defaultdict(int)
        self.class_frequency_map = None
        self.labels = None
        self.confusion_matrix = None

    def preprocess_text_spanish(self,text, use_stopwords, use_stemmer):
        # Convert to lowercase and remove punctuation
        text = re.sub(r'[^\w\s]', '', text.lower())
    
        # Tokenization
        tokens = text.split()
        if use_stopwords:
            # Remove stop words
            stop_words = set(stopwords.words("spanish"))
            tokens = [token for token in tokens if token not in stop_words]
    
        if use_stemmer:
            # Stemming (use SnowballStemmer for Spanish)
            stemmer = SnowballStemmer("spanish")
            tokens = [stemmer.stem(token) for token in tokens]
        return tokens

    def split_dataset(self, test_percentage, dataset_path, categories):
        file = dataset_path
        news_df = pd.read_excel(file)
        accepted_cats = categories
        filtered_df = news_df[news_df['categoria'].isin(accepted_cats)]
        
        if test_percentage != 0:
            self.train_df, self.test_df = train_test_split(filtered_df, test_size=test_percentage/100, random_state=42)
            self.labels = self.train_df['categoria']
        else:
            self.train_df = filtered_df
            self.labels = filtered_df['categoria']

    def train(self, alpha=1):
        matrix_per_category = {}
        # Apply CountVectorizer to the titles per category
        for label in self.labels.unique():
            category_df = self.train_df[self.train_df['categoria'] == label]
            preprocessed_titles = [self.preprocess_text_spanish(title, False, False) for title in category_df["titular"]]
            self.vocabulary = set(self.vocabulary).union(set([token for tokens in preprocessed_titles for token in tokens]))
            preprocessed_titles_strings = [" ".join(tokens) for tokens in preprocessed_titles]
            vectorizer = CountVectorizer()
            X = vectorizer.fit_transform(preprocessed_titles_strings)
            count_matrix = X.toarray()
            df = pd.DataFrame(count_matrix, columns=vectorizer.get_feature_names_out())
            matrix_per_category[label] = df


        self.vocabulary_size = len(self.vocabulary)
        self.class_frequency_map = {}

        for label in self.labels.unique():
            self.class_frequency_map[label] = (self.train_df['categoria'] == label).sum() / self.train_df.shape[0]

        # Count word occurrences in each class
        for label in self.labels.unique():
            self.word_occurencies_per_class[label] = defaultdict(int)
            for word in matrix_per_category[label].columns:
                self.word_occurencies_per_class[label][word] = matrix_per_category[label][word].sum() 
            self.total_tokens_per_class[label] = matrix_per_category[label].sum().sum()     

        self.conditional_probabilities = {}
        # Calculate conditional probabilities for each word in each class
        for label in self.labels.unique():
            self.conditional_probabilities[label] = {}
            for word in self.vocabulary:
                self.conditional_probabilities[label][word] = (self.word_occurencies_per_class[label][word] + alpha) / (self.total_tokens_per_class[label] + alpha * self.total_tokens_per_class[label])
            
                
    def predict_title(self, new_title):
        # Preprocess the new title
        new_title_tokens = self.preprocess_text_spanish(new_title, False, False)

        # Calculate probabilities for each class
        predicted_probabilities = {}
        for class_label, class_frequency in self.class_frequency_map.items():
            prob_word_given_class = class_frequency  # P(cat|token) = P(cat) * P(token|cat) = P(cat) * P(token1|cat) * P(token2|cat) * ... * P(tokenN|cat) 
            for token in new_title_tokens:
                if token in self.vocabulary:
                    prob_word_given_class *= self.conditional_probabilities[class_label][token]
            predicted_probabilities[class_label] = prob_word_given_class

        # Divide by the sum of all probabilities to get the normalized probability
        total_probability = sum(predicted_probabilities.values())
        for class_label in predicted_probabilities.keys():
            predicted_probabilities[class_label] = predicted_probabilities[class_label] / total_probability
        # Return the class with the highest probability
        return max(predicted_probabilities, key=predicted_probabilities.get), predicted_probabilities
       


    def calculate_confusion_matrix_ROC(self,label):
        roc_matrix = {}
        roc_matrix[label] = {}
        roc_matrix['other'] = {}
        roc_matrix[label][label] =0
        roc_matrix[label]['other'] =0
        roc_matrix['other'][label] =0
        roc_matrix['other']['other'] =0
        for index, row in self.test_df.iterrows():
            expected_result = row['categoria']
            predicted_result,_ = self.predict_title(row['titular'])
            if expected_result == predicted_result:
                if expected_result == label:
                    print(predicted_result,":",expected_result, ":", label)
                    roc_matrix[label][label] += 1
                else:
                   roc_matrix['other']['other'] += 1 
            else:
                if expected_result == label:
                    roc_matrix[label]['other'] += 1
                else:
                    roc_matrix['other'][label] += 1
        return roc_matrix
    
    
    def calculate_confusion_matrix(self):
        self.confusion_matrix = {}
        for label in self.class_frequency_map.keys():
            self.confusion_matrix[label] = {}
            for column_label in self.class_frequency_map.keys():
                self.confusion_matrix[label][column_label] = 0
        for index, row in self.test_df.iterrows():
            expected_result = row['categoria']
            predicted_result,_ = self.predict_title(row['titular'])
            self.confusion_matrix[expected_result][predicted_result] += 1
    
    def change_confusion_matrix_to_freq(self):
        label_ocurrences = {}
        label_ocurrences[label] = (self.test_df['categoria'] == label).sum()
        for row in self.confusion_matrix.keys():
            for column in self.confusion_matrix[row].keys():
                self.confusion_matrix[row][column] /= label_ocurrences[row]
                
    def print_confusion_matrix(self, matrix):
        for row in matrix.keys():
            for column in matrix[row].keys():
                print(row, column, matrix[row][column])
            
    def calculate_accuracy(self, matrix):
        asserted = 0
        total = 0
        for row in matrix.keys():
            for column in matrix.keys():
                if row == column:
                    asserted += matrix[row][column]
                total += matrix[row][column]
        return asserted / total
        
    def calculate_true_positives(self, category, matrix):
        false_negatives = 0
        true_positives = 0
        for column in matrix.keys():
            if column != category:
                false_negatives += matrix[category][column]
            else:
                true_positives += matrix[category][column]             
        return true_positives/(true_positives + false_negatives)
        
    def calculate_precision(self, category, matrix):
        true_positives = 0
        false_positives = 0
        for row in matrix.keys():
            if row != category:
                false_positives += matrix[row][category]
            else:
                true_positives += matrix[row][category]             
        return true_positives/(true_positives + false_positives)
        
    def calculate_false_positives(self, category, matrix):
        true_negatives = 0
        false_positives = 0
        for row in matrix.keys():
            if row != category:
                false_positives += matrix[row][category]
                
        for row in matrix.keys():
            for column in matrix.keys():
                if row == column and row != category:
                    true_negatives += matrix[row][column]
        return false_positives / (false_positives + true_negatives)
        
    def calculate_f1_score(self,category, matrix):
        precision = self.calculate_precision(category, matrix)
        recall = self.calculate_true_positives(category, matrix)
        return (2*precision*recall)/(precision+recall)
                

file = 'inputs/NoticiasArgentinas.xlsx'
categories = ['Nacional', "Destacadas", "Entretenimiento", "Salud", "Deportes"]
classifier = NewsClassifier()
classifier.split_dataset(40, file, categories) # 40% test, 60% train
classifier.train()
roc_matrix = classifier.calculate_confusion_matrix_ROC('Nacional')
classifier.print_confusion_matrix(roc_matrix)
# print("accuracy: ",classifier.calculate_accuracy(roc_matrix))
# print("true_positives: ",classifier.calculate_true_positives('Nacional',roc_matrix))
# print("precision: ",classifier.calculate_precision('Nacional', roc_matrix))
# print("f1-score: ",classifier.calculate_f1_score('Nacional', roc_matrix))
# print("false_positives: ",classifier.calculate_false_positives('Nacional', roc_matrix))

Nacional : Nacional : Nacional
Nacional : Nacional : Nacional
Nacional : Nacional : Nacional
Nacional : Nacional : Nacional
Nacional : Nacional : Nacional
Nacional : Nacional : Nacional
Nacional : Nacional : Nacional
Nacional : Nacional : Nacional
Nacional : Nacional : Nacional
Nacional : Nacional : Nacional
Nacional : Nacional : Nacional
Nacional : Nacional : Nacional
Nacional : Nacional : Nacional
Nacional : Nacional : Nacional
Nacional : Nacional : Nacional
Nacional : Nacional : Nacional
Nacional : Nacional : Nacional
Nacional : Nacional : Nacional
Nacional : Nacional : Nacional
Nacional : Nacional : Nacional
Nacional : Nacional : Nacional
Nacional : Nacional : Nacional
Nacional : Nacional : Nacional
Nacional : Nacional : Nacional
Nacional : Nacional : Nacional
Nacional : Nacional : Nacional
Nacional : Nacional : Nacional
Nacional : Nacional : Nacional
Nacional : Nacional : Nacional
Nacional : Nacional : Nacional
Nacional : Nacional : Nacional
Nacional : Nacional : Nacional
Nacional