# Ejercicio 1

In [96]:
import pandas as pd

class NaiveBayesClassifier:

    def train(self, df):
        grouped_classes = df.groupby('Nacionalidad')
        self.classes = df['Nacionalidad'].unique()
        class_amount = len(self.classes)
        self.attributes = df.columns[0: -1]
        self.class_probabilities = {}
        self.attribute_probabilities = {}
        self.total_attribute_probabilities = {}
        for _class, grouped in grouped_classes:
            data_len = len(grouped)
            total_len = len(df.index)
            self.class_probabilities[_class] =data_len / total_len
            self.attribute_probabilities[_class] = {}
            for attribute in df.columns[0:-1]:    # Skipping last column (Nacionalidad)
                self.attribute_probabilities[_class][attribute] = grouped[attribute].sum() /data_len
        for attribute in df.columns[0: -1]:
            self.total_attribute_probabilities[attribute] = df[attribute].sum()/ total_len
    
    def print_probabilities(self, values):
        prediction = {}
        for _class in self.classes:
            probability = self.class_probabilities[_class]
            total_probability = 1.0
            for i,value in enumerate(values):
                if value != 0:
                    probability *= self.attribute_probabilities[_class][self.attributes[i]]
                    total_probability *= self.total_attribute_probabilities[self.attributes[i]] 
            prediction[_class] = probability/total_probability
        for _class,value in prediction.items():
            print(_class,value)


file = 'inputs/PreferenciasBritanicos.xlsx'
brits_df = pd.read_excel(file)
classifier = NaiveBayesClassifier()
classifier.train(brits_df)
# 1.B
classifier.print_probabilities([1, 0, 1, 1, 1])
print()
# 1.C
classifier.print_probabilities([0, 1, 1, 0, 1])

I 0.22885416666666664
E 0.8406887755102038

I 0.4023809523809524
E 0.5912536443148686


# Ejercicio 2

## Paso 1: preprocesamiento del dataset
Primero necesitamos hacer un preprocesamiento de los datos. Para esto decidimos reducir el dataset a 4 categorías, preferencialmente que no guarden demasiada correlación entre ellas en cuanto a las palabras utilizadas y asegurarnos que el modelo logre diferenciarlas correctamente sin tener que lidiar con las otras categorías. Una vez que el objetivo sea cumplido agregaremos devuelta las categorías filtradas para utilizar el dataset completo.

Para el preprocesamiento de los titulos utilizaremos tecnicas utilizadas en NLP y las aplicaremos a los titulos, procedemos a la tokenización de las palabras en los titulos,  

In [100]:
import re
import nltk
import pandas as pd
from collections import defaultdict
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer

def preprocess_text_spanish(text):
    # Convert to lowercase and remove punctuation
    text = re.sub(r'[^\w\s]', '', text.lower())
    
    # Tokenization
    tokens = text.split()
    
    # Remove stop words
    stop_words = set(stopwords.words("spanish"))
    tokens = [token for token in tokens if token not in stop_words]
    
    # Stemming (use SnowballStemmer for Spanish)
    stemmer = SnowballStemmer("spanish")
    tokens = [stemmer.stem(token) for token in tokens]
    
    return tokens

file = 'inputs/NoticiasArgentinas.xlsx'
news_df = pd.read_excel(file)
accepted_cats = ['Nacional', 'Deportes', 'Salud', 'Ciencia y Tecnologia']
filtered_df = news_df[news_df['categoria'].isin(accepted_cats)]
labels = filtered_df['categoria']

preprocessed_titles = [preprocess_text_spanish(title) for title in filtered_df['titular']] # a cada titular le aplico la función
preprocessed_titles_strings = [" ".join(tokens) for tokens in preprocessed_titles]
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(preprocessed_titles_strings)

# Build the vocabulary
vocabulary = vectorizer.get_feature_names_out()
vocabulary_size = len(vocabulary)

class_frequency_map = labels.value_counts(normalize=True).to_dict() #calcula la frecuencia relativa para cada clase
word_freq_per_class = defaultdict(lambda: defaultdict(lambda: 0))
total_tokens_per_class = defaultdict(int)

# Count word occurrences in each class
for i, doc in enumerate(X):
    tokens = preprocessed_titles[i]
    total_tokens_per_class[labels.iloc[i]] += len(tokens)
    for word_idx in doc.indices:
        word = vocabulary[word_idx]
        for token in tokens:
            word_freq_per_class[token][labels.iloc[i]] += 1

# Example new title to classify
new_title = "Paulo Gazzaniga, el arquero sorpresa de Lionel Scaloni"

# Preprocess the new title
new_title_tokens = preprocess_text_spanish(new_title)

# Calculate conditional probabilities
predicted_probabilities = {}
for class_label in class_frequency_map.keys():
    class_frequency = class_frequency_map[class_label]
    prob_word_given_class = 1.0
    for token in new_title_tokens:
        word_freq = word_freq_per_class[token][class_label]
        if word_freq==0:
        # Applying Laplace smoothing
            prob_word_given_class *= (word_freq + 1) / (total_tokens_per_class[class_label] + vocabulary_size)
        else:
             prob_word_given_class*= word_freq / total_tokens_per_class[class_label]
    predicted_probabilities[class_label] = class_frequency *  prob_word_given_class

# Predict the category with the highest probability
print(predicted_probabilities)
predicted_category = max(predicted_probabilities, key=predicted_probabilities.get)
print("Predicted Category:", predicted_category)



{'Nacional': 9.797700858272893e-27, 'Ciencia y Tecnologia': 3.5620939946273e-27, 'Deportes': 3.8314053918028403e-16, 'Salud': 3.3577926800814774e-28}
Predicted Category: Deportes
