# Ejercicio 1

In [1]:
import pandas as pd

class NaiveBayesClassifier:

    def train(self, df, alpha=1):
        grouped_classes = df.groupby('Nacionalidad')
        self.classes = df['Nacionalidad'].unique()
        self.attributes = df.columns[0: -1]
        self.class_probabilities = {}
        self.attribute_probabilities = {}
        total_len = len(df)

        for _class, grouped in grouped_classes:
            data_len = len(grouped)
            self.class_probabilities[_class] = (data_len + alpha) / (total_len + alpha * len(self.classes))
            self.attribute_probabilities[_class] = {}
            for attribute in df.columns[0:-1]:    # Skipping last column (Nacionalidad)
                self.attribute_probabilities[_class][attribute] = (grouped[attribute].sum() + alpha) / (data_len + alpha * len(self.attributes))
    
    def print_probabilities(self, values):
        prediction = {}
        for _class in self.classes:
            probability = self.class_probabilities[_class]
            for i,value in enumerate(values):
                if value == 1:
                    probability *= self.attribute_probabilities[_class][self.attributes[i]]
                else: 
                    probability *= 1 - self.attribute_probabilities[_class][self.attributes[i]] # 1 - P(A) = P(not A)
            prediction[_class] = probability
        for _class, value in prediction.items():
            print(_class, value / sum(prediction.values()))  # divide the probability by the sum of all probabilities to get the normalized probability


file = 'inputs/PreferenciasBritanicos.xlsx'
brits_df = pd.read_excel(file)
classifier = NaiveBayesClassifier()
classifier.train(brits_df)
# 1.B
classifier.print_probabilities([1, 0, 1, 1, 0])
print()
# 1.C
classifier.print_probabilities([0, 1, 1, 0, 1])

I 0.2282357113077981
E 0.7717642886922018

I 0.623508361311908
E 0.37649163868809205


# Ejercicio 2

## Paso 1: preprocesamiento del dataset
Primero necesitamos hacer un preprocesamiento de los datos. Para esto decidimos reducir el dataset a 4 categorías, preferencialmente que no guarden demasiada correlación entre ellas en cuanto a las palabras utilizadas y asegurarnos que el modelo logre diferenciarlas correctamente sin tener que lidiar con las otras categorías. Una vez que el objetivo sea cumplido agregaremos devuelta las categorías filtradas para utilizar el dataset completo.

Para el preprocesamiento de los titulos utilizaremos tecnicas utilizadas en NLP y las aplicaremos a los titulos, procedemos a la tokenización de las palabras en los titulos,  

In [100]:
import re
import nltk
import pandas as pd
from collections import defaultdict
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer

def preprocess_text_spanish(text):
    # Convert to lowercase and remove punctuation
    text = re.sub(r'[^\w\s]', '', text.lower())
    
    # Tokenization
    tokens = text.split()
    
    # Remove stop words
    stop_words = set(stopwords.words("spanish"))
    tokens = [token for token in tokens if token not in stop_words]
    
    # Stemming (use SnowballStemmer for Spanish)
    stemmer = SnowballStemmer("spanish")
    tokens = [stemmer.stem(token) for token in tokens]
    
    return tokens

file = 'inputs/NoticiasArgentinas.xlsx'
news_df = pd.read_excel(file)
accepted_cats = ['Nacional', 'Deportes', 'Salud', 'Ciencia y Tecnologia']
filtered_df = news_df[news_df['categoria'].isin(accepted_cats)]
labels = filtered_df['categoria']

preprocessed_titles = [preprocess_text_spanish(title) for title in filtered_df['titular']] # a cada titular le aplico la función
preprocessed_titles_strings = [" ".join(tokens) for tokens in preprocessed_titles]
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(preprocessed_titles_strings)

# Build the vocabulary
vocabulary = vectorizer.get_feature_names_out()
vocabulary_size = len(vocabulary)

class_frequency_map = labels.value_counts(normalize=True).to_dict() #calcula la frecuencia relativa para cada clase
word_freq_per_class = defaultdict(lambda: defaultdict(lambda: 0))
total_tokens_per_class = defaultdict(int)

# Count word occurrences in each class
for i, doc in enumerate(X):
    tokens = preprocessed_titles[i]
    total_tokens_per_class[labels.iloc[i]] += len(tokens)
    for word_idx in doc.indices:
        word = vocabulary[word_idx]
        for token in tokens:
            word_freq_per_class[token][labels.iloc[i]] += 1

# Example new title to classify
new_title = "Paulo Gazzaniga, el arquero sorpresa de Lionel Scaloni"

# Preprocess the new title
new_title_tokens = preprocess_text_spanish(new_title)

# Calculate conditional probabilities
predicted_probabilities = {}
for class_label in class_frequency_map.keys():
    class_frequency = class_frequency_map[class_label]
    prob_word_given_class = 1.0
    for token in new_title_tokens:
        word_freq = word_freq_per_class[token][class_label]
        if word_freq==0:
        # Applying Laplace smoothing
            prob_word_given_class *= (word_freq + 1) / (total_tokens_per_class[class_label] + vocabulary_size)
        else:
             prob_word_given_class*= word_freq / total_tokens_per_class[class_label]
    predicted_probabilities[class_label] = class_frequency *  prob_word_given_class

# Predict the category with the highest probability
print(predicted_probabilities)
predicted_category = max(predicted_probabilities, key=predicted_probabilities.get)
print("Predicted Category:", predicted_category)



{'Nacional': 9.797700858272893e-27, 'Ciencia y Tecnologia': 3.5620939946273e-27, 'Deportes': 3.8314053918028403e-16, 'Salud': 3.3577926800814774e-28}
Predicted Category: Deportes


# Ejercicio 3

In [2]:
import pandas as pd

## Preprocessing
file = 'inputs/binary.csv'
students_df = pd.read_csv(file)
student_amount = len(students_df)

# Creating auxiliar probabilites matrix
aux_probabilities = []
for admit in [0, 1]:
    for gre in [0, 1]:
        for gpa in [0, 1]:
            for rank in [1, 2, 3, 4]:
                aux_probabilities.append([admit, gre, gpa, rank, 0])
                
probs_df = pd.DataFrame(data=aux_probabilities, columns=['admit', 'gre', 'gpa', 'rank', 'probability'])

for student_id in students_df.index:
    admit = students_df['admit'][student_id]
    rank = students_df['rank'][student_id]
    discrete_gre = 1 if students_df['gre'][student_id] >= 500 else 0
    discrete_gpa = 1 if students_df['gpa'][student_id] >= 3 else 0
    probs_df.loc[(probs_df['admit'] == admit) & (probs_df['rank'] == rank) & (probs_df['gre'] == discrete_gre) & (probs_df['gpa'] == discrete_gpa), 'probability'] += 1 / student_amount

# 3.a -> P(!admitido | rango=1) = P(!admitido ^ rango=1) / P(rango=1)
# P(!admitido ^ rango=1) = Sum_GPE(0,1){Sum_GPA(0,1){P(!admitido ^ GPE ^ GPA ^ rango=1}} = P(!admitido ^ !gpe ^ !gpa ^ rango = 1) * ... * P(!admitido ^ gpe ^ gpa ^ rango = 1)
# Ej: P(!admitido ^ !gpe ^ !gpa ^ rango=1) = P(!admitido | !gpe ^ !gpa ^ rango=1) * P(!gpe | rango=1) * P(!gpa | rango=1) * P(rango=1)
# p1: P(!admitido | !gpe ^ !gpa ^ rango=1)
# p2: P(!gpe | rango=1)
# p3: P(!gpa | rango=1)
probability_3a = 0
p_rango1 = sum(probs_df.loc[probs_df['rank'] == 1, 'probability'])

for gre in [0, 1]:
    for gpa in [0, 1]:
        p1_parcial = probs_df.loc[(probs_df['admit'] == 0) & (probs_df['gre'] == gre) & (probs_df['gpa'] == gpa) & (probs_df['rank'] == 1), 'probability']
        p1_tot = sum(probs_df.loc[(probs_df['gre'] == gre) & (probs_df['gpa'] == gpa) & (probs_df['rank'] == 1), 'probability'])
        p1 = p1_parcial / p1_tot

        p2_parcial = sum(probs_df.loc[(probs_df['gre'] == gre) & (probs_df['rank'] == 1), 'probability'])
        p2 = p2_parcial / p_rango1

        p3_parcial = sum(probs_df.loc[(probs_df['gpa'] == gpa) & (probs_df['rank'] == 1), 'probability'])
        p3 = p3_parcial / p_rango1

        probability_3a += float((p1 * p2 * p3).iloc[0])

print('3a: ', probability_3a)

# 3.b -> P(admitido | r=2 ^ !gre ^ gpa) = P(admitido ^ r=2 ^ !gre ^ gpa) / P(r=2 ^ !gre ^ gpa)
p1 = probs_df.loc[(probs_df['admit'] == 1) & (probs_df['gre'] == 0) & (probs_df['gpa'] == 1) & (probs_df['rank'] == 2), 'probability']
p2 = sum(probs_df.loc[(probs_df['gre'] == 0) & (probs_df['gpa'] == 1) & (probs_df['rank'] == 2), 'probability'])
prob_3b = float((p1 / p2).iloc[0])
print('3b: ', prob_3b)

3a:  0.41546427121512747
3b:  0.19047619047619047


El proceso de aprendizaje es **paramétrico**, pues ya se conoce la estructura de condiciones. Por ende, no se necesita del algoritmo k2 para identificarla.