# Naive Bayes - Ejemplo de Clasificación

In [None]:
#!wget http://downloads.dbpedia.org/2016-10/core-i18n/es/long_abstracts_es.tql.bz2

In [None]:
#!bunzip2 -c long_abstracts_es.tql.bz2 | head

In [None]:
import pandas as pd

In [None]:
# Load the per-person table from the gzipped CSV; its 'uri' column becomes the index.
person_data = pd.read_csv('person_data_es.csv.gz', index_col='uri')
# Preview the first rows.
person_data.head()

In [None]:
# Put the index values (the URIs) in a set so that membership tests are cheap.
person_uris = set(person_data.index)
# Number of distinct URIs.
len(person_uris)

In [None]:
import bz2
import re

# Pattern for one line of the abstracts dump. The adjacent literals concatenate
# into a single pattern with three capture groups.
parts = re.compile(
    r'<(.+)> '       # group 1: first bracketed URI (the resource)
    r'<(.+)> '       # group 2: second bracketed URI
    r'"(.*)"@es.*'   # group 3: quoted literal tagged @es; the rest of the line is ignored
)

In [None]:
from itertools import islice

# Scan the first 100000 lines of the compressed abstracts dump and keep the
# lines whose first URI belongs to a known person.
rows = []

# Text mode decodes UTF-8 once at the I/O boundary; newline='\n' keeps the
# line splitting identical to iterating the raw byte stream.
with bz2.open('long_abstracts_es.tql.bz2', 'rt', encoding='utf-8', newline='\n') as f:
    for line in islice(f, 100000):
        match = parts.match(line)
        if match is None:
            continue  # line does not have the expected shape
        uri, category, text = match.groups()
        if uri in person_uris:
            rows.append({'person_uri': uri, 'category': category, 'text': text})

# Naming the columns explicitly keeps set_index() working even when no line
# matched (an empty frame would otherwise have no 'person_uri' column).
# The left join attaches the person_data columns to every kept abstract.
records = (
    pd.DataFrame.from_records(rows, columns=['person_uri', 'category', 'text'])
    .set_index('person_uri')
    .join(person_data, how='left')
)
print(len(records))
records.head()

In [None]:
# Class distribution: how many records carry each gender value.
records.gender.value_counts()

In [None]:
# Show the text stored for one example URI (raises KeyError if that URI was not loaded).
records.loc['http://es.dbpedia.org/resource/Albert_Einstein'].text

In [None]:
# Peek at the raw gender values of the full person table as a NumPy array.
# NOTE(review): the one-element tuple selector presumably returns a
# single-column frame, so .values would be 2-D with shape (n, 1) — confirm.
person_data.loc[:,('gender',)].values

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the gender strings as integer class ids.
# LabelEncoder expects a 1-D array of labels, so the column is passed as a
# Series rather than as a one-column frame, which scikit-learn would have to
# flatten (emitting a DataConversionWarning).
vec = LabelEncoder()
labels = vec.fit_transform(records['gender'])
labels

In [None]:
# Original label values learned by the encoder; these are reused as axis tick labels in the plots.
vec.classes_

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn every abstract into a TF-IDF weighted feature vector (default settings).
# NOTE(review): the vectorizer is fitted on all abstracts before the
# train/test split, so its statistics also see the held-out half — consider
# fitting on the training split only.
tfidf = TfidfVectorizer()
text_matrix = tfidf.fit_transform(records['text'].values)

In [None]:
# Display a summary of the resulting feature matrix.
text_matrix

In [None]:
from sklearn.model_selection import train_test_split

# Stratified 50/50 split with a fixed seed, so both halves keep the class
# proportions of `labels` and the split is reproducible.
X1, X2, y1, y2 = train_test_split(
    text_matrix,
    labels,
    train_size=0.5,
    stratify=labels,
    random_state=0,
)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with default hyper-parameters, trained on the first half.
model = MultinomialNB()
model.fit(X1, y1)

In [None]:
# Predict the class ids of the second (held-out) half.
y_pred = model.predict(X2)

In [None]:
from sklearn.metrics import accuracy_score
# Fraction of held-out samples whose predicted class matches the true one.
accuracy_score(y2, y_pred)

In [None]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
from sklearn.metrics import confusion_matrix
# Confusion matrix of the held-out labels against the predictions. It is
# transposed for plotting so that the x axis is the real class and the y axis
# the predicted one, matching the axis labels below.
mat = confusion_matrix(y2, y_pred)
# `linewidths` is seaborn's documented parameter for the cell separators;
# `linewidth` only worked through keyword pass-through to matplotlib.
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False, linewidths=1,
            xticklabels=vec.classes_, yticklabels=vec.classes_)
plt.xlabel('categoría real')
plt.ylabel('predicción');

Ahora hagamos lo mismo, pero entrenemos el modelo con la misma cantidad de ejemplos en cada clase (un balanceo ingenuo, submuestreando la clase mayoritaria):

In [None]:
# Class sizes before balancing.
records.gender.value_counts()

In [None]:
# Naive balancing: keep every 'female' record and draw an equally sized random
# sample of 'male' records.
# The sample size is derived from the data instead of the hard-coded 2193
# (presumably the 'female' count of one particular run), and the seed is fixed
# so that the balanced subset is reproducible.
women = records[records.gender == 'female']
all_men = records[records.gender == 'male']
# min() guards against there being fewer 'male' than 'female' records, in
# which case sample() would raise.
men = all_men.sample(n=min(len(women), len(all_men)), random_state=0)

balanced_records = pd.concat([men, women])
balanced_records.head()

In [None]:
# Refit the same vectorizer on the balanced subset only; this rebinds text_matrix.
text_matrix = tfidf.fit_transform(balanced_records['text'].values)

In [None]:
# Re-encode the gender labels of the balanced subset.
# The column is passed as a 1-D Series, which is what LabelEncoder expects; a
# one-column frame would be flattened with a DataConversionWarning.
labels = vec.fit_transform(balanced_records['gender'])

In [None]:
# Same stratified, seeded 50/50 split as before, now over the balanced data;
# this rebinds X1, X2, y1 and y2.
X1, X2, y1, y2 = train_test_split(
    text_matrix,
    labels,
    train_size=0.5,
    stratify=labels,
    random_state=0,
)

In [None]:
# Fit the same model object again, now on the balanced training half,
# then predict and score the balanced held-out half.
model.fit(X1, y1)
y_pred = model.predict(X2)
accuracy_score(y2, y_pred)

In [None]:
# Confusion matrix for the balanced experiment. It is transposed for plotting
# so that the x axis is the real class and the y axis the predicted one,
# matching the axis labels below.
mat = confusion_matrix(y2, y_pred)
# `linewidths` is seaborn's documented parameter for the cell separators;
# `linewidth` only worked through keyword pass-through to matplotlib.
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False, linewidths=1,
            xticklabels=vec.classes_, yticklabels=vec.classes_)
plt.xlabel('categoría real')
plt.ylabel('predicción');