In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix, precision_score, recall_score
from sklearn.naive_bayes import GaussianNB
from scipy import sparse
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold

# Cross-validation splitter shared by the model-selection cells:
# 5 stratified folds, repeated 3 times, with a fixed seed for reproducibility.
cv_method = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=999)

In [None]:
# Directory containing the input CSV, relative to the notebook's working directory.
# The trailing slash is required: the file path below is built by plain string concatenation.
data_dir = '../data/'

In [None]:
# Load the comments dataset. Later cells read its `Text`, `Category`, `Longitud_Text`,
# `Numero_Palabras_Text` and `Numero_Palabras_Unicas` columns.
df = pd.read_csv(data_dir + 'feat_eng_join_clean_train_aggressiveness_comentarios_facebook.csv')

In [None]:
# Print column dtypes and non-null counts as a quick sanity check of the loaded frame.
df.info()

## Entrenamiento de Naive Bayes después de Feature Engineering

In [None]:
# Bag of words: one count column per vocabulary term, one row per comment.
count_vectorizer = CountVectorizer()
X_vec = count_vectorizer.fit_transform(df['Text'])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
# list(...) keeps `vocab` a plain list, which is what the old method returned.
if hasattr(count_vectorizer, 'get_feature_names_out'):
    vocab = list(count_vectorizer.get_feature_names_out())
else:
    vocab = count_vectorizer.get_feature_names()

# Dense term-count frame (features) and the target labels.
X = pd.DataFrame(X_vec.toarray(), columns=vocab)
y = df.Category

In [None]:
# Append the hand-engineered numeric features from `df` to the term-count frame,
# in the same column order as before.
for feature in ('Longitud_Text', 'Numero_Palabras_Text', 'Numero_Palabras_Unicas'):
    X[feature] = df[feature]

In [None]:
# Re-encode the dense feature frame as a SciPy CSR matrix before splitting.
X_sparse = sparse.csr_matrix(X.values)

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_sparse, y, test_size=0.2, random_state=123
)

In [None]:
# Gaussian Naive Bayes classifier with default hyperparameters.
nb = GaussianNB()

In [None]:
# Fit on the training split; the sparse matrix is densified with .toarray() before fitting.
nb.fit(X_train.toarray(), y_train)

In [None]:
# Predict labels for the held-out split (densified the same way as the training data).
y_pred = nb.predict(X_test.toarray())

In [None]:
# scikit-learn metrics take (y_true, y_pred) in that order. Micro-averaged F1 is
# symmetric in its two arguments, so the value is unchanged, but the call now
# follows the documented signature and matches the other metric calls in this notebook.
nb_f1_score = f1_score(y_test, y_pred, average='micro')

In [None]:
# Display the micro-averaged F1 score on the held-out split.
nb_f1_score

In [None]:
# Confusion matrix on the held-out split (scikit-learn convention: rows are true
# labels, columns are predicted labels).
confusion_matrix(y_test, y_pred)

In [None]:
# Micro-averaged precision on the held-out split.
precision_score(y_test, y_pred, average='micro')

In [None]:
# Micro-averaged recall on the held-out split.
recall_score(y_test, y_pred, average='micro')

## Grid Search

In [None]:
# Candidate values for the `var_smoothing` hyperparameter: ten log-spaced points
# from 1e0 down to 1e-9.
params_nb = dict(var_smoothing=np.logspace(start=0, stop=-9, num=10))

In [None]:
# Exhaustive search over `params_nb` for the Naive Bayes estimator, scored by
# micro-averaged F1 using the repeated stratified splitter `cv_method`.
gs_nb = GridSearchCV(
    estimator=nb,
    param_grid=params_nb,
    scoring="f1_micro",
    cv=cv_method,
    verbose=1,
)

In [None]:
# Run the cross-validated grid search on the densified training split.
gs_nb.fit(X_train.toarray(), y_train)

In [None]:
# Hyperparameter setting that achieved the best mean cross-validated score.
gs_nb.best_params_

In [None]:
# Mean cross-validated f1_micro score of the best hyperparameter setting.
gs_nb.best_score_