In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix, precision_score, recall_score
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

In [2]:
data_dir = '../data/'

In [3]:
df = pd.read_csv(data_dir + 'join_clean_train_aggressiveness_comentarios_facebook.csv')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9282 entries, 0 to 9281
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Text      9282 non-null   object
 1   Category  9282 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 145.2+ KB


In [5]:
#Bag of words
count_vectorizer = CountVectorizer()
X = count_vectorizer.fit_transform(df['Text'])
y = df.Category

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=123)

## Logistic Regression

In [7]:
lr = LogisticRegression(solver = 'lbfgs', max_iter=200)

In [8]:
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=200,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [9]:
#Obtenemos las predicciones para X_test
y_pred = lr.predict(X_test)

In [10]:
lr_f1_score = f1_score(y_pred, y_test, average='micro')

In [11]:
lr_f1_score

0.8330640818524501

In [12]:
confusion_matrix(y_test, y_pred)

array([[1309,   74,   26],
       [ 132,  208,    0],
       [  73,    5,   30]])

In [13]:
precision_score(y_test, y_pred, average='micro')

0.8330640818524502

In [14]:
recall_score(y_test, y_pred, average='micro')

0.8330640818524502

## Naive Bayes

In [15]:
nb = GaussianNB()

In [16]:
nb.fit(X_train.toarray(), y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [17]:
#Obtenemos las predicciones para X_test
y_pred = nb.predict(X_test.toarray())

In [18]:
nb_f1_score = f1_score(y_pred, y_test, average='micro')

In [19]:
nb_f1_score

0.5164243403338719

In [20]:
confusion_matrix(y_test, y_pred)

array([[786, 316, 307],
       [124, 142,  74],
       [ 70,   7,  31]])

In [21]:
precision_score(y_test, y_pred, average='micro')

0.5164243403338719

In [22]:
recall_score(y_test, y_pred, average='micro')

0.5164243403338719

## Random Forest

In [23]:
rf = RandomForestClassifier()

In [25]:
rf.fit(X_train.toarray(), y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [26]:
rf_f1_score = f1_score(y_pred, y_test, average='micro')

In [27]:
rf_f1_score

0.5164243403338719

In [28]:
confusion_matrix(y_test, y_pred)

array([[786, 316, 307],
       [124, 142,  74],
       [ 70,   7,  31]])

In [29]:
precision_score(y_test, y_pred, average='micro')

0.5164243403338719

In [30]:
recall_score(y_test, y_pred, average='micro')

0.5164243403338719