In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from scipy import sparse
from sklearn_pandas import DataFrameMapper

In [None]:
data_dir = '../data/'

In [2]:
df = pd.read_csv('feat_eng_join_clean_train_aggressiveness_comentarios_facebook.csv')

# N-grams

In [3]:
mapper = DataFrameMapper([
    (['Longitud_Text', 'Numero_Palabras_Text', 'Numero_Palabras_Unicas'], None),
    ('Text',CountVectorizer(binary=False))
])
X=mapper.fit_transform(df)
y = df.Category

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=123)

In [5]:
rf = RandomForestClassifier(
    class_weight="balanced",
    bootstrap=False,
    max_depth=None,
    max_features='log2',
    min_samples_leaf=1,
    min_samples_split=5,
    n_estimators=2000,
)

In [6]:
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=False, class_weight='balanced',
                       max_features='log2', min_samples_split=5,
                       n_estimators=2000)

In [7]:
y_pred = rf.predict(X_test)

In [8]:
rf_f1_score = f1_score(y_pred, y_test, average='micro')

In [9]:
rf_f1_score

0.8303715670436186

In [10]:
confusion_matrix(y_test, y_pred)

array([[1344,   59,    6],
       [ 155,  185,    0],
       [  83,   12,   13]])

In [11]:
precision_score(y_test, y_pred, average='micro')

0.8303715670436187

In [12]:
recall_score(y_test, y_pred, average='micro')

0.8303715670436187

In [13]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 5)]
# Number of features to consider at every split
max_features = ['log2', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [200, 650, 1100, 1550, 2000], 'max_features': ['log2', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [None]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier(class_weight="balanced")
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=3, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


In [None]:
rf_random.best_params_

In [None]:
rf_random.best_estimator_

In [None]:
rf = rf_random.best_estimator_
scores = cross_val_score(rf, X_train, y_train, cv=5, scoring="f1_micro")
y_train_predict = cross_val_predict(rf, X_train, y_train, cv=3)
print("scores of Random forest is :" , scores)

In [None]:
confusion_matrix(y_train, y_train_predict)

In [None]:
precision_score(y_train, y_train_predict, average="micro")

In [None]:
recall_score(y_train, y_train_predict, average="micro")