## Aplicar as técnicas seguintes:

•	Random Forest (RF) 

•	ExtraTreesClassifier (ETC)

•	XGBoost (XGB) com Early Stopping

•	Gaussian Naive Bayes


## Preparando os dados para treinar

In [1]:

import warnings
warnings.filterwarnings('ignore')

import itertools

import pandas as pd
import numpy as np

from matplotlib import pyplot as plt


from sklearn import metrics
from sklearn.metrics import confusion_matrix

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

from sklearn.naive_bayes import GaussianNB

from xgboost.sklearn import XGBClassifier

In [2]:

def plot_confusion_matrix(cm, classes=['Desistente', 'Não Desistente'],
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Oranges):
    """
    Draw a confusion matrix as an annotated heat map on a new 5x5-inch figure.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix. Rows are drawn along the y axis ("True label") and
        columns along the x axis ("Predicted label").
    classes : sequence of str
        Tick labels, used for both axes in the order given.
    normalize : bool
        When True each row is divided by its own sum before plotting, so the
        cells show per-row fractions. Rows whose sum is zero are shown as 0
        instead of NaN.
    title : str
        Figure title.
    cmap : matplotlib colormap
        Colormap used for the heat map.

    Returns
    -------
    None
        The figure is created through pyplot and left as the current figure, so
        the caller can follow up with ``plt.savefig`` or ``plt.show``.

    Source: http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
    """
    cm = np.asarray(cm)

    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Divide only where the row has samples; empty rows stay at 0 rather
        # than turning into NaN through a division by zero.
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    # Plot the confusion matrix
    plt.figure(figsize=(5, 5))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, size=18)
    plt.colorbar(aspect=4)
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45, size=10)
    plt.yticks(tick_marks, classes, size=10)

    # The 'd' format only accepts integers; fall back to two decimals for a
    # normalized matrix or any other non-integer input.
    is_integer_matrix = np.issubdtype(cm.dtype, np.integer)
    fmt = 'd' if (is_integer_matrix and not normalize) else '.2f'

    # Cells darker than half of the maximum get white text for contrast.
    thresh = cm.max() / 2.

    # Labeling the plot
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt), fontsize=20,
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    # plt.grid(None) *toggles* the grid; pass False to switch it off explicitly.
    plt.grid(False)
    plt.ylabel('True label', size=15)
    plt.xlabel('Predicted label', size=15)
    # Run the layout pass last so that it accounts for the axis labels.
    plt.tight_layout()

# Confusion matrix
# cm = confusion_matrix(y_test, y_predict)
#plot_confusion_matrix(cm, classes = ['Poor Health', 'Good Health'],
#                      title = 'Health Confusion Matrix')
# plot_confusion_matrix(cm, classes = ['Desistente', 'Não Desistente'],
#                      title = 'Confusion Matrix')
# plt.savefig('cm.png')

## Importando os dataframes para treino

In [3]:
# Training dataframe for class 102, semester 2016/1 (per the file path); the
# file is semicolon-delimited.
df5 = pd.read_csv(
    '/content/drive/Shared drives/iniciacao_cientifica_2019_2020/dataframes_treino/turma_102_2016_1/Dataframe5_Turma_102_2016_06_07_12_00_00_2016_08_25_11_55_00.csv',
    sep=';',
)

In [4]:
# Table of classified users; it provides the `target` label that is merged
# into the training dataframe further down.
df_target_user = pd.read_csv(
    '/content/drive/Shared drives/iniciacao_cientifica_2019_2020/csv_datasets/usuarios_classificados.csv'
)

## Criando um merge entre o df_target_user e df5

In [5]:
# Keep only the three merge keys and the label column.
df_target_user = df_target_user.loc[
    :, ['semester_id', 'class_id', 'user_id', 'target']
]

In [6]:
# Attach the label to each feature row; the inner join keeps only rows whose
# (semester_id, class_id, user_id) key is present in both frames.
df5_data_target = pd.merge(
    left=df5,
    right=df_target_user,
    how='inner',
    on=['semester_id', 'class_id', 'user_id'],
)

In [7]:
# Remove the identifier columns that were only needed as merge keys.
# Re-binding the name (instead of inplace=True) and passing errors='ignore'
# keeps this cell safe to run more than once: a second run no longer raises
# KeyError because the columns are already gone.
df5_data_target = df5_data_target.drop(
    columns=['semester_id', 'class_id', 'user_id'],
    errors='ignore',
)

In [8]:
# Preview only the first rows instead of rendering the whole frame.
df5_data_target.head(10)

Unnamed: 0,number_logins,average_active_time,average_exam_sub_comp,average_exam_sub_inc,average_exam_sub_err,average_exam_test,average_exam_test_err,average_exam_code_line,average_exam_sub_average_interval,average_homework_sub_comp,average_homework_sub_inc,average_homework_sub_err,average_homework_test,average_homework_test_err,average_homework_code_line,average_homework_test_average,target
0,58,160.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,44.0,158.0,Desistente
1,42,232.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.5,194.5,28.5,Desistente
2,50,763.66,1.167,9.0,1.667,36.167,9.333,1270.0,189.399,0.976,8.976,1.902,4.439,2.146,659.22,139.186,Não Desistente
3,10,168.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,46.0,0.0,Desistente
4,8,530.667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.25,0.333,0.417,0.0,0.0,53.25,0.0,Desistente
5,53,891.388,1.167,7.083,4.833,10.5,3.167,791.583,33.923,2.515,3.971,2.265,6.5,2.853,829.441,2469.251,Não Desistente
6,41,580.459,3.636,17.273,3.545,9.909,2.0,1582.091,32.108,0.921,0.968,1.0,3.73,1.937,214.317,27.491,Não Desistente
7,78,878.175,0.917,4.833,1.5,6.417,2.667,964.667,25.275,1.147,3.118,1.25,6.029,2.382,668.015,1749.474,Não Desistente
8,63,663.938,1.167,3.833,1.833,11.917,4.25,714.833,93.119,1.235,3.059,0.824,4.824,1.338,336.706,403.671,Não Desistente
9,128,761.525,2.917,0.75,0.667,5.917,0.5,319.583,282.117,1.471,2.868,0.706,8.485,1.765,522.029,1325.276,Não Desistente


## Preparando para treinar os dados até a primeira avaliação

In [9]:
# Feature columns: every column except the label.
features = df5_data_target.columns.difference(['target'])

# Feature matrix as a NumPy array, columns in the order given by `features`.
X = df5_data_target.loc[:, features].values

# Label vector (the raw class names stored in the `target` column).
y = df5_data_target.loc[:, 'target'].values

## Usando cross validation

In [10]:
def string_2_int(row):
    """Encode a class label as an integer: 1 for 'Desistente', 0 for anything else."""
    return 1 if row == 'Desistente' else 0

# Encode every raw label in `y` with string_2_int, then convert the resulting
# list to a NumPy array so it can be indexed with the fold index arrays.
Y = [string_2_int(label) for label in y]

print(Y)
Y = np.array(Y)
print(type(Y))


[1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0]
<class 'numpy.ndarray'>


In [11]:
# Compare the four classifiers with shuffled 10-fold cross-validation.
#
# `X` is the feature matrix and `Y` the integer-encoded label array built in
# the previous cells. `Y` is deliberately NOT overwritten with the raw string
# labels in `y`: that would discard the encoding, and string class labels are
# reportedly rejected by newer XGBoost releases (confirm against the installed
# version).

SEED = 42  # shared seed so the folds and the randomized models are reproducible

scores = []  # unused here; kept so the name still exists for any later cell
rf_scores = []
et_scores = []
gnb_scores = []
xgb_scores = []

# Models
classifier_rf = RandomForestClassifier(bootstrap=False, criterion='entropy', max_depth=5,
                                       n_estimators=25, verbose=0, random_state=SEED)
classifier_et = ExtraTreesClassifier(bootstrap=False, criterion='entropy', max_depth=3,
                                     n_estimators=25, random_state=SEED)
classifier_gnb = GaussianNB()
classifier_xgb = XGBClassifier(learning_rate=0.1, max_depth=2, n_estimators=60,
                               random_state=SEED)

cv = KFold(n_splits=10, random_state=SEED, shuffle=True)
for train_index, test_index in cv.split(X):
    print("Train Index: ", train_index)
    print("Test Index: ", test_index, "\n")

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y[train_index], Y[test_index]

    # XGBoost early stopping needs a validation set. Take it from the training
    # fold (stratified, so both classes are present) rather than from the test
    # fold: the test fold then stays intact and every model is scored on
    # exactly the same samples.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=SEED, stratify=y_train)

    # Train the models
    classifier_rf.fit(X_train, y_train)
    classifier_et.fit(X_train, y_train)
    classifier_gnb.fit(X_train, y_train)
    # NOTE(review): newer XGBoost versions take early_stopping_rounds in the
    # constructor instead of fit() — confirm against the installed version.
    classifier_xgb.fit(X_fit, y_fit, early_stopping_rounds=5, eval_set=[(X_val, y_val)])

    # Evaluate the models on the full test fold. For classifiers, .score()
    # returns mean accuracy (not R^2).
    rf_scores.append(classifier_rf.score(X_test, y_test))
    et_scores.append(classifier_et.score(X_test, y_test))
    gnb_scores.append(classifier_gnb.score(X_test, y_test))
    xgb_scores.append(classifier_xgb.score(X_test, y_test))

print('Métrica usada: acurácia')
print('Média Random Forest:', np.mean(rf_scores))
print('Média Extra Tree Classifier:', np.mean(et_scores))
print('Média XGBoost:', np.mean(xgb_scores))
print('Média Gaussian Naive Bayes:', np.mean(gnb_scores))

Train Index:  [ 0  1  2  3  4  6  7  8  9 10 11 13 14 15 16 17 18 20 21 22 23 24 25 26
 27 28 29 30 31 32 33 34 35 36 37 38 39 40 42 44 45 46 48 49 50 51 52]
Test Index:  [ 5 12 19 41 43 47] 

[0]	validation_0-error:0
Will train until validation_0-error hasn't improved in 5 rounds.
[1]	validation_0-error:0
[2]	validation_0-error:0
[3]	validation_0-error:0
[4]	validation_0-error:0
[5]	validation_0-error:0
Stopping. Best iteration:
[0]	validation_0-error:0

Train Index:  [ 0  1  2  4  5  6  7  9 10 11 12 14 15 16 18 19 20 21 22 23 24 25 26 27
 28 29 30 31 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 51 52]
Test Index:  [ 3  8 13 17 32 50] 

[0]	validation_0-error:0
Will train until validation_0-error hasn't improved in 5 rounds.
[1]	validation_0-error:0
[2]	validation_0-error:0
[3]	validation_0-error:0
[4]	validation_0-error:0
[5]	validation_0-error:0
Stopping. Best iteration:
[0]	validation_0-error:0

Train Index:  [ 0  1  2  3  5  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23

## Salvando os modelos

In [12]:
from joblib import dump, load

In [13]:
#dump(classifier_rf, '/content/drive/Shared drives/iniciacao_cientifica_2019_2020/modelos/2016_1_102_df1/classifier_rf.joblib')
#dump(classifier_et, '/content/drive/Shared drives/iniciacao_cientifica_2019_2020/modelos/2016_1_102_df1/classifier_et.joblib')
#dump(classifier_xgb, '/content/drive/Shared drives/iniciacao_cientifica_2019_2020/modelos/2016_1_102_df1/classifier_xgb.joblib')
#dump(classifier_gnb, '/content/drive/Shared drives/iniciacao_cientifica_2019_2020/modelos/2016_1_102_df1/classifier_gnb.joblib')