## Aplicar as técnicas seguintes:

•	Random Forest (RF) 

•	ExtraTreesClassiﬁer (ETC)

•	XGBoost (XGB) com Early Stopping

•	Gaussian Naive Bayes


## Preparando os dados para treinar

In [22]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

from sklearn import metrics

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

from sklearn.naive_bayes import GaussianNB

from xgboost.sklearn import XGBClassifier

## importando os dataframes para treino

In [23]:
df1 = pd.read_csv('/content/drive/Shared drives/iniciacao_cientifica_2019_2020/dataframes_treino/turma_102_2016_1/Dataframe1_Turma_102_2016_06_07_12_00_00_2016_06_30_11_55_00.csv', sep=';')

In [24]:
df_target_user = pd.read_csv('/content/drive/Shared drives/iniciacao_cientifica_2019_2020/csv_datasets/usuarios_classificados.csv')

## Criando um merge entre o df_target_user e df1

In [25]:
df_target_user = df_target_user[['semester_id', 'class_id', 'user_id', 'target']]

In [26]:
df1_data_target = pd.merge(df1, df_target_user, on=['semester_id', 'class_id', 'user_id'], how='inner')

In [27]:
df1_data_target.drop(['semester_id', 'class_id', 'user_id'], axis=1, inplace=True)

In [28]:
df1_data_target

Unnamed: 0,number_logins,average_active_time,average_exam_sub_comp,average_exam_sub_inc,average_exam_sub_err,average_exam_test,average_exam_test_err,average_exam_code_line,average_exam_sub_average_interval,average_homework_sub_comp,average_homework_sub_inc,average_homework_sub_err,average_homework_test,average_homework_test_err,average_homework_code_line,average_homework_test_average,target
0,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,44.0,0.0,Desistente
1,13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,194.5,0.0,Desistente
2,21,257.255,0.5,3.167,0.333,10.667,0.833,1270.0,84.333,0.463,2.122,0.293,0.0,0.0,659.22,0.0,Não Desistente
3,8,168.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,46.0,0.0,Desistente
4,8,530.667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.25,0.333,0.417,0.0,0.0,53.25,0.0,Desistente
5,13,189.463,0.333,3.75,0.25,2.083,0.583,791.583,2.847,0.559,0.632,0.485,0.456,0.25,829.441,3.1,Não Desistente
6,13,174.973,0.091,4.0,0.182,2.545,0.091,1582.091,1.515,0.302,0.238,0.095,0.857,0.556,214.317,8.211,Não Desistente
7,19,165.812,0.25,0.083,0.0,0.417,0.0,964.667,4.333,0.441,0.794,0.515,0.103,0.0,668.015,950.618,Não Desistente
8,21,188.338,0.167,0.917,0.167,1.417,0.333,714.833,9.495,0.456,1.544,0.279,0.412,0.059,336.706,2.842,Não Desistente
9,28,169.225,1.417,0.0,0.333,1.083,0.25,319.583,31.776,0.397,0.397,0.074,0.015,0.0,522.029,0.0,Não Desistente


## Preparando para treinar os dados até a primeira avaliação

In [29]:
# pegando as colunas de features
features = df1_data_target.columns.difference(['target'])

# dados de treino
X = df1_data_target[features].values

# classe target
y = df1_data_target['target'].values

## Dividindo uma parte para teste e outra para treino

In [30]:
# dividindo uma parte dos dados para treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [31]:
# verificando a forma dos dados de treino
X_train.shape, y_train.shape

((35, 16), (35,))

In [32]:
# verificando a forma dos dados de teste
X_test.shape, y_test.shape

((18, 16), (18,))

## Utilizando um GridSearchCV e RandomForestClassifier para encontrar os melhores parâmetros

In [33]:
# criando uma instância de RFC
classifier_rf = RandomForestClassifier()

# parâmetros para o gridsearchcv
param_grid_ = {
    "criterion": ['entropy', 'gini'],
    "n_estimators": [25, 50, 75],
    "bootstrap": [False, True],
    "max_depth": [3, 5, 10]
} 

# criando uma instância do gridsearchcv
grid_search = GridSearchCV(
    estimator=classifier_rf, 
    param_grid=param_grid_, 
    scoring="accuracy", 
    cv=5
)

# treinando com o gridsearch
grid_search.fit(X_train, y_train)

# atrinbuindo o modelo
classifier_rf = grid_search.best_estimator_ 

print(grid_search.best_params_, grid_search.best_score_)

{'bootstrap': False, 'criterion': 'entropy', 'max_depth': 3, 'n_estimators': 25} 0.9428571428571428


In [34]:
# classificando o conjunto de teste
y_predict = grid_search.predict(X_test)

# metricas de avaliação de desempenho
print(metrics.classification_report(y_test, y_predict))

# matrix de confusão
print(metrics.confusion_matrix(y_test, y_predict))

                precision    recall  f1-score   support

    Desistente       1.00      1.00      1.00         5
Não Desistente       1.00      1.00      1.00        13

      accuracy                           1.00        18
     macro avg       1.00      1.00      1.00        18
  weighted avg       1.00      1.00      1.00        18

[[ 5  0]
 [ 0 13]]


## Encontrando os melhores parâmetros com GridSearch o ExtraTreesClassifier

In [35]:
# criando uma instância de ETC
classifier_et = ExtraTreesClassifier()

# parametros para o gridsearchcv
param_grid_ = {
    "criterion": ['entropy', 'gini'],
    "n_estimators": [25, 50, 75],
    "bootstrap": [False, True],
    "max_depth": [3, 5, 10]
} 

# criando uma instância de gridsearchcv
grid_search = GridSearchCV(
    estimator=classifier_et, 
    param_grid = param_grid_, 
    scoring="accuracy", 
    cv=5
)

# treinando com os dados de treino
grid_search.fit(X_train, y_train)

# atribuindo a instância os melhores parâmetros
classifier_et = grid_search.best_estimator_ 

print(grid_search.best_params_, grid_search.best_score_)

{'bootstrap': False, 'criterion': 'entropy', 'max_depth': 3, 'n_estimators': 25} 0.9428571428571428


In [36]:
# classificando o conjunto de teste
y_predict = grid_search.predict(X_test)

# metricas de avaliação de desempenho
print(metrics.classification_report(y_test, y_predict))

# matrix de confusão
print(metrics.confusion_matrix(y_test, y_predict))

                precision    recall  f1-score   support

    Desistente       0.75      0.60      0.67         5
Não Desistente       0.86      0.92      0.89        13

      accuracy                           0.83        18
     macro avg       0.80      0.76      0.78        18
  weighted avg       0.83      0.83      0.83        18

[[ 3  2]
 [ 1 12]]


## XGB e GridSearchCV para encontrar os melhores parâmetros

### Parâmetros
**num_boost_rounds:** corresponde ao número de rodadas de aumento ou árvores a serem construídas.

**early_stopping_round:** verifica se nas N primeiras rodadas o algoritmo melhora, caso não melhore ele para e fica com a melhor rodada. É nessário usar o **eval_set**

**eval_set:** (list, opcional) - Uma lista de pares de tuplas (X, y) a serem usados ​​como conjuntos de validação, para os quais as métricas serão calculadas. As métricas de validação nos ajudarão a acompanhar o desempenho do modelo.

**max_depth:** é o número máximo de nós permitido da raiz até a folha mais distante de uma árvore.

**min_child_weight:** é o peso mínimo (ou número de amostras, se todas as amostras tiverem um peso de 1) necessário para criar um novo nó na árvore.

**subsample:** corresponde à fração de observações (as linhas) para subamostra em cada etapa. Por padrão, é definido como 1, o que significa que usamos todas as linhas.

**colsample_bytree:** corresponde à fração de recursos (as colunas) a serem usadas. Por padrão, é definido como 1, o que significa que usaremos todos os recursos.

**ETA:** controla a taxa de aprendizado. (com um eta mais baixo, precisamos de mais rodadas de reforço)

In [37]:
# instanciando o xgbclassifier com algumas características
classifier_xgb = XGBClassifier(
    objective= 'binary:logistic',
    nthread=4,
    seed=42
)

# setando os parâmetros usados no grid
parameters = {
    'max_depth': range (2, 10, 1),
    'n_estimators': range(60, 220, 40),
    'learning_rate': [0.1, 0.01, 0.05]
}

# setando os parâmetros do GridSearchCV
grid_search = GridSearchCV(
    estimator=classifier_xgb,
    param_grid=parameters,
    scoring = 'accuracy',
    n_jobs = 10,
    cv = 10,
    # verbose=True
)

fit_params={
    'early_stopping_rounds':10,
    'eval_set': [(X_test, y_test)],
    'verbose':True
}

# treinando com o melhor estimator
grid_search.fit(
    X_train, 
    y_train,
    **fit_params
)

# atribuindo a instância os melhores parâmetros
classifier_xgb = grid_search.best_estimator_ 

print(grid_search.best_params_, grid_search.best_score_)

[0]	validation_0-error:0.055556
Will train until validation_0-error hasn't improved in 10 rounds.
[1]	validation_0-error:0.055556
[2]	validation_0-error:0.055556
[3]	validation_0-error:0.055556
[4]	validation_0-error:0.055556
[5]	validation_0-error:0.055556
[6]	validation_0-error:0.055556
[7]	validation_0-error:0.055556
[8]	validation_0-error:0.055556
[9]	validation_0-error:0.055556
[10]	validation_0-error:0.055556
Stopping. Best iteration:
[0]	validation_0-error:0.055556

{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 60} 0.8666666666666666


In [38]:
# classificando o conjunto de teste
y_predict = grid_search.predict(X_test)

# metricas de avaliação de desempenho
print(metrics.classification_report(y_test, y_predict))

# matrix de confusão
print(metrics.confusion_matrix(y_test, y_predict))

                precision    recall  f1-score   support

    Desistente       1.00      0.80      0.89         5
Não Desistente       0.93      1.00      0.96        13

      accuracy                           0.94        18
     macro avg       0.96      0.90      0.93        18
  weighted avg       0.95      0.94      0.94        18

[[ 4  1]
 [ 0 13]]


## Aplicando *Gaussian Naive Bayes* no conjunto de treinamento

In [39]:
# instânciando um GaussianNB
classifier_gnb = GaussianNB()

# treinando o modelo
classifier_gnb.fit(X_train, y_train)


GaussianNB(priors=None, var_smoothing=1e-09)

In [40]:
# classificando o conjunto de teste
y_predict = classifier_gnb.predict(X_test)

# metricas de avaliação de desempenho
print(metrics.classification_report(y_test, y_predict))

# matrix de confusão
print(metrics.confusion_matrix(y_test, y_predict))

                precision    recall  f1-score   support

    Desistente       1.00      0.80      0.89         5
Não Desistente       0.93      1.00      0.96        13

      accuracy                           0.94        18
     macro avg       0.96      0.90      0.93        18
  weighted avg       0.95      0.94      0.94        18

[[ 4  1]
 [ 0 13]]


## Salvando os modelos

In [41]:
from joblib import dump, load

In [42]:
#dump(classifier_rf, '/content/drive/Shared drives/iniciacao_cientifica_2019_2020/modelos/2016_1_102_df1/classifier_rf.joblib')
#dump(classifier_et, '/content/drive/Shared drives/iniciacao_cientifica_2019_2020/modelos/2016_1_102_df1/classifier_et.joblib')
#dump(classifier_xgb, '/content/drive/Shared drives/iniciacao_cientifica_2019_2020/modelos/2016_1_102_df1/classifier_xgb.joblib')
#dump(classifier_gnb, '/content/drive/Shared drives/iniciacao_cientifica_2019_2020/modelos/2016_1_102_df1/classifier_gnb.joblib')