## Aplicar as técnicas seguintes:

•	Random Forest (RF) 

•	ExtraTreesClassiﬁer (ETC)

•	XGBoost (XGB) com Early Stopping

•	Gaussian Naive Bayes


## Preparando os dados para treinar

In [1]:

import warnings
warnings.filterwarnings('ignore')


import itertools

import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
from pprint import pprint


from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
#from sklearn.metrics import plot_confusion_matrix
#from sklearn.metrics import precision_score


from sklearn.model_selection import KFold
#from sklearn.model_selection import cross_val_score
#from sklearn.model_selection import cross_validate
#from sklearn.model_selection import GridSearchCV
#from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

from xgboost.sklearn import XGBClassifier

from sklearn.naive_bayes import GaussianNB

In [2]:

def plot_confusion_matrix(cm, classes=['Desistente', 'Não Desistente'],
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Oranges):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    Source: http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        #print("Normalized confusion matrix")
    else:
        pass
        # print('Confusion matrix, without normalization')

    #print(cm)

    # Plot the confusion matrix
    plt.figure(figsize = (5, 5))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, size = 18)
    plt.colorbar(aspect=4)
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45, size = 10)
    plt.yticks(tick_marks, classes, size = 10)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    
    # Labeling the plot
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt), fontsize = 20,
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
        
    plt.grid(None)
    plt.tight_layout()
    plt.ylabel('True label', size = 15)
    plt.xlabel('Predicted label', size = 15)

# Confusion matrix
# cm = confusion_matrix(y_test, y_predict)
#plot_confusion_matrix(cm, classes = ['Poor Health', 'Good Health'],
#                      title = 'Health Confusion Matrix')
# plot_confusion_matrix(cm, classes = ['Desistente', 'Não Desistente'],
#                      title = 'Confusion Matrix')
# plt.savefig('cm.png')

## importando os dataframes para treino

In [3]:
df3 = pd.read_csv('/content/drive/Shared drives/iniciacao_cientifica_2019_2020/dataframes_treino/turma_102_2016_1/Dataframe3_Turma_102_2016_06_07_12_00_00_2016_07_28_12_00_00.csv', sep=';')

In [4]:
df_target_user = pd.read_csv('/content/drive/Shared drives/iniciacao_cientifica_2019_2020/csv_datasets/usuarios_classificados.csv')

## Criando um merge entre o df_target_user e df3

In [5]:
df_target_user = df_target_user[['semester_id', 'class_id', 'user_id', 'target']]

In [6]:
df3_data_target = pd.merge(df3, df_target_user, on=['semester_id', 'class_id', 'user_id'], how='inner')

In [7]:
df3_data_target.drop(['semester_id', 'class_id', 'user_id'], axis=1, inplace=True)

## Criando uma nova coluna com status

In [8]:
columns = [
    'number_logins',
    'average_active_time',
    'average_exam_sub_comp',
    'average_exam_sub_inc',
    'average_exam_sub_err',
    'average_exam_test',
    'average_exam_test_err',
    'average_exam_code_line',
    'average_exam_sub_average_interval',
    'average_homework_sub_comp',
    'average_homework_sub_inc',
    'average_homework_sub_err',
    'average_homework_test',
    'average_homework_test_err',
    'average_homework_code_line',
    'average_homework_test_average'
]

def status_user(row):
    count = 0
    # x = 0.0
    # for v in row:
    #    if v > x:
    #        count += 1
    #if count <= 0:
    #    pass #print(row)
    if np.sum(row) <= 0:
        return 'Inativo'
    else:
        return 'Ativo'
    
df3_data_target_status = df3_data_target.copy()
df3_data_target_status['status'] = df3_data_target_status[columns].apply(lambda row: status_user(row), axis=1)

## Preparando para treinar os dados até a primeira avaliação

In [9]:
# pegando as colunas de features
features = df3_data_target.columns.difference(['target'])

# dados de treino
X = df3_data_target[features].values

# classe target
Y = df3_data_target['target'].values

In [10]:
dict_class = {}
dict_class['target'] = Y

# definindo um df para armazenar as predições
df_all_predictions = pd.DataFrame(dict_class)

df_all_predictions['status'] = df3_data_target_status['status']

## Usando cross validation

In [11]:
acur = []
dict_predict = {}

# modelos
classifier_rf = RandomForestClassifier(bootstrap=False, criterion='entropy', max_depth=5, n_estimators=25, verbose=0)

# cv = KFold(n_splits=10,  random_state=2**32 - 1, shuffle=True)
cv = KFold(n_splits=10, shuffle=True)
for train_index, test_index in cv.split(X):
    # print("Train Index: ", train_index)
    # print("Test Index: ", test_index, "\n")

    X_train, X_test, y_train, y_test = X[train_index], X[test_index], Y[train_index], Y[test_index]


    # treinando os modelos
    classifier_rf.fit(X_train, y_train)

    # testando os modelos
    y_predicted = classifier_rf.predict(X_test)

    #print('y_true=',y_test)
    #print('y_pred=',y_predicted)
    #print('y_index=',test_index)
    d = dict(zip(test_index, y_predicted))
    dict_predict.update(d)

    metricas = classification_report(y_test, y_predicted)
    acur.append(accuracy_score(y_test, y_predicted))
    #recal = metrics.recall_score(y_test,y_predicted)
    # print(acur)
    #print(metricas)
    # a metrica usada é o r2
    # scores.append(best_svr.score(X_test, y_test))
    # rf_scores.append(classifier_rf.score(X_test, y_test))
    # plot_confusion_matrix(classifier_rf, X_test, y_test)

print('Acurácia Média Random Forest:', np.mean(acur))


l = dict(sorted(dict_predict.items()))
a = [w for v,w in l.items()]
a = np.array(a)
df_all_predictions['prediction_rf'] = a


Acurácia Média Random Forest: 0.9400000000000001


In [12]:
acur = []
dict_predict = {}

# modelos
classifier_etc = ExtraTreesClassifier(bootstrap=False, criterion='entropy', max_depth=3, n_estimators=25)

# cv = KFold(n_splits=10,  random_state=2**32 - 1, shuffle=True)
cv = KFold(n_splits=10, shuffle=True)
for train_index, test_index in cv.split(X):
    # print("Train Index: ", train_index)
    # print("Test Index: ", test_index, "\n")

    X_train, X_test, y_train, y_test = X[train_index], X[test_index], Y[train_index], Y[test_index]


    # treinando os modelos
    classifier_etc.fit(X_train, y_train)

    # testando os modelos
    y_predicted = classifier_etc.predict(X_test)

    #print('y_true=',y_test)
    #print('y_pred=',y_predicted)
    #print('y_index=',test_index)
    d = dict(zip(test_index, y_predicted))
    dict_predict.update(d)

    metricas = classification_report(y_test, y_predicted)
    acur.append(accuracy_score(y_test, y_predicted))
    #recal = metrics.recall_score(y_test,y_predicted)
    # print(acur)
    #print(metricas)
    # a metrica usada é o r2
    # scores.append(best_svr.score(X_test, y_test))
    # rf_scores.append(classifier_rf.score(X_test, y_test))
    # plot_confusion_matrix(classifier_rf, X_test, y_test)

print('Acurácia Média Extra Tree Classifier:', np.mean(acur))


l = dict(sorted(dict_predict.items()))
a = [w for v,w in l.items()]
a = np.array(a)
df_all_predictions['prediction_etc'] = a


Acurácia Média Extra Tree Classifier: 0.9400000000000001


In [13]:
acur = []
dict_predict = {}

# modelos
classifier_xgb = XGBClassifier(learning_rate=0.1, max_depth=2, n_estimators=60)

# cv = KFold(n_splits=10,  random_state=2**32 - 1, shuffle=True)
cv = KFold(n_splits=10, shuffle=True)
for train_index, test_index in cv.split(X):
    # print("Train Index: ", train_index)
    # print("Test Index: ", test_index, "\n")

    X_train, X_test, y_train, y_test = X[train_index], X[test_index], Y[train_index], Y[test_index]

    # criando um particao de validação para o modelo xgb
    X_val, y_val = X_test[0:2], y_test[0:2]
    _X_test = X_test[2:]
    _y_test = y_test[2:]


    # treinando os modelos
    classifier_xgb.fit(X_train, y_train, early_stopping_rounds=10, eval_set=[(X_val, y_val)])

    # testando os modelos
    y_predicted = classifier_xgb.predict(X_test)

    #print('y_true=',y_test)
    #print('y_pred=',y_predicted)
    #print('y_index=',test_index)
    d = dict(zip(test_index, y_predicted))
    dict_predict.update(d)

    metricas = classification_report(y_test, y_predicted)
    acur.append(accuracy_score(y_test, y_predicted))
    #recal = metrics.recall_score(y_test,y_predicted)
    # print(acur)
    #print(metricas)
    # a metrica usada é o r2
    # scores.append(best_svr.score(X_test, y_test))
    # rf_scores.append(classifier_rf.score(X_test, y_test))
    # plot_confusion_matrix(classifier_rf, X_test, y_test)

print('Acurácia Média XGBoost', np.mean(acur))

l = dict(sorted(dict_predict.items()))
a = [w for v,w in l.items()]
a = np.array(a)
df_all_predictions['prediction_xgb'] = a


[0]	validation_0-error:0.5
Will train until validation_0-error hasn't improved in 10 rounds.
[1]	validation_0-error:0.5
[2]	validation_0-error:0.5
[3]	validation_0-error:0.5
[4]	validation_0-error:0.5
[5]	validation_0-error:0.5
[6]	validation_0-error:0.5
[7]	validation_0-error:0.5
[8]	validation_0-error:0.5
[9]	validation_0-error:0.5
[10]	validation_0-error:0.5
Stopping. Best iteration:
[0]	validation_0-error:0.5

[0]	validation_0-error:0
Will train until validation_0-error hasn't improved in 10 rounds.
[1]	validation_0-error:0
[2]	validation_0-error:0
[3]	validation_0-error:0
[4]	validation_0-error:0
[5]	validation_0-error:0
[6]	validation_0-error:0
[7]	validation_0-error:0
[8]	validation_0-error:0
[9]	validation_0-error:0
[10]	validation_0-error:0
Stopping. Best iteration:
[0]	validation_0-error:0

[0]	validation_0-error:0
Will train until validation_0-error hasn't improved in 10 rounds.
[1]	validation_0-error:0
[2]	validation_0-error:0
[3]	validation_0-error:0
[4]	validation_0-error

In [14]:
acur = []
dict_predict = {}

# modelos
classifier_gnb = GaussianNB()

# cv = KFold(n_splits=10,  random_state=2**32 - 1, shuffle=True)
cv = KFold(n_splits=10, shuffle=True)
for train_index, test_index in cv.split(X):
    # print("Train Index: ", train_index)
    # print("Test Index: ", test_index, "\n")

    X_train, X_test, y_train, y_test = X[train_index], X[test_index], Y[train_index], Y[test_index]

    # treinando os modelos
    classifier_gnb.fit(X_train, y_train)

    # testando os modelos
    y_predicted = classifier_gnb.predict(X_test)

    #print('y_true=',y_test)
    #print('y_pred=',y_predicted)
    #print('y_index=',test_index)
    d = dict(zip(test_index, y_predicted))
    dict_predict.update(d)

    metricas = classification_report(y_test, y_predicted)
    acur.append(accuracy_score(y_test, y_predicted))
    #recal = metrics.recall_score(y_test,y_predicted)
    # print(acur)
    #print(metricas)
    # a metrica usada é o r2
    # scores.append(best_svr.score(X_test, y_test))
    # rf_scores.append(classifier_rf.score(X_test, y_test))
    # plot_confusion_matrix(classifier_rf, X_test, y_test)

print('Acurácia Média Gaussian Naive Bayes', np.mean(acur))

l = dict(sorted(dict_predict.items()))
a = [w for v,w in l.items()]
a = np.array(a)
df_all_predictions['prediction_gnb'] = a


Acurácia Média Gaussian Naive Bayes 0.9100000000000001


In [15]:
df_all_predictions

Unnamed: 0,target,status,prediction_rf,prediction_etc,prediction_xgb,prediction_gnb
0,Desistente,Ativo,Desistente,Desistente,Desistente,Desistente
1,Desistente,Ativo,Desistente,Desistente,Desistente,Desistente
2,Não Desistente,Ativo,Não Desistente,Não Desistente,Não Desistente,Não Desistente
3,Desistente,Ativo,Desistente,Desistente,Desistente,Desistente
4,Desistente,Ativo,Desistente,Desistente,Desistente,Desistente
5,Não Desistente,Ativo,Não Desistente,Não Desistente,Não Desistente,Não Desistente
6,Não Desistente,Ativo,Não Desistente,Não Desistente,Não Desistente,Não Desistente
7,Não Desistente,Ativo,Não Desistente,Não Desistente,Não Desistente,Não Desistente
8,Não Desistente,Ativo,Não Desistente,Não Desistente,Não Desistente,Não Desistente
9,Não Desistente,Ativo,Não Desistente,Não Desistente,Não Desistente,Não Desistente


## Contando os valores reais de preditos

In [16]:
def count_predicts_values(mod_predict):
    tp = df_all_predictions[(df_all_predictions[mod_predict] == 'Desistente') & 
                            (df_all_predictions['target'] == 'Desistente')].shape[0]
    tn = df_all_predictions[(df_all_predictions[mod_predict] == 'Não Desistente') &
                            (df_all_predictions['target'] == 'Não Desistente')].shape[0]
    fp = df_all_predictions[(df_all_predictions[mod_predict] == 'Desistente') &
                            (df_all_predictions['target'] == 'Não Desistente')].shape[0]
    fn = df_all_predictions[(df_all_predictions[mod_predict] == 'Não Desistente') & 
                            (df_all_predictions['target'] == 'Desistente')].shape[0]
    v = f'ND ND {tn}\nND D  {fp}\nD  ND {fn}\nD  D  {tp}\n'
    # return v
    return tn, fp, fn, tp

## Contando os alunos quanto as atividades

In [17]:
def active_report(mod_predict):
    asd_real = df_all_predictions.loc[(df_all_predictions['target'] == 'Desistente') &
                        (df_all_predictions['status'] == 'Inativo')].shape[0]

    asnd_real = df_all_predictions.loc[(df_all_predictions['target'] == 'Não Desistente') &
                        (df_all_predictions['status'] == 'Inativo')].shape[0]

    asd_predict = df_all_predictions.loc[(df_all_predictions[mod_predict] == 'Desistente') &
                        (df_all_predictions['status'] == 'Inativo')].shape[0]

    asnd_predict = df_all_predictions.loc[(df_all_predictions[mod_predict] == 'Não Desistente') &
                        (df_all_predictions['status'] == 'Inativo')].shape[0]               
    v = f'ASD-real     {asd_real}\nASND-real    {asnd_real}\nASD-predict  {asd_predict}\nASND-predict {asnd_predict}\n'
    # return v
    return asd_real, asnd_real, asd_predict, asnd_predict

## Micrométricas

In [18]:
print('Random Forest')
print(count_predicts_values('prediction_rf'))
print(active_report('prediction_rf'))
print('Extra Tree Classifier')
print(count_predicts_values('prediction_etc'))
print(active_report('prediction_etc'))
print('XGBoost')
print(count_predicts_values('prediction_xgb'))
print(active_report('prediction_xgb'))
print('Gaussian Naive Bayes')
print(count_predicts_values('prediction_gnb'))
print(active_report('prediction_gnb'))

Random Forest
(42, 0, 3, 8)
(2, 0, 2, 0)
Extra Tree Classifier
(42, 0, 3, 8)
(2, 0, 2, 0)
XGBoost
(37, 5, 3, 8)
(2, 0, 2, 0)
Gaussian Naive Bayes
(40, 2, 3, 8)
(2, 0, 2, 0)


In [19]:
import itertools
col = ['model', 'nd_nd', 'nd_d', 'd_nd', 'd_d', 'asd_real', 'asnd_real', 'asd_predict', 'asnd_predict']
micro_metrics = pd.DataFrame(columns=[col])
list2d =[['random_forest'], list(count_predicts_values('prediction_rf')), list(active_report('prediction_rf'))]
merged = list(itertools.chain.from_iterable(list2d))
micro_metrics.loc[0] = merged
list2d =[['extra_tree_classifier'], list(count_predicts_values('prediction_etc')), list(active_report('prediction_etc'))]
merged = list(itertools.chain.from_iterable(list2d))
micro_metrics.loc[1] = merged
list2d =[['xgboost'], list(count_predicts_values('prediction_xgb')), list(active_report('prediction_xgb'))]
merged = list(itertools.chain.from_iterable(list2d))
micro_metrics.loc[2] = merged
list2d =[['gaussian_naive_bayes'], list(count_predicts_values('prediction_gnb')), list(active_report('prediction_gnb'))]
merged = list(itertools.chain.from_iterable(list2d))
micro_metrics.loc[3] = merged


micro_metrics.to_csv('/content/drive/Shared drives/iniciacao_cientifica_2019_2020/dataframes_experimento/all_metrics/df3.csv', index=False)


## Exportando as predições

In [21]:
df_all_predictions.to_csv('/content/drive/Shared drives/iniciacao_cientifica_2019_2020/dataframes_experimento/all_preditions/df3_all_preditions.csv', index=False)