In [1]:

# Data
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import  train_test_split
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

# Classifiers
from sklearn.cluster import MiniBatchKMeans
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier

## Ensembles
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier

# Metrics
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import auc, roc_curve, RocCurveDisplay
from sklearn.model_selection import cross_val_score, RandomizedSearchCV

# Seleção de Feature 
from sklearn.feature_selection import SelectFromModel


In [2]:
SEED = 265123

In [3]:
df = pd.read_csv ('data_processed.csv')

In [4]:
X = df.drop(['Abandono_curso'], axis=1)
y = df['Abandono_curso']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=SEED)

In [6]:
from sklearn.utils.extmath import randomized_range_finder
models = {}
# decision tree classifier
models.update( {"dtc":DecisionTreeClassifier(random_state=SEED)} )
# Gaussian Naive Bayes - testar
models.update( {"gnb":GaussianNB()} )
# k-nearest neighbors
models.update( {"knn":KNeighborsClassifier()} )
# Support Vector Classification Linear
models.update( {"svcl":LinearSVC(random_state=SEED)} )
# Support Vector Classification Polynomial
models.update( {"svcp":SVC(kernel="poly", random_state=SEED, max_iter=3000)} )
# Support Vector Classification Rbf
models.update( {"svcr":SVC(kernel="rbf", random_state=SEED, max_iter=3000)} )
# Mini-Batch K-Means
#models.update( {"mbkmeans":MiniBatchKMeans(random_state=SEED)} )
# random forest
models.update( {"rf":RandomForestClassifier(random_state=SEED)} )
# Stochastic Gradient Descent Classifier
models.update( {"sgd":SGDClassifier(random_state=SEED)} )
# Logistic Regression
models.update( {"lr":LogisticRegression(random_state=SEED, max_iter=1000)} )
# Multilayer Perceptron
models.update( {"mlp":MLPClassifier(random_state=SEED, max_iter=1000)} ) 
# Extreme Gradiente Boost
models.update( {"xgb":XGBClassifier()} )

In [7]:
trained_models = {}
for cls in models.keys():
  print(f"Training model: {cls}")
  trained_models.update({cls:models[cls].fit(X_train, y_train)})

Training model: dtc
Training model: gnb
Training model: knn
Training model: svcl
Training model: svcp




Training model: svcr
Training model: rf
Training model: sgd
Training model: lr
Training model: mlp
Training model: xgb


In [8]:
predictions = {}
for cls in trained_models.keys():
  print(f"Retrieving predictions from model: {cls}")
  predictions.update({cls:trained_models[cls].predict(X_test)})

Retrieving predictions from model: dtc
Retrieving predictions from model: gnb
Retrieving predictions from model: knn
Retrieving predictions from model: svcl
Retrieving predictions from model: svcp
Retrieving predictions from model: svcr
Retrieving predictions from model: rf
Retrieving predictions from model: sgd
Retrieving predictions from model: lr
Retrieving predictions from model: mlp
Retrieving predictions from model: xgb


In [9]:
accuracies = {}
class_rep = {}
conf_matr = {}

for pred in predictions.keys():
  # accuracy scores
  accuracies.update( { pred: accuracy_score(y_test, predictions[pred]) } )
  # classification reports
  class_rep.update( { pred: classification_report(y_test, predictions[pred]) } )
  # confusion matrixes
  conf_matr.update( { pred: confusion_matrix(y_test, predictions[pred]) } )

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
accuracies

{'dtc': 0.6759956942949408,
 'gnb': 0.759956942949408,
 'knn': 0.6663078579117331,
 'svcl': 0.3573735199138859,
 'svcp': 0.7158234660925726,
 'svcr': 0.7158234660925726,
 'rf': 0.7631862217438106,
 'sgd': 0.28417653390742736,
 'lr': 0.7771797631862217,
 'mlp': 0.7631862217438106,
 'xgb': 0.7416576964477933}

In [11]:
for cf in class_rep.keys():
  print('------------------------------------------')
  print(f"Model: {cf}")
  print(class_rep[cf])

------------------------------------------
Model: dtc
              precision    recall  f1-score   support

           0       0.43      0.45      0.44       264
           1       0.78      0.77      0.77       665

    accuracy                           0.68       929
   macro avg       0.60      0.61      0.61       929
weighted avg       0.68      0.68      0.68       929

------------------------------------------
Model: gnb
              precision    recall  f1-score   support

           0       0.68      0.30      0.41       264
           1       0.77      0.94      0.85       665

    accuracy                           0.76       929
   macro avg       0.72      0.62      0.63       929
weighted avg       0.75      0.76      0.72       929

------------------------------------------
Model: knn
              precision    recall  f1-score   support

           0       0.32      0.16      0.21       264
           1       0.72      0.87      0.79       665

    accuracy        

In [12]:
cv_models = {}
# decision tree classifier
cv_models.update( {"dtc":DecisionTreeClassifier(random_state=SEED)} )
# Gaussian Naive Bayes - testar
cv_models.update( {"gnb":GaussianNB()} )
# k-nearest neighbors
cv_models.update( {"knn":KNeighborsClassifier()} )
# Support Vector Classification Linear
cv_models.update( {"svcl":LinearSVC(random_state=SEED, max_iter=3000)} )
# Support Vector Classification Polynomial
cv_models.update( {"svcp":SVC(kernel="poly", random_state=SEED, max_iter=3000)} )
# Support Vector Classification Rbf
cv_models.update( {"svcr":SVC(kernel="rbf", random_state=SEED, max_iter=3000)} )
# Mini-Batch K-Means
#cv_models.update( {"mbkmeans":MiniBatchKMeans(random_state=SEED)} )
# random forest
cv_models.update( {"rf":RandomForestClassifier(random_state=SEED)} )
# Stochastic Gradient Descent Classifier
cv_models.update( {"sgd":SGDClassifier(random_state=SEED, max_iter=3000)} )
# Logistic Regression
cv_models.update( {"lr":LogisticRegression(random_state=SEED, max_iter=1000)} )
# Multilayer Perceptron
cv_models.update( {"mlp":MLPClassifier(random_state=SEED, max_iter=1000)} ) 
# Extreme Gradiente Boost
cv_models.update( {"xgb":XGBClassifier()} )

In [13]:
import numpy as np

In [14]:
cv_scores = {}

for cls in cv_models.keys():
  print(f"Training cross validation of model: {cls}")
  score = np.mean(cross_val_score(cv_models[cls], X_train, y_train))
  cv_scores.update( {cls: score} )

Training cross validation of model: dtc
Training cross validation of model: gnb
Training cross validation of model: knn
Training cross validation of model: svcl




Training cross validation of model: svcp
Training cross validation of model: svcr
Training cross validation of model: rf
Training cross validation of model: sgd
Training cross validation of model: lr
Training cross validation of model: mlp
Training cross validation of model: xgb


In [15]:
cv_scores

{'dtc': 0.6569815254531766,
 'gnb': 0.7527834082997111,
 'knn': 0.6734754155325188,
 'svcl': 0.4804585497062476,
 'svcp': 0.7215645772604132,
 'svcr': 0.7215645772604132,
 'rf': 0.7649742926455733,
 'sgd': 0.7215645772604132,
 'lr': 0.7743106632433093,
 'mlp': 0.6741588000231655,
 'xgb': 0.7308880780937306}

In [16]:
mlp = MLPClassifier()

In [17]:
param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50)],
    'activation': ['relu', 'tanh', 'logistic'],
    'alpha': np.logspace(-4, 0, num=5),  # Varia de 0.0001 a 1
    'learning_rate': ['constant', 'invscaling', 'adaptive'],
    'max_iter': [1000],
}

In [18]:
random_search = RandomizedSearchCV(
    estimator=mlp,
    param_distributions=param_grid,
    n_iter=200,
    n_jobs=-1,
    cv=5,
    verbose=2,
    random_state=SEED
)

In [19]:
random_search.fit(X_train, y_train)



Fitting 5 folds for each of 180 candidates, totalling 900 fits


RandomizedSearchCV(cv=5, estimator=MLPClassifier(), n_iter=200, n_jobs=-1,
                   param_distributions={'activation': ['relu', 'tanh',
                                                       'logistic'],
                                        'alpha': array([1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00]),
                                        'hidden_layer_sizes': [(50,), (100,),
                                                               (50, 50),
                                                               (100, 50)],
                                        'learning_rate': ['constant',
                                                          'invscaling',
                                                          'adaptive'],
                                        'max_iter': [1000]},
                   random_state=265123, verbose=2)

In [20]:
random_search.best_params_

{'max_iter': 1000,
 'learning_rate': 'invscaling',
 'hidden_layer_sizes': (100,),
 'alpha': 1.0,
 'activation': 'relu'}

In [21]:
random_search.best_score_

0.7577961815408969