### Imports

In [5]:
import sklearn
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import classification_report, confusion_matrix 
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_breast_cancer 
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

### Implementação classe modelo

#### pre_processing_data_average

+ Os 2 atributos escolhidos serão a média dos créditos feitos por semestre e a média das notas de cada semestre
  + Estes atributos foram escolhidos por serem os que mais provavelmente irão afetar a "Failure"

In [40]:
class modelo:
    """Helpers that prepare the dropout data and evaluate a tuned decision tree.

    Every method is a ``staticmethod``: none of them needs instance state, and
    this keeps ``modelo.method(...)`` (how the notebook calls them) working
    while also allowing calls through an instance.
    """

    @staticmethod
    def pre_processing_data(data_train):
        """Split the frame into features and target, using every feature.

        Parameters
        ----------
        data_train : pandas.DataFrame
            Must contain the columns ``'Failure'`` and ``'Id'``.

        Returns
        -------
        (X, y)
            ``X`` is ``data_train`` without ``'Failure'`` and ``'Id'`` (the id
            column is not relevant as a feature); ``y`` is the ``'Failure'``
            column. ``data_train`` itself is not modified.
        """
        X = data_train.drop(columns=['Failure', 'Id'])
        y = data_train.Failure

        return X, y

    @staticmethod
    def pre_processing_data_average(data_train):
        """Reduce the frame to two per-row averages plus the target.

        ``'Classifications_mean'`` is the row-wise mean of the columns that
        remain after removing ``'Failure'``, ``'Id'``, ``'Program'`` and every
        column whose name matches ``enrol`` or ``complete`` (per the notebook
        text, the per-semester grades). ``'ects'`` is the row-wise mean of the
        columns that remain after removing the same three columns and every
        column whose name matches ``enrol`` or ``grade`` (per the notebook
        text, the credits done per semester).

        Parameters
        ----------
        data_train : pandas.DataFrame
            Must contain ``'Failure'``, ``'Id'`` and ``'Program'``.

        Returns
        -------
        (X, y)
            ``X`` has exactly the columns ``'Classifications_mean'`` and
            ``'ects'``; ``y`` is the ``'Failure'`` column.
        """
        # Column groups are looked up on the untouched input frame.
        enrol_cols = list(data_train.filter(regex='enrol'))
        complete_cols = list(data_train.filter(regex='complete'))
        grade_cols = list(data_train.filter(regex='grade'))

        base = data_train.drop(columns=['Failure', 'Id', 'Program'])
        classifications = base.drop(columns=enrol_cols + complete_cols)
        ects = base.drop(columns=enrol_cols + grade_cols)

        # Build X from named Series instead of inserting temporary columns
        # into the caller's frame and popping them again.
        X = pd.concat(
            [classifications.mean(axis=1).rename('Classifications_mean'),
             ects.mean(axis=1).rename('ects')],
            axis=1)
        y = data_train.Failure

        return X, y

    @staticmethod
    def predict(X_train, X_test, y_train, y_test, random_state=None):
        """Grid-search a decision tree, then print its test-set metrics.

        Prints, in order: the best parameters found, the precision and the
        recall taken from index 1 of the per-class scores, and the full
        classification report. Nothing is returned.

        Parameters
        ----------
        X_train, y_train
            Data the grid search is fitted on.
        X_test, y_test
            Held-out data used for the printed metrics.
        random_state : int or None, optional
            Forwarded to ``DecisionTreeClassifier``. The default ``None``
            keeps the previous (unseeded) behaviour; pass an int to make
            repeated runs comparable.
        """
        # Depths 1 and 2 were removed from the grid because recall in the
        # second part (two averaged features) was low with them.
        param_grid = {'max_depth': [4, 6, 8, 10],
                      'min_samples_split': [2, 4, 6, 8, 10]}

        grid = GridSearchCV(DecisionTreeClassifier(random_state=random_state),
                            param_grid, refit=True, verbose=0, n_jobs=-1)

        # refit=True leaves the best estimator fitted on the whole training
        # set, so grid.predict() below uses the tuned tree.
        grid.fit(X_train, y_train)

        print(grid.best_params_)
        grid_predictions = grid.predict(X_test)

        # average=None yields one score per class; index 1 is the second one.
        print(
            f"Precisão: {precision_score(y_test, grid_predictions, average=None)[1]}")

        print(
            f"Cobertura: {recall_score(y_test, grid_predictions, average=None)[1]}")

        print(classification_report(y_test, grid_predictions, zero_division=1))


### Predict do modelo, testes de precisão e cobertura

In [41]:
data_train = pd.read_csv("test-files/dropout-trabalho2.csv")

# Part one trains on every feature (minus Id); part two trains on the two
# per-row averages. Both parts use the same 80/20 split settings so the two
# printed reports are directly comparable.
for prepare in (modelo.pre_processing_data, modelo.pre_processing_data_average):
    X, y = prepare(data_train)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=1)
    modelo.predict(X_train, X_test, y_train, y_test)



{'max_depth': 4, 'min_samples_split': 2}
Precisão: 0.8363636363636363
Cobertura: 0.8846153846153846
              precision    recall  f1-score   support

           0       0.96      0.94      0.95       318
           1       0.84      0.88      0.86       104

    accuracy                           0.93       422
   macro avg       0.90      0.91      0.91       422
weighted avg       0.93      0.93      0.93       422

{'max_depth': 4, 'min_samples_split': 8}
Precisão: 0.92
Cobertura: 0.8846153846153846
              precision    recall  f1-score   support

           0       0.96      0.97      0.97       318
           1       0.92      0.88      0.90       104

    accuracy                           0.95       422
   macro avg       0.94      0.93      0.94       422
weighted avg       0.95      0.95      0.95       422



#### GridSearchCV using Support Vector Machine

In [None]:
import sklearn
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import classification_report, confusion_matrix 
from sklearn.datasets import load_breast_cancer 
from sklearn.svm import SVC 
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import pandas as pd

# Load the dataset and split it into training and testing sets.
data_train = pd.read_csv("test-files/dropout-trabalho2.csv")
# 'Id' is only a row identifier, so it is removed along with the target
# (the same feature set modelo.pre_processing_data produces).
X = data_train.drop(columns=['Failure', 'Id'])
y = data_train.Failure

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=1)

# Baseline: SVC with its default hyper-parameters, no grid search.
model = SVC()
model.fit(X_train, y_train)

predictions = model.predict(X_test)
print(classification_report(y_test, predictions, zero_division=1))

# Only C is searched. gamma is the coefficient of the rbf/poly/sigmoid
# kernels and is ignored by the linear kernel, so listing it here only
# repeated identical fits.
param_grid = {'C': [0.1, 1, 10, 100],
              'kernel': ['linear']}

grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=3, n_jobs=-1)

# Fit the grid search; refit=True keeps the best estimator for predict().
grid.fit(X_train, y_train)

print(grid.best_params_)
grid_predictions = grid.predict(X_test)

# zero_division=1 matches the baseline report above.
print(classification_report(y_test, grid_predictions, zero_division=1))

#### GridSearchCV using DecisionTrees

In [None]:
import sklearn
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import classification_report, confusion_matrix 
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_breast_cancer 
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import pandas as pd

# Load the dataset and split it into training and testing sets.
data_train = pd.read_csv("test-files/dropout-trabalho2.csv")
# 'Id' is only a row identifier, so it is removed along with the target
# (the same feature set modelo.pre_processing_data produces).
X = data_train.drop(columns=['Failure', 'Id'])
y = data_train.Failure

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=1)

# Baseline: an untuned tree. The seed is fixed so re-running the cell gives
# the same report.
model = DecisionTreeClassifier(random_state=1)
model.fit(X_train, y_train)

predictions = model.predict(X_test)
print(classification_report(y_test, predictions, zero_division=1))

# Parameter ranges explored by the grid search.
param_grid = {'max_depth': [1, 2, 4, 6, 8, 10],
              'min_samples_split': [2, 4, 6, 8, 10]}

grid = GridSearchCV(DecisionTreeClassifier(random_state=1), param_grid,
                    refit=True, verbose=3, n_jobs=-1)

# Fit the grid search; refit=True keeps the best estimator for predict().
grid.fit(X_train, y_train)

print(grid.best_params_)
grid_predictions = grid.predict(X_test)

# zero_division=1 matches the baseline report above.
print(classification_report(y_test, grid_predictions, zero_division=1))


#### Classification with DecisionTrees using the parameters given above

In [None]:
import sklearn
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import classification_report, confusion_matrix 
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_breast_cancer 
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import pandas as pd

# Load the dataset and split it into training and testing sets.
data_train = pd.read_csv("test-files/dropout-trabalho2.csv")
# 'Id' is only a row identifier, so it is removed along with the target
# (the same feature set modelo.pre_processing_data produces).
X = data_train.drop(columns=['Failure', 'Id'])
y = data_train.Failure

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=1)

# Preview the inputs instead of printing the whole frame and series.
print(X.head())
print(y.head())

# NOTE(review): the section heading says this cell uses the parameters found
# above, but the tree is built with default hyper-parameters. Confirm which
# max_depth / min_samples_split were intended and pass them here.
model = DecisionTreeClassifier(random_state=1)
model.fit(X_train, y_train)

predictions = model.predict(X_test)
print(classification_report(y_test, predictions, zero_division=1))