# Summary
Trabalho 4
<br>Objetivos: aplicar os vários classificadores visto em aula. Busca de hiperparametros. Nested cross validation

Membros do Grupo:
*   Isaque Elcio de Souza — RA: 225310
*   Matheus Vinicius Correa — RA: 225241
*   Thiago Bruschi Martins — RA: 120212




# Preprocessing

In [1]:
import io
import requests
import numpy as np
import pandas as pd

# Select categoricals columns of the input data
dtypes = {}
categoricals = ['V1', 'V4','V5','V6','V8','V9','V11','V12']
for col in categoricals:
  dtypes[col] = 'category'

# Read data and convert the categorical columns
url = "https://www.ic.unicamp.br/~wainer/cursos/1s2021/432/dados4.csv"
s = requests.get(url).content
df = pd.read_csv(io.StringIO(s.decode('utf-8')), dtype=dtypes)
print(df[categoricals].describe())

# Create dummy variables and detach the output from the input data 
dummy = pd.get_dummies(df, drop_first=True)
X = dummy.drop(['V15'], axis=1)
y = dummy['V15']

         V1   V4   V5   V6   V8   V9  V11  V12
count   690  690  690  690  690  690  690  690
unique    2    3   14    8    2    2    2    3
top       1    2    c   bb    1    0    0    g
freq    468  525  146  408  361  395  374  625


# Functions

In [2]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

import warnings
warnings.filterwarnings("ignore")

results = {}

def round_dict(d, decimals):
  print(d)
  for keys, values in d.items():
    if type(d[keys] == float):
      d[keys] = round(values, decimals)
  return d

# Test 10 random parameters in the estimator (model)
def random_search(estimator, param_distributions, X, y):
  if(len(param_distributions) > 0):
    rscv = RandomizedSearchCV(estimator(), param_distributions=param_distributions, scoring='roc_auc', n_iter=min(10,len(param_distributions)), cv=3, n_jobs=-1)
    r = rscv.fit(X,y)   
    #round_dict(r.best_params_, 7)
    return r.best_score_,  r.best_params_ 
  else:
    r = cross_val_score(estimator(), X, y, cv=3, scoring='roc_auc')
    return np.mean(r), {}  

# Test the estimator, save the best score and best params into results
def model_test(estimator, params, X, y):
  name = type(estimator()).__name__
  if len(params) == 0:
    name = name + '_default'
  best_score, best_params = random_search(estimator, params, X, y)

  if name not in results:
    results[name] = {}
    results[name]['ValidationScore'] = best_score
    results[name]['BestParams'] = best_params
  else:
    if best_score > results[name]['ValidationScore']:
      results[name]['ValidationScore'] = best_score
      results[name]['BestParams'] = best_params

  return name, best_params

# Nested Cross Validation
Search for hiperpams using nested cross validation

In [3]:
import random
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from scipy.stats import loguniform
from sklearn.metrics import roc_auc_score
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

results = {}
OUTER_LOOP = 4

for _ in range(OUTER_LOOP):  # Outter loop: split the data into train and test data
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

  scaler = StandardScaler()
  X_train_scaled = scaler.fit_transform(X_train)

  # Define the params of all classes
  logistic_params = {'C': loguniform(10e-3, 10e3)} 
  linear_svm_param = {'C': loguniform(2e-15,2e15)}  
  svm_param = {'C':loguniform(2e-15,2e15),'gamma':loguniform(2e-9,2e3)} 
  MLP_params = {'hidden_layer_sizes':(5,8,11,14,17,20) } 
  dtree_params = {'ccp_alpha':[random.uniform(0.0, 0.4) for i in range(10)]} 
  rf_params = {'n_estimators':[10, 100, 1000], 'max_features':[5, 10, 22] } 
  gbm_params = {
    'n_estimators': np.random.randint(5, 100, 10),
    'max_features':[random.uniform(0.01, 0.3) for i in range(10)],
    'max_depth':[2, 3]
    }

  # Define the tuple as (model class, params)
  models = [(LogisticRegression, {}), (LogisticRegression, logistic_params), 
            (LinearDiscriminantAnalysis, {}), (QuadraticDiscriminantAnalysis,{}),
            (LinearSVC, linear_svm_param), (SVC, svm_param), (GaussianNB, {}), 
            (MLPClassifier, MLP_params), (DecisionTreeClassifier, dtree_params), 
            (RandomForestClassifier, rf_params), (GradientBoostingClassifier, gbm_params)]
  
  # InnerLoop: Train, validadte and test each model
  for estimator, params in models:
    name, best_params = model_test(estimator, params, X_train_scaled, y_train)
    model = estimator(**best_params)
    model.fit(X_train, y_train) # Train the best estimator with
    score = roc_auc_score(y_test, model.predict(X_test))
    
    # Save the score as a sum
    if 'TestScore' not in results[name]:
      results[name]['TestScore'] = score
    else:     
        results[name]['TestScore'] += score

# The final score is the avegare of the test scores
for name in results:
  results[name]['TestScore'] /= OUTER_LOOP

# Final Report

In [5]:
# Final report
report = pd.DataFrame(results).transpose().sort_values(by='TestScore', ascending=False)
report

Unnamed: 0,ValidationScore,BestParams,TestScore
LinearDiscriminantAnalysis_default,0.922696,{},0.881834
LogisticRegression_default,0.915589,{},0.868607
LogisticRegression,0.91926,{'C': 0.029404889683009933},0.867284
DecisionTreeClassifier,0.861483,{'ccp_alpha': 0.22896682801621282},0.861552
RandomForestClassifier,0.933157,"{'n_estimators': 1000, 'max_features': 5}",0.854497
GradientBoostingClassifier,0.942377,"{'n_estimators': 63, 'max_features': 0.1929853...",0.853505
GaussianNB_default,0.850923,{},0.810406
QuadraticDiscriminantAnalysis_default,0.807591,{},0.766755
MLPClassifier,0.897255,{'hidden_layer_sizes': 8},0.743717
LinearSVC,0.911667,{'C': 3.667127741742828e-15},0.668871
