<a href="https://colab.research.google.com/github/vcwild/imersao-dados/blob/main/Aula_05_Imersao_Dados.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

## Set up default environment

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings

# set options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 150)
warnings.filterwarnings('ignore')

# set seed
seed = 42

## Download, unzip dataset, rename it and remove source

In [2]:
import zipfile

!wget https://github.com/alura-cursos/imersao-dados-2-2020/blob/master/MICRODADOS_ENEM_2019_SAMPLE_43278.csv?raw=true --no-check-certificate

# file = "df.csv.zip"
# save_path = "./"

# with zipfile.ZipFile(file, 'r') as zip_ref:
#     zip_ref.extractall(save_path)

# ! rm df.csv.zip && mv df.csv.txt df.csv
! mv "MICRODADOS_ENEM_2019_SAMPLE_43278.csv?raw=true" "microdados.csv"

--2020-10-24 04:13:39--  https://github.com/alura-cursos/imersao-dados-2-2020/blob/master/MICRODADOS_ENEM_2019_SAMPLE_43278.csv?raw=true
Resolving github.com (github.com)... 140.82.112.4
Connecting to github.com (github.com)|140.82.112.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github.com/alura-cursos/imersao-dados-2-2020/raw/master/MICRODADOS_ENEM_2019_SAMPLE_43278.csv [following]
--2020-10-24 04:13:40--  https://github.com/alura-cursos/imersao-dados-2-2020/raw/master/MICRODADOS_ENEM_2019_SAMPLE_43278.csv
Reusing existing connection to github.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/alura-cursos/imersao-dados-2-2020/master/MICRODADOS_ENEM_2019_SAMPLE_43278.csv [following]
--2020-10-24 04:13:40--  https://raw.githubusercontent.com/alura-cursos/imersao-dados-2-2020/master/MICRODADOS_ENEM_2019_SAMPLE_43278.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.1

## Import dataset


In [3]:
# df = pd.read_csv("df.csv", encoding="utf8", sep=";")
df = pd.read_csv("microdados.csv", encoding="utf8", sep=",")
df.columns.sort

<bound method Index.sort of Index(['NU_INSCRICAO', 'NU_ANO', 'CO_MUNICIPIO_RESIDENCIA',
       'NO_MUNICIPIO_RESIDENCIA', 'CO_UF_RESIDENCIA', 'SG_UF_RESIDENCIA',
       'NU_IDADE', 'TP_SEXO', 'TP_ESTADO_CIVIL', 'TP_COR_RACA',
       ...
       'Q016', 'Q017', 'Q018', 'Q019', 'Q020', 'Q021', 'Q022', 'Q023', 'Q024',
       'Q025'],
      dtype='object', length=136)>

# Machine Learning

## Define features

In [4]:
exams = ["NU_NOTA_CN", "NU_NOTA_CH", "NU_NOTA_MT", "NU_NOTA_LC", "NU_NOTA_REDACAO"]
features = ["NU_NOTA_CH", "NU_NOTA_LC", "NU_NOTA_CN", "NU_NOTA_REDACAO"]
target = "NU_NOTA_MT"

exams_dropna = df[exams].dropna()
X = exams_dropna[features]
y = exams_dropna[target]

## Train-test split

In [5]:
from sklearn.model_selection import train_test_split as tts

X_train, X_test, y_train, y_test =  tts(X, y, test_size=0.25, random_state=seed)

## Model Selection Using Grid Search

### Define Grid Search

In [6]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

seed = 42

pipeline = Pipeline([
    ('regressor', DecisionTreeRegressor())
])

search = [
    # {
    #     'regressor': [LinearSVR(random_state=seed)],
    #     'regressor__C': [0.5, 1, 3]
    # },
    {
        'regressor': [LinearRegression(n_jobs=-1)],
        'regressor__normalize': ['False', 'True']
    },
    # {
    #     'regressor': [Ridge(random_state=seed)],
    #     'regressor__alpha': [0.1, 0.5, 1],
    #     'regressor__normalize': ['False', 'True']
    # },
    # {
    #     'regressor': [Lasso(random_state=seed)],
    #     'regressor__alpha': [0.5, 1, 2],
    #     'regressor__normalize': ['False', 'True']
    # }
]

clf = GridSearchCV(pipeline, search, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')

### Apply Grid Search

#### Train

In [7]:
best_fit = clf.fit(X_train, y_train)

best_fit.best_estimator_

Pipeline(memory=None,
         steps=[('regressor',
                 LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1,
                                  normalize='False'))],
         verbose=False)

In [8]:
best_fit.best_score_

-5962.738232609517

#### Test

In [9]:
prediction = best_fit.predict(X_test)

## Model Selection Using Cross Validation

### Apply Cross Validate

In [10]:
# define scoring model
def confidence_interval(cv):
  metrics = pd.DataFrame()
  metrics['mse'] = cv['test_score']*-1
  mean = metrics.mse.mean()
  cv_std = metrics.mse.std()
  lower_lim = mean - 2*cv_std
  upper_lim = mean + 2*cv_std
  print(f"Confidence interval between:\n {lower_lim:.3f}|---------|{upper_lim:.3f}")

In [11]:
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold

# define model pipeline
def tree_regressor(level, metric='neg_mean_squared_error', splits=10, random_state=42, interval=False):

  seed = random_state

  kfold = KFold(n_splits = splits, shuffle=True)
  model = DecisionTreeRegressor(max_depth=level, random_state=seed)
  cv = cross_validate(model, X, y, cv=kfold, scoring=metric, n_jobs=-1, return_train_score=True)

  if interval:
    confidence_interval(cv)
  if metric == 'neg_mean_squared_error':
    print(f'{metric} = TRAIN {(cv["train_score"]*-1).mean():.5f} TEST {(cv["test_score"]*-1).mean():.5f}')
  else:
    print(f'{metric} = TRAIN {cv["train_score"].mean():.5f} TEST {(cv["test_score"]).mean():.5f}')

In [13]:
for i in range(1, 11):
  tree_regressor(level=i, interval=False, metric='r2')

r2 = TRAIN 0.34344 TEST 0.34273
r2 = TRAIN 0.45358 TEST 0.45167
r2 = TRAIN 0.49599 TEST 0.49203
r2 = TRAIN 0.51784 TEST 0.51345
r2 = TRAIN 0.53119 TEST 0.52429
r2 = TRAIN 0.54228 TEST 0.53245
r2 = TRAIN 0.55081 TEST 0.53709
r2 = TRAIN 0.55874 TEST 0.53723
r2 = TRAIN 0.56791 TEST 0.53179
r2 = TRAIN 0.57990 TEST 0.51911
