In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas
import numpy as np
from sklearn import tree
import sklearn.linear_model as lm
from sklearn import preprocessing, model_selection, pipeline
from sklearn.model_selection import validation_curve
from sklearn import metrics
from sklearn.inspection import permutation_importance

from sklearn import svm



from ipywidgets import interact, interactive, fixed
import ipywidgets as widgets


np.set_printoptions(precision=3)
pandas.set_option("display.precision", 3)



classification_scores = ['accuracy','precision_weighted',
                         'recall_weighted', 'roc_auc_ovr_weighted',
                         'f1_weighted']


regression_scores = ['explained_variance','neg_mean_absolute_error',
                     'neg_mean_absolute_percentage_error', 'r2',
                     'neg_root_mean_squared_error']

%matplotlib inline

# Classificacao

## Vinhos


### Leitura da Base 

In [2]:
df_wine = pandas.read_csv('../Data/dataset_vinhos.csv',sep=';')


wine_target_col = 'target'

wine_label_map = df_wine[['target', 'target_label']].drop_duplicates()

drop_cols = ['target_label']
df_wine.drop(drop_cols, axis=1, inplace=True)
print(df_wine.shape)
df_wine.head()

(5320, 13)


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,target,type
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.998,3.51,0.56,9.4,0,red
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.997,3.2,0.68,9.8,0,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,0,red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,1,red
4,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.998,3.51,0.56,9.4,0,red


### Transformação Variáveis Categóricas

In [3]:
categorical_cols = ['type']
encoder_map = {}

for cname in categorical_cols:
    encoder = preprocessing.OneHotEncoder(sparse=False)
    transformed = encoder.fit_transform(df_wine[[cname]])
    ohe_df = pandas.DataFrame(transformed, columns=[cname+'_'+cat for cat in encoder.categories_[0]])
    encoder_map[cname] = encoder
    
    df_wine = pandas.concat([df_wine, ohe_df], axis=1).drop(cname, axis=1)
df_wine.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,target,type_red,type_white
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.998,3.51,0.56,9.4,0,1.0,0.0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.997,3.2,0.68,9.8,0,1.0,0.0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,0,1.0,0.0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,1,1.0,0.0
4,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.998,3.51,0.56,9.4,0,1.0,0.0


## Treinamento dos modelos

In [4]:
Y = df_wine[wine_target_col]
X = df_wine.drop(wine_target_col, axis=1)

# train/test
xtrain, xtest, ytrain, ytest = model_selection.train_test_split(X, Y, test_size=0.2)

cvfold = model_selection.StratifiedKFold(n_splits = 10, random_state = 0, shuffle=True)


### Regressão Logística 

#### Curva de Validação-Cruzada 

In [6]:

grid_search_lr = {
    'C': [0.01, 0.1, 1, 10, 100],
    'kernel':['linear', 'rbf', 'poly'],
}


def plot_parameter_validation_curve(param_name, grid_search,
                                    model, model_name, scoring,
                                    logx):
    print('Parameter:', param_name)
    print('GridSearch:', grid_search[param_name])
    print('Scoring:', scoring)
    plt.figure(figsize=(6,4))
    train_scores, test_scores = validation_curve(model,
                                                 X = xtrain, 
                                                 y = ytrain, 
                                                 param_name=param_name, 
                                                 param_range= grid_search[param_name],
                                                 scoring=scoring,
                                                 cv=cvfold,
                                                 n_jobs=-1)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.title("Curva Validação Modelo " + model_name)
    plt.xlabel(param_name)
    plt.ylabel("Score ("+scoring+")")
    if logx:
        plt.semilogx(grid_search[param_name], train_scores_mean,'-o', label="Treino",
                     color="darkorange", lw=2)
        plt.semilogx(grid_search[param_name], test_scores_mean,'-o', label="Validação-Cruzada",
                     color="navy", lw=2)
    else:
        plt.plot(grid_search[param_name], train_scores_mean,'-o', label="Treino",
                     color="darkorange", lw=2)
        plt.plot(grid_search[param_name], test_scores_mean,'-o', label="Validação-Cruzada",
                 color="navy", lw=2)
    plt.fill_between(grid_search[param_name], train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.2,
                     color="darkorange", lw=2)
    plt.fill_between(grid_search[param_name], test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.2,
                     color="navy", lw=2)
    plt.legend(loc='best')
    plt.grid(True)
    plt.show()
    
    
#     
model_wine = svm.SVC(kernel='linear', C=0.1, probability=True)
    
    
interact(plot_parameter_validation_curve,
         param_name = list(grid_search_lr.keys()), 
         grid_search = fixed(grid_search_lr),
         model = fixed(svm.SVC()),
         model_name = fixed('SVM'),
         scoring = classification_scores,
         logx = True
)


interactive(children=(Dropdown(description='param_name', options=('C', 'kernel'), value='C'), Dropdown(descrip…

<function __main__.plot_parameter_validation_curve(param_name, grid_search, model, model_name, scoring, logx)>

In [6]:

grid_search_lr = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2', 'elasticnet', 'none']
}


def plot_parameter_validation_curve(param_name, grid_search,
                                    model, model_name, scoring,
                                    logx):
    print('Parameter:', param_name)
    print('GridSearch:', grid_search[param_name])
    print('Scoring:', scoring)
    plt.figure(figsize=(6,4))
    train_scores, test_scores = validation_curve(model,
                                                 X = xtrain, 
                                                 y = ytrain, 
                                                 param_name=param_name, 
                                                 param_range= grid_search[param_name],
                                                 scoring=scoring,
                                                 cv=cvfold,
                                                 n_jobs=-1)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.title("Curva Validação Modelo " + model_name)
    plt.xlabel(param_name)
    plt.ylabel("Score ("+scoring+")")
    if logx:
        plt.semilogx(grid_search[param_name], train_scores_mean,'-o', label="Treino",
                     color="darkorange", lw=2)
        plt.semilogx(grid_search[param_name], test_scores_mean,'-o', label="Validação-Cruzada",
                     color="navy", lw=2)
    else:
        plt.plot(grid_search[param_name], train_scores_mean,'-o', label="Treino",
                     color="darkorange", lw=2)
        plt.plot(grid_search[param_name], test_scores_mean,'-o', label="Validação-Cruzada",
                 color="navy", lw=2)
    plt.fill_between(grid_search[param_name], train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.2,
                     color="darkorange", lw=2)
    plt.fill_between(grid_search[param_name], test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.2,
                     color="navy", lw=2)
    plt.legend(loc='best')
    plt.grid(True)
    plt.show()
    
interact(plot_parameter_validation_curve,
         param_name = list(grid_search_lr.keys()), 
         grid_search = fixed(grid_search_lr),
         model = fixed(lm.LogisticRegression(solver='liblinear')),
         model_name = fixed('Regressão Logística'),
         scoring = classification_scores,
         logx = True
)


interactive(children=(Dropdown(description='param_name', options=('C', 'penalty'), value='C'), Dropdown(descri…

<function __main__.plot_parameter_validation_curve(param_name, grid_search, model, model_name, scoring, logx)>

#### Curva de Aprendizado 

In [7]:
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit


train_sizes = np.linspace(.1, 1.0, 10)


def plot_learning_curve(model, model_name, scoring, train_sizes):
    plt.figure(figsize=(6,4))
    ax = plt.gca()
        
    ax.set_title('Curva de Aprendizado (%s)'%model_name)
    ax.set_xlabel("Exemplos do Treino")
    ax.set_ylabel("Score (" + scoring + ")")

    train_sizes, train_scores, test_scores = learning_curve(model,
                                                            X = xtrain,
                                                            y = ytrain,
                                                            cv=cvfold,
                                                            n_jobs=-1,
                                                            train_sizes=train_sizes,
                                                            scoring=scoring)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    
    # Plot learning curve
    ax.grid()
    ax.plot(train_sizes, train_scores_mean, 'o-', color="darkorange",
                 label="Treino")
    ax.plot(train_sizes, test_scores_mean, 'o-', color="navy",
                 label="Validação-cruzada")
    ax.fill_between(train_sizes, train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std, alpha=0.1,
                         color="darkorange")
    ax.fill_between(train_sizes, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.1,
                         color="navy")
    ax.legend(loc="best")
    plt.show()

    
interact(plot_learning_curve,
         model = fixed(lm.LogisticRegression(solver='liblinear', C=1)),
         model_name = fixed('Regressão Logística'),
         scoring = classification_scores,
         train_sizes=fixed(train_sizes)
)


interactive(children=(Dropdown(description='scoring', options=('accuracy', 'precision_weighted', 'recall_weigh…

<function __main__.plot_learning_curve(model, model_name, scoring, train_sizes)>

### Árvore de Decisão 

#### Curva de Validação-Cruzada 

In [8]:

grid_search_dt = {
    'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10],
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [2, 4, 6, 8],
    'max_features': ["auto", "sqrt", "log2"],
}


interact(plot_parameter_validation_curve,
         param_name = list(grid_search_dt.keys()), 
         grid_search = fixed(grid_search_dt),
         model = fixed(tree.DecisionTreeClassifier()),
         model_name = fixed('Árvore de Decisão'),
         scoring = classification_scores,
         logx = False
)


interactive(children=(Dropdown(description='param_name', options=('max_depth', 'criterion', 'min_samples_split…

<function __main__.plot_parameter_validation_curve(param_name, grid_search, model, model_name, scoring, logx)>

#### Curva de Aprendizado 

In [9]:
train_sizes = np.linspace(.1, 1.0, 10)
    

interact(plot_learning_curve,
         model = fixed(tree.DecisionTreeClassifier(max_depth=5,
                                                   )),
         model_name = fixed('Árvore de Decisão'),
         scoring = classification_scores,
         train_sizes=fixed(train_sizes)
)


interactive(children=(Dropdown(description='scoring', options=('accuracy', 'precision_weighted', 'recall_weigh…

<function __main__.plot_learning_curve(model, model_name, scoring, train_sizes)>

## Íris 

## Leitura da Base 

In [10]:
df_iris = pandas.read_csv('../Data/dataset_iris.csv',sep=';')

iris_label_map = df_iris[['target', 'target_label']].drop_duplicates()

drop_cols = ['target_label']
df_iris.drop(drop_cols, axis=1, inplace=True)

print(df_iris.shape)

iris_target_col = 'target'


df_iris.head()

(150, 5)


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


## Treinamento dos Modelos 

In [11]:

Y = df_iris[iris_target_col]
X = df_iris.drop(iris_target_col, axis=1)
iris_feature_names = list(X.columns)

# train/test
xtrain, xtest, ytrain, ytest = model_selection.train_test_split(X, Y, test_size=0.2)

cvfold = model_selection.StratifiedKFold(n_splits = 10, random_state = 0, shuffle=True)

### Regressão Logística 

#### Curva de Validação-Cruzada 

In [12]:

grid_search_lr = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2', 'elasticnet', 'none']
}

interact(plot_parameter_validation_curve,
         param_name = list(grid_search_lr.keys()), 
         grid_search = fixed(grid_search_lr),
         model = fixed(lm.LogisticRegression(solver='liblinear')),
         model_name = fixed('Regressão Logística'),
         scoring = classification_scores,
         logx = True
)


interactive(children=(Dropdown(description='param_name', options=('C', 'penalty'), value='C'), Dropdown(descri…

<function __main__.plot_parameter_validation_curve(param_name, grid_search, model, model_name, scoring, logx)>

#### Curva de Aprendizado 

In [13]:
train_sizes = np.linspace(.1, 1.0, 10)
    
interact(plot_learning_curve,
         model = fixed(lm.LogisticRegression(solver='liblinear', C=1)),
         model_name = fixed('Regressão Logística'),
         scoring = classification_scores,
         train_sizes=fixed(train_sizes)
)


interactive(children=(Dropdown(description='scoring', options=('accuracy', 'precision_weighted', 'recall_weigh…

<function __main__.plot_learning_curve(model, model_name, scoring, train_sizes)>

### Árvore de Decisão 

#### Curva de Validação-Cruzada 

In [14]:
grid_search_dt = {
    'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10],
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [2, 4, 6, 8],
    'max_features': ["auto", "sqrt", "log2"],
}

interact(plot_parameter_validation_curve,
         param_name = list(grid_search_dt.keys()), 
         grid_search = fixed(grid_search_dt),
         model = fixed(tree.DecisionTreeClassifier()),
         model_name = fixed('Árvore de Decisão'),
         scoring = classification_scores,
         logx = False
)

interactive(children=(Dropdown(description='param_name', options=('max_depth', 'criterion', 'min_samples_split…

<function __main__.plot_parameter_validation_curve(param_name, grid_search, model, model_name, scoring, logx)>

#### Curva de Aprendizado 

In [15]:
train_sizes = np.linspace(.1, 1.0, 10)
    

interact(plot_learning_curve,
         model = fixed(tree.DecisionTreeClassifier(max_depth=2,
                                                   )),
         model_name = fixed('Árvore de Decisão'),
         scoring = classification_scores,
         train_sizes=fixed(train_sizes)
)


interactive(children=(Dropdown(description='scoring', options=('accuracy', 'precision_weighted', 'recall_weigh…

<function __main__.plot_learning_curve(model, model_name, scoring, train_sizes)>

### Distribuições 

In [16]:
df = df_wine_tr.merge(wine_label_map, on='target')
ax = sns.jointplot(data=df, x='alcohol',y='volatile acidity',
              hue='target_label', kind='kde')

ax.ax_joint.axvline(x=10.725, linestyle='--', color='k')
ax.ax_joint.axhline(y=0.287, linestyle='--', color='k')


NameError: name 'df_wine_tr' is not defined

### Caminho da Árvore 

In [None]:
from dtreeviz.trees import dtreeviz # remember to load the package

viz = dtreeviz(model_wine, df_wine_tr[wine_feature_names],
               df_wine_tr[wine_target_col],
                target_name="Qualidade Vinho",
                feature_names=wine_feature_names,
                class_names=list(wine_label_map.target_label.values))

viz.save("./teste.svg")
viz

### Importância das Variáveis - Entropia

In [None]:
plt.figure(figsize=(15,6))


plt.subplot(1,2,1)
idx = np.argsort(model_wine.feature_importances_)

plt.barh(np.arange(idx.shape[0]), model_wine.feature_importances_[idx]*100)

plt.yticks(np.arange(idx.shape[0]), np.array(wine_feature_names)[idx])
plt.title('Importância das Variáveis por Entropia')
plt.grid()
plt.xlabel('Importância Relativa [%]')
plt.tight_layout()

plt.subplot(1,2,2)
r = permutation_importance(model_wine, xtest, ytest,
                           n_repeats=30,
                        random_state=0)
importance =  r.importances_mean
idx = np.argsort(importance)
plt.barh(np.arange(idx.shape[0]), importance[idx]*100)
plt.yticks(np.arange(idx.shape[0]), np.array(wine_feature_names)[idx])
plt.title('Importância das Variáveis por Permutação')
plt.grid()
plt.xlabel('Importância Relativa [%]')
plt.tight_layout()

# Regressao 

## Clima 

### Leitura dos dados 

In [None]:
df_weather = pandas.read_csv('../Data/dataset_clima.csv',sep=';')

drop_cols = ['Temperature (C)']
df_weather.drop(drop_cols, axis=1, inplace=True)

print(df_weather.shape)

weather_target_col = 'Apparent Temperature (C)'

df_weather.head()

### Treinamento dos Modelos 

In [None]:

Y = df_weather[weather_target_col]
X = df_weather.drop(weather_target_col, axis=1)
weather_feature_names = list(X.columns)

# train/test
xtrain, xtest, ytrain, ytest = model_selection.train_test_split(X, Y, test_size=0.2)


cvfold = model_selection.KFold(n_splits = 10, random_state = 0, shuffle=True)

### Regressão Linear 

#### Curva de Validação-Cruzada 

In [None]:
grid_search_el = {
    'alpha': [0.001, 0.01, 0.1, 1],
    'l1_ratio': np.linspace(0, 1, 5),
    'selection': ['cyclic', 'random'],
}

interact(plot_parameter_validation_curve,
         param_name = list(grid_search_el.keys()), 
         grid_search = fixed(grid_search_el),
         model = fixed(lm.ElasticNet()),
         model_name = fixed('ElasticNet'),
         scoring = regression_scores,
         logx = True
)

#### Curva de Aprendizado 

In [None]:
train_sizes = np.linspace(.1, 1.0, 10)
    

interact(plot_learning_curve,
         model = fixed(lm.ElasticNet(alpha=1e-3)),
         model_name = fixed('ElasticNet'),
         scoring = regression_scores,
         train_sizes=fixed(train_sizes)
)


### Árvore de Regressão 

#### Curva de Validação-Cruzada 

In [None]:
grid_search_dt = {
    'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10],
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [2, 4, 6, 8],
    'max_features': ["auto", "sqrt", "log2"],
}

interact(plot_parameter_validation_curve,
         param_name = list(grid_search_dt.keys()), 
         grid_search = fixed(grid_search_dt),
         model = fixed(tree.DecisionTreeRegressor()),
         model_name = fixed('Árvore de Decisão'),
         scoring = regression_scores,
         logx = False
)

#### Curva de Aprendizado 

In [None]:
train_sizes = np.linspace(.1, 1.0, 10)
    

interact(plot_learning_curve,
         model = fixed(tree.DecisionTreeRegressor(max_depth=7)),
         model_name = fixed('Árvore de Decisão'),
         scoring = regression_scores,
         train_sizes=fixed(train_sizes)
)


## Automóveis 

### Leitura dos Dados 

In [None]:
df_auto = pandas.read_csv('../Data/dataset_auto.csv',sep=';')
df_auto['model'] = df_auto['brand'].astype(str) + df_auto['model'].astype(str)

auto_target_col = 'price'

drop_cols = ['brand','tax']
df_auto.drop(drop_cols, axis=1, inplace=True)
print(df_auto.shape)
df_auto.head()

### Variáveis Categóricas 

In [None]:
categorical_cols = ['transmission', 'fuelType', 'model']
encoder_map = {}

for cname in categorical_cols:
    encoder = preprocessing.OneHotEncoder(sparse=False)
    transformed = encoder.fit_transform(df_auto[[cname]])
    ohe_df = pandas.DataFrame(transformed, columns=[cname+'_'+cat for cat in encoder.categories_[0]])
    encoder_map[cname] = encoder
    
    df_auto = pandas.concat([df_auto, ohe_df], axis=1).drop(cname, axis=1)
df_auto.head()

### Treinamento dos Modelos 

In [None]:

Y = df_auto[auto_target_col]
X = df_auto.drop(auto_target_col, axis=1)
auto_feature_names = list(X.columns)

# train/test
xtrain, xtest, ytrain, ytest = model_selection.train_test_split(X, Y, test_size=0.2)

cvfold = model_selection.KFold(n_splits = 10, random_state = 0, shuffle=True)


### Regressão Linear 

#### Curva de Validação 

In [None]:
grid_search_el = {
    'alpha': [0.001, 0.01, 0.1, 1],
    'l1_ratio': np.linspace(0, 1, 5),
    'selection': ['cyclic', 'random'],
}

interact(plot_parameter_validation_curve,
         param_name = list(grid_search_el.keys()), 
         grid_search = fixed(grid_search_el),
         model = fixed(lm.ElasticNet()),
         model_name = fixed('ElasticNet'),
         scoring = regression_scores,
         logx = True
)

#### Curva de Aprendizado 

In [None]:
train_sizes = np.linspace(.1, 1.0, 10)
    

interact(plot_learning_curve,
         model = fixed(lm.ElasticNet(alpha=1e-3)),
         model_name = fixed('ElasticNet'),
         scoring = regression_scores,
         train_sizes=fixed(train_sizes)
)


### Árvore de Regressão 

#### Curva de Validação-Cruzada 

In [None]:
grid_search_dt = {
    'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10],
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [2, 4, 6, 8],
    'max_features': ["auto", "sqrt", "log2"],
}

interact(plot_parameter_validation_curve,
         param_name = list(grid_search_dt.keys()), 
         grid_search = fixed(grid_search_dt),
         model = fixed(tree.DecisionTreeRegressor()),
         model_name = fixed('Árvore de Decisão'),
         scoring = regression_scores,
         logx = False
)

#### Curva de Aprendizado 

In [None]:
train_sizes = np.linspace(.1, 1.0, 10)
    

interact(plot_learning_curve,
         model = fixed(tree.DecisionTreeRegressor(max_depth=9)),
         model_name = fixed('Árvore de Decisão'),
         scoring = regression_scores,
         train_sizes=fixed(train_sizes)
)
