# Bibliotecas 

In [1]:
import pandas
import numpy as np
import sklearn.linear_model as lm
from sklearn import preprocessing, model_selection, metrics
import matplotlib.pyplot as plt

from ipywidgets import interact, interactive, fixed
import ipywidgets as widgets


# Vinhos 

In [2]:
df_wine = pandas.read_csv('../Data/dataset_vinhos.csv',sep=';')

drop_cols = ['target_label']
df_wine.drop(drop_cols, axis=1, inplace=True)

print(df_wine.shape)

target_col = 'target'

df_wine.head()

(5320, 13)


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,target,type
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,0,red
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,0,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,0,red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,1,red
4,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,0,red


## Variáveis Categóricas 

In [3]:
categorical_cols = ['type']
encoder_map = {}

for cname in categorical_cols:
    encoder = preprocessing.OneHotEncoder(sparse=False)
    transformed = encoder.fit_transform(df_wine[[cname]])
    ohe_df = pandas.DataFrame(transformed, columns=[cname+'_'+cat for cat in encoder.categories_[0]])
    encoder_map[cname] = encoder
    
    df_wine = pandas.concat([df_wine, ohe_df], axis=1).drop(cname, axis=1)
df_wine.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,target,type_red,type_white
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,0,1.0,0.0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,0,1.0,0.0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,0,1.0,0.0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,1,1.0,0.0
4,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,0,1.0,0.0


## Treinamento do modelo 

In [4]:
model_map = {
    'log_l1': lm.LogisticRegression(penalty='l1', solver='liblinear'),
    'log_l2': lm.LogisticRegression(penalty='l2', solver='liblinear'),
    'log_el': lm.LogisticRegression(penalty='elasticnet',l1_ratio= 0.4, solver='saga'),
}

# solver : 'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'

Y = df_wine[target_col]
X = df_wine.drop(target_col, axis=1)

# train/test
xtrain, xtest, ytrain, ytest = model_selection.train_test_split(X, Y, test_size=0.2)

df_train = xtrain.copy()
df_test = xtest.copy()
df_train['train_set'] = 1
df_test['train_set'] = 0
df_train[target_col] = ytrain
df_test[target_col] = ytest

for modelname, model in model_map.items():
    model.fit(xtrain, ytrain)      
    yhat_train = model.predict(xtrain)
    yhat_test  = model.predict(xtest)
    # Output table
    df_train[modelname] = yhat_train
    df_test[modelname] = yhat_test
    model_map[modelname] = model
    
df_wine = pandas.concat((df_train, df_test), axis=0)



## Resultados 

In [8]:
idx = df_wine['train_set'] == 0

print('log_l1\t', metrics.accuracy_score(df_wine[idx][target_col], df_wine[idx]['log_l1']>0.5))
print('log_l2\t', metrics.accuracy_score(df_wine[idx][target_col], df_wine[idx]['log_l2']>0.5))
print('log_el\t', metrics.accuracy_score(df_wine[idx][target_col], df_wine[idx]['log_el']>0.5))
print('Total Real Alta Qualidade:', df_wine[idx][target_col].sum())
df_wine[idx].groupby(target_col)[list(model_map.keys())].sum()


log_l1	 0.7443609022556391
log_l2	 0.7471804511278195
log_el	 0.6400375939849624
Total Real Alta Qualidade: 662


Unnamed: 0_level_0,log_l1,log_l2,log_el
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,158,157,320
1,548,550,599


## Coeficientes do Modelo

In [9]:
def plot_coefs(modelname, class_label):
    fig = plt.figure(figsize=(15,4))
    
    iclass = 0
    if class_label != '':
        idx  = target_label_map.target_label==class_label
        iclass = target_label_map.loc[idx, 'target'].values[0]
    
    coefs = model_map[modelname].coef_[iclass]
    coef_names = xtrain.columns

    idx = np.argsort(coefs)

    plt.bar(np.arange(coefs.shape[0]), coefs[idx])
    plt.xticks(np.arange(coefs.shape[0]), coef_names[idx], rotation='90')
    plt.title('Coeficientes do modelo '+ modelname + ' ' + class_label)
    plt.grid()
    plt.xlim((-0.5, coefs.shape[0]-0.5))
    plt.show()


interact(plot_coefs, modelname = list(model_map.keys()), class_label = fixed(''))

interactive(children=(Dropdown(description='modelname', options=('log_l1', 'log_l2', 'log_el'), value='log_l1'…

<function __main__.plot_coefs(modelname, class_label)>

# IRIS 

In [10]:
df_iris = pandas.read_csv('../Data/dataset_iris.csv',sep=';')

target_label_map = df_iris[['target', 'target_label']].drop_duplicates()

drop_cols = ['target_label']
df_iris.drop(drop_cols, axis=1, inplace=True)

print(df_iris.shape)

target_col = 'target'


df_iris.head()

(150, 5)


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


## Treinamento do Modelo 

In [11]:
model_map = {
    'log_l1': lm.LogisticRegression(penalty='l1', solver='liblinear'),
    'log_l2': lm.LogisticRegression(penalty='l2', solver='liblinear'),
    'log_el': lm.LogisticRegression(penalty='elasticnet',l1_ratio= 0.4, solver='saga'),
}

# solver : {'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'

Y = df_iris[target_col]
X = df_iris.drop(target_col, axis=1)

# train/test
xtrain, xtest, ytrain, ytest = model_selection.train_test_split(X, Y, test_size=0.4)

df_train = xtrain.copy()
df_test = xtest.copy()
df_train['train_set'] = 1
df_test['train_set'] = 0
df_train[target_col] = ytrain
df_test[target_col] = ytest

for modelname, model in model_map.items():
    model.fit(xtrain, ytrain)      
    yhat_train = model.predict(xtrain)
    yhat_test  = model.predict(xtest)
    # Output table
    df_train[modelname] = yhat_train
    df_test[modelname] = yhat_test
    model_map[modelname] = model
    
df_iris = pandas.concat((df_train, df_test), axis=0)



## Resultados 

In [12]:
idx = df_iris['train_set'] == 0
print('log_l1\t', metrics.accuracy_score(df_iris[idx][target_col], df_iris[idx]['log_l1']))
print('log_l2\t', metrics.accuracy_score(df_iris[idx][target_col], df_iris[idx]['log_l2']))
print('log_el\t', metrics.accuracy_score(df_iris[idx][target_col], df_iris[idx]['log_el']))

df_list = []
idx = df_iris['train_set'] == 0
for modelname in model_map.keys():
    cm = metrics.confusion_matrix(df_iris[idx][target_col], df_iris[idx][modelname])
    df = pandas.DataFrame(cm, columns = target_label_map.target_label)
    df = df.add_prefix(modelname)
    df_list.append(df)
df = pandas.concat(df_list, axis=1)
df.index = target_label_map.target_label
df.T

log_l1	 0.9333333333333333
log_l2	 0.9333333333333333
log_el	 0.95


target_label,setosa,versicolor,virginica
target_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
log_l1setosa,26,0,0
log_l1versicolor,0,16,1
log_l1virginica,0,3,14
log_l2setosa,26,0,0
log_l2versicolor,0,16,1
log_l2virginica,0,3,14
log_elsetosa,26,0,0
log_elversicolor,0,16,0
log_elvirginica,0,3,15


## Coeficientes do Modelo 

In [13]:

interact(plot_coefs, modelname = list(model_map.keys()),
         class_label = target_label_map.target_label)

interactive(children=(Dropdown(description='modelname', options=('log_l1', 'log_l2', 'log_el'), value='log_l1'…

<function __main__.plot_coefs(modelname, class_label)>