# Caso 4

In [63]:
import numpy as np
import pandas as pd
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn import preprocessing
from sklearn import datasets
from IPython.display import display, HTML
from sklearn.neighbors import KNeighborsClassifier

In [64]:
def confusion_table(confusion_mtx):
    """Renders a nice confusion table with labels"""
    confusion_df = pd.DataFrame({'y_pred=0': np.append(confusion_mtx[:, 0], confusion_mtx.sum(axis=0)[0]),
                                 'y_pred=1': np.append(confusion_mtx[:, 1], confusion_mtx.sum(axis=0)[1]),
                                 'Total': np.append(confusion_mtx.sum(axis=1), ''),
                                 '': ['y=0', 'y=1', 'Total']}).set_index('')
    return confusion_df


def positive_observations(y):
    # What percentage of observations are positive?
    proportion_1 = ((y == 1).sum() / len(y))
    pct_1        = np.around(proportion_1*100, decimals=3)
    display(HTML('<p><h4>{}%</h4>of observations are positive</p>'.format(pct_1)))


# Classifier stats
# -------------------------------------------------

def prior_error_rate(confusion_matrix):
    """The prior probability that a result is positive"""
    return 1 - (np.sum(confusion_mtx[1, :]) / np.sum(confusion_mtx))

def total_error_rate(confusion_matrix):
    """Derive total error rate from confusion matrix"""
    return 1 - np.trace(confusion_mtx) / np.sum(confusion_mtx)

def true_positive_rate(confusion_mtx):
    """or sensitivity: the proportion of actual POSITIVES that are correctly identified as such"""
    return confusion_mtx[1, 1] / np.sum(confusion_mtx[1, :])

def false_negative_rate(confusion_mtx):
    """the proportion of actual POSITIVES that are incorrectly identified as negative"""
    return confusion_mtx[1, 0] / np.sum(confusion_mtx[1, :])

def false_positive_rate(confusion_mtx):
    """the proportion of actual NEGATIVES that are incorrectly identified as positives"""
    return confusion_mtx[0, 1] / np.sum(confusion_mtx[0, :])

def true_negative_rate(confusion_mtx):
    """or specificity: the proportion of actual NEGATIVES that are correctly identified as such"""
    return confusion_mtx[0, 0] / np.sum(confusion_mtx[0, :])

def positive_predictive_value(confusion_mtx):
    """or precision: the proportion of predicted positives that are correctly predicted"""
    return confusion_mtx[1, 1] / np.sum(confusion_mtx[:, 1])

def negative_predictive_value(confusion_mtx):
    """the proportion of predicted negatives that are correctly predicted"""
    return confusion_mtx[0, 0] / np.sum(confusion_mtx[:, 0])

def classifier_stats(confusion_mtx):
    return pd.Series({'prior_error_rate': prior_error_rate(confusion_mtx),
                      'total_error_rate': total_error_rate(confusion_mtx),
                      'true_positive_rate (sensitivity)': true_positive_rate(confusion_mtx),
                      'false_negative_rate': false_negative_rate(confusion_mtx),
                      'false_positive_rate': false_positive_rate(confusion_mtx),
                      'true_negative_rate (specificity)': true_negative_rate(confusion_mtx),
                      'positive_predictive_value (precision)': positive_predictive_value(confusion_mtx),
                      'negative_predictive_value': negative_predictive_value(confusion_mtx)})

In [74]:
auto_df = pd.read_csv('Auto.csv')

# Remove missing values
auto_df = auto_df.drop(auto_df[auto_df.values == '?'].index)
auto_df = auto_df.reset_index()

# Convert quantitive datatypes to numerics
datatypes = {'quant': ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'year', 'origin'],
             'qual': ['name']}

quants = auto_df[datatypes['quant']].astype(np.float_)
auto_df = pd.concat([quants, auto_df[datatypes['qual']]], axis=1) 


# Add mpg01 feature
mpg01   = (auto_df['mpg'] > auto_df['mpg'].median()*0.6).astype(np.float64)
auto_df = pd.concat([auto_df, mpg01.rename('mpg01')], axis=1).drop('mpg', axis=1)
auto_df.head()

display(auto_df.head())

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,year,origin,name,mpg01
0,8.0,307.0,130.0,3504.0,12.0,70.0,1.0,chevrolet chevelle malibu,1.0
1,8.0,350.0,165.0,3693.0,11.5,70.0,1.0,buick skylark 320,1.0
2,8.0,318.0,150.0,3436.0,11.0,70.0,1.0,plymouth satellite,1.0
3,8.0,304.0,150.0,3433.0,12.0,70.0,1.0,amc rebel sst,1.0
4,8.0,302.0,140.0,3449.0,10.5,70.0,1.0,ford torino,1.0


In [75]:

# Create index for training set
np.random.seed(1)
train_idx = np.random.rand(len(auto_df)) < 0.7
auto_df_train = auto_df[train_idx]
auto_df_test  = auto_df[~train_idx]

In [76]:
# Create index for holdout set
np.random.seed(1)
train = np.random.rand(len(auto_df)) < 0.7

predictors  = ['weight', 'cylinders', 'year', 'acceleration']
#predictors  = auto_df.columns.drop(['mpg01', 'name'])
X_train = np.array(auto_df[train][predictors])
y_train = np.array(auto_df[train]['mpg01'])
X_test  = np.array(auto_df[~train][predictors])
y_test  = np.array(auto_df[~train]['mpg01'])

# MODELS
# Logistic Regression
#model_logit = sm.Logit(y_train, X_train).fit() <--- this technique didn't converge
logit       = LogisticRegression()
model_logit = logit.fit(X_train, y_train)
# LDA
lda         = LinearDiscriminantAnalysis()
model_lda   = lda.fit(X_train, y_train)
# QDA
qda         = QuadraticDiscriminantAnalysis()
model_qda   = qda.fit(X_train, y_train)


models = {'logit': model_logit, 
          'lda': model_lda, 
          'qda': model_qda}
scaled = []


# PREDICT
for k in models:
    # Predict
    if k in scaled:
        y_pred = models[k].predict(preprocessing.scale(X_test))
    else:
        y_pred = models[k].predict(X_test)
    # Confusion table
    display(HTML('<h3>{}</h3>'.format(k)))
    confusion_mtx = confusion_matrix(y_test, y_pred)
    display(confusion_table(confusion_mtx))
    # Classifier stats
    display(classifier_stats(confusion_mtx))





Unnamed: 0,y_pred=0,y_pred=1,Total
,,,
y=0,7.0,2.0,9.0
y=1,0.0,111.0,111.0
Total,7.0,113.0,


prior_error_rate                         0.075000
total_error_rate                         0.016667
true_positive_rate (sensitivity)         1.000000
false_negative_rate                      0.000000
false_positive_rate                      0.222222
true_negative_rate (specificity)         0.777778
positive_predictive_value (precision)    0.982301
negative_predictive_value                1.000000
dtype: float64

Unnamed: 0,y_pred=0,y_pred=1,Total
,,,
y=0,7.0,2.0,9.0
y=1,3.0,108.0,111.0
Total,10.0,110.0,


prior_error_rate                         0.075000
total_error_rate                         0.041667
true_positive_rate (sensitivity)         0.972973
false_negative_rate                      0.027027
false_positive_rate                      0.222222
true_negative_rate (specificity)         0.777778
positive_predictive_value (precision)    0.981818
negative_predictive_value                0.700000
dtype: float64

Unnamed: 0,y_pred=0,y_pred=1,Total
,,,
y=0,5.0,4.0,9.0
y=1,5.0,106.0,111.0
Total,10.0,110.0,


prior_error_rate                         0.075000
total_error_rate                         0.075000
true_positive_rate (sensitivity)         0.954955
false_negative_rate                      0.045045
false_positive_rate                      0.444444
true_negative_rate (specificity)         0.555556
positive_predictive_value (precision)    0.963636
negative_predictive_value                0.500000
dtype: float64

In [68]:
# Create index for holdout set
np.random.seed(1)
train = np.random.rand(len(auto_df)) < 0.7

predictors  = ['weight', 'cylinders', 'year', 'acceleration']
#predictors  = auto_df.columns.drop(['mpg01', 'name'])
X_train = np.array(auto_df[train][predictors])
y_train = np.array(auto_df[train]['mpg01'])
X_test  = np.array(auto_df[~train][predictors])
y_test  = np.array(auto_df[~train]['mpg01'])


# PREDICT
for K in range(1, 21):
    # model
    model = KNeighborsClassifier(n_neighbors=K).fit(preprocessing.scale(X_train), y_train)
    # Predict
    y_pred = model.predict(preprocessing.scale(X_test))
    
    # Confusion table
    display(HTML('<h3>K={}</h3>'.format(K)))
    confusion_mtx = confusion_matrix(y_test, y_pred)
    display(confusion_table(confusion_mtx))
    
    # Classifier stats
    print('total_error_rate: \n' + str(classifier_stats(confusion_mtx)['total_error_rate']))

Unnamed: 0,y_pred=0,y_pred=1,Total
,,,
y=0,59.0,7.0,66.0
y=1,2.0,52.0,54.0
Total,61.0,59.0,


total_error_rate: 
0.07499999999999996


Unnamed: 0,y_pred=0,y_pred=1,Total
,,,
y=0,62.0,4.0,66.0
y=1,7.0,47.0,54.0
Total,69.0,51.0,


total_error_rate: 
0.09166666666666667


Unnamed: 0,y_pred=0,y_pred=1,Total
,,,
y=0,59.0,7.0,66.0
y=1,2.0,52.0,54.0
Total,61.0,59.0,


total_error_rate: 
0.07499999999999996


Unnamed: 0,y_pred=0,y_pred=1,Total
,,,
y=0,59.0,7.0,66.0
y=1,3.0,51.0,54.0
Total,62.0,58.0,


total_error_rate: 
0.08333333333333337


Unnamed: 0,y_pred=0,y_pred=1,Total
,,,
y=0,59.0,7.0,66.0
y=1,2.0,52.0,54.0
Total,61.0,59.0,


total_error_rate: 
0.07499999999999996


Unnamed: 0,y_pred=0,y_pred=1,Total
,,,
y=0,59.0,7.0,66.0
y=1,3.0,51.0,54.0
Total,62.0,58.0,


total_error_rate: 
0.08333333333333337


Unnamed: 0,y_pred=0,y_pred=1,Total
,,,
y=0,59.0,7.0,66.0
y=1,2.0,52.0,54.0
Total,61.0,59.0,


total_error_rate: 
0.07499999999999996


Unnamed: 0,y_pred=0,y_pred=1,Total
,,,
y=0,59.0,7.0,66.0
y=1,2.0,52.0,54.0
Total,61.0,59.0,


total_error_rate: 
0.07499999999999996


Unnamed: 0,y_pred=0,y_pred=1,Total
,,,
y=0,58.0,8.0,66.0
y=1,2.0,52.0,54.0
Total,60.0,60.0,


total_error_rate: 
0.08333333333333337


Unnamed: 0,y_pred=0,y_pred=1,Total
,,,
y=0,58.0,8.0,66.0
y=1,2.0,52.0,54.0
Total,60.0,60.0,


total_error_rate: 
0.08333333333333337


Unnamed: 0,y_pred=0,y_pred=1,Total
,,,
y=0,58.0,8.0,66.0
y=1,2.0,52.0,54.0
Total,60.0,60.0,


total_error_rate: 
0.08333333333333337


Unnamed: 0,y_pred=0,y_pred=1,Total
,,,
y=0,58.0,8.0,66.0
y=1,2.0,52.0,54.0
Total,60.0,60.0,


total_error_rate: 
0.08333333333333337


Unnamed: 0,y_pred=0,y_pred=1,Total
,,,
y=0,57.0,9.0,66.0
y=1,2.0,52.0,54.0
Total,59.0,61.0,


total_error_rate: 
0.09166666666666667


Unnamed: 0,y_pred=0,y_pred=1,Total
,,,
y=0,57.0,9.0,66.0
y=1,2.0,52.0,54.0
Total,59.0,61.0,


total_error_rate: 
0.09166666666666667


Unnamed: 0,y_pred=0,y_pred=1,Total
,,,
y=0,57.0,9.0,66.0
y=1,2.0,52.0,54.0
Total,59.0,61.0,


total_error_rate: 
0.09166666666666667


Unnamed: 0,y_pred=0,y_pred=1,Total
,,,
y=0,57.0,9.0,66.0
y=1,2.0,52.0,54.0
Total,59.0,61.0,


total_error_rate: 
0.09166666666666667


Unnamed: 0,y_pred=0,y_pred=1,Total
,,,
y=0,57.0,9.0,66.0
y=1,2.0,52.0,54.0
Total,59.0,61.0,


total_error_rate: 
0.09166666666666667


Unnamed: 0,y_pred=0,y_pred=1,Total
,,,
y=0,57.0,9.0,66.0
y=1,2.0,52.0,54.0
Total,59.0,61.0,


total_error_rate: 
0.09166666666666667


Unnamed: 0,y_pred=0,y_pred=1,Total
,,,
y=0,57.0,9.0,66.0
y=1,2.0,52.0,54.0
Total,59.0,61.0,


total_error_rate: 
0.09166666666666667


Unnamed: 0,y_pred=0,y_pred=1,Total
,,,
y=0,57.0,9.0,66.0
y=1,2.0,52.0,54.0
Total,59.0,61.0,


total_error_rate: 
0.09166666666666667


## CONCLUSIÓN

Al utilizar la base de datos "Auto" se realizo un analisis de el QDA LDA KNN utilizando unicamente el 60% de la media, obtuvimos que el top de modelos es:


1. LDA
total_error_rate                         0.041667
Este valor es relativamente bajo, lo que indica que el modelo LDA tiene una baja tasa de error global.
false_negative_rate                      0.027027
Esto indica la proporción de casos positivos reales que fueron incorrectamente clasificados como negativos por el modelo. En este caso, es relativamente baja.
positive_predictive_value (precision)    0.981818
Esto representa la proporción de predicciones positivas que fueron correctas. Es bastante alta, lo que indica que cuando el modelo predice positivo, suele ser correcto.

2. QDA
total_error_rate                         0.075000
En comparación con LDA, la tasa de error total en QDA es ligeramente más alta
false_negative_rate                      0.045045
La tasa de falsos negativos es ligeramente más alta en comparación con LDA, lo que significa que se clasificaron incorrectamente un poco más de casos positivos reales como negativos
positive_predictive_value (precision)    0.963636
La precisión sigue siendo bastante alta pero no es mayor a la tasa obtenid en el LDA, lo que indica que QDA es bueno para hacer predicciones positivas precisas.

3. KNN
K1 K3 K5  K7 K8
total_error_rate                          0.07499999999999996
KNN supera la tasa de error de LDA, sin embargo se puede suponer que tiene una tasa de error similar a la del QDA con una tasas de error total del 7,499% para K1, K3, K5, K7, K8

En resumen, LDA y QDA parecen tener una alta precisión, lo que indica que son buenos para realizar predicciones positivas correctas. Sin embargo, LDA tiene una tasa de falsos negativos ligeramente más baja en comparación con QDA, lo que significa que pierde menos casos positivos reales. KNN también se desempeña bien con una tasa de error total baja sin embargo su tasa de error sigue siendo mayor a la de LDA y QDA. Por ello concluimos que el mejor modelo para clasificar los datos es LDA donde las predicciones positivas correctas son el 98% y el menor error con el 4,166%, seguidamente el QDA donde las predicciones positivas corrretas son el 96% y la tasa de error de 7,5%, finalmente el KNN con una tasa de error del 7,499%, a pesar que la tasa de error es menor a la del QDA, en el QDA nos ofrece una tasa de preción alta y un bajo indice de falsos negativos, pero KNN no tenemos esa información detallada por eso esta de ultimas en el top.