# Reconocimiento de dígitos con KNN



Definir las rutas al ejecutable de Python 3.6 y a sus librerías,
de acuerdo al entorno virtual (virtual env) que estén usando.

In [None]:
!cd .. && mkdir build
!cd ../build/ && rm -rf *
!cd ../build && cmake \
  -DPYTHON_EXECUTABLE="$(which python)" \
  -DCMAKE_BUILD_TYPE=Release ..
!cd ../build && make install

# Verifico la correcta instalación. Si no falla el import está OK
!pwd
!python --version


In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import metnum as mt
import datetime
import os
from datetime import datetime
from sklearn.neighbors import KNeighborsClassifier
from numpy import savetxt
from sklearn.metrics import accuracy_score

#percentage over total of train cases
PERCENTAGE_OF_TRAIN_CASES = 0.8
#neighbors for finding the mode in KNN
N_NEIGHBORS = 100
#components for PCA
N_COMPONENTS=40
#number of iterations to find eigenvalues and eigenvectors in power iteration
N_ITERATIONS=5000
#epsilon for power iteration
EPSILON=1e-10
#directory for saving matrix files
timestamp = datetime.now().strftime("%m_%d_%H_%M_%S")
DIRECTORY_NAME="{}".format(timestamp)
os.makedirs(DIRECTORY_NAME+"/", exist_ok=True)

%load_ext autoreload
%autoreload 2

df_train = pd.read_csv("../data/train.csv")

TOTAL_TRAIN_CASES = int(PERCENTAGE_OF_TRAIN_CASES*len(df_train))

#shuffle the train cases.
df_train = df_train[0: TOTAL_TRAIN_CASES].sample(frac=1)

# Uso values para mandar todo a arrays de numpy
X = df_train[df_train.columns[1:]].values
y = df_train["label"].values.reshape(-1, 1)

limit = int(0.8 * X.shape[0]) 

X_train, y_train = X[:limit], y[:limit]
X_val, y_val = X[limit:], y[limit:]

assert len(X_train) == len(y_train)
assert len(X_val) == len(y_val)

print(f"Ahora tengo {len(X_train)} instancias de entrenamiento y {len(X_val)} de validación")

### Opcional: guardamos las matrices de entrenamiento en un archivo si queremos reproducir el experimento

In [None]:
# Write the raw train/validation split as comma-separated files inside DIRECTORY_NAME.
_raw_outputs = {
    '{}/X_train.csv': X_train,
    '{}/y_train.csv': y_train,
    '{}/X_val.csv': X_val,
    '{}/y_val.csv': y_val,
}
for _pattern, _matrix in _raw_outputs.items():
    savetxt(_pattern.format(DIRECTORY_NAME), _matrix, delimiter=',')

## Testing sin PCA (raw data)

### Entrenamos el modelo con los datos sin modificar y medimos su accuracy sobre el conjunto de validación

In [None]:
%%time

# Baseline: KNN fitted on the untransformed training rows, scored on the validation rows.
clf = mt.KNNClassifier(N_NEIGHBORS)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_val[:])
acc = accuracy_score(y_val, y_pred)

print("Accuracy: {}".format(acc))



### Ejemplo para probar la capacidad del modelo a mano.

In [None]:
# Position (within the validation split) of the example to inspect by hand.
random_example = 598

# Each row is a flattened image; view it as a 28x28 grid to plot it.
img = X_val[random_example].reshape(28, 28)
plt.imshow(img, cmap="Greys")

# y_val rows are 1-element arrays (the labels were reshaped to a column), and
# calling int() directly on an array with ndim > 0 is deprecated in recent NumPy.
# .item() extracts the single scalar explicitly and works for 0-d values too.
predicted_digit = int(np.asarray(y_pred[random_example]).item())
actual_digit = int(np.asarray(y_val[random_example]).item())

print("Prediction: {} - Digit: {}".format(predicted_digit, actual_digit))

## Testing con PCA

### Usamos PCA para transformar la data

In [None]:
%%time
# Fit PCA on the training rows only, then transform both splits with that same fit.
pca = mt.PCA(N_COMPONENTS, N_ITERATIONS, EPSILON)
pca.fit(X_train)

X_train_pca = pca.transform(X_train)
X_val_pca = pca.transform(X_val)


### Opcional: guardamos las matrices transformadas por PCA en un archivo si queremos reproducir el experimento

In [None]:
# Write the PCA-transformed train/validation matrices as comma-separated files inside DIRECTORY_NAME.
_pca_outputs = {
    '{}/X_train_pca.csv': X_train_pca,
    '{}/X_val_pca.csv': X_val_pca,
}
for _pattern, _matrix in _pca_outputs.items():
    savetxt(_pattern.format(DIRECTORY_NAME), _matrix, delimiter=',')

### Entrenamos el modelo con la data de PCA y testeamos accuracy

In [None]:
%%time
# Same KNN setup as the raw-data run, but fitted and scored on the PCA-transformed matrices.
clf = mt.KNNClassifier(N_NEIGHBORS)
clf.fit(X_train_pca, y_train)

y_pred = clf.predict(X_val_pca[:])
acc = accuracy_score(y_val, y_pred)

print("Accuracy: {}".format(acc))

# Testeo automático moviendo parámetros

In [None]:
import time
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import metnum as mt
import datetime
import os
from datetime import datetime
from sklearn.neighbors import KNeighborsClassifier
from numpy import savetxt
from sklearn.metrics import *
from IPython.display import display, HTML

def calculate_metrics(y_val, y_pred, time):
    """Score a set of predictions against the expected labels.

    Args:
        y_val: expected labels.
        y_pred: predicted labels.
        time: elapsed time of the run; returned unchanged as the last element.

    Returns:
        A tuple ``(accuracy, precision, recall, f1, kappa, time)``. Precision,
        recall and F1 are computed with ``average=None`` (one score per class
        rather than a single aggregate). ``kappa`` is not computed yet and is
        always the placeholder ``-1``.
    """
    acc = accuracy_score(y_val, y_pred)

    # Same call shape for the three per-class scorers, evaluated in this order.
    precision, recall, f1 = [
        scorer(y_val, y_pred, average=None)
        for scorer in (precision_score, recall_score, f1_score)
    ]

    # Placeholder until a kappa score is implemented.
    kappa = -1

    return acc, precision, recall, f1, kappa, time

def pca_experiment_generic(variables):
    """Run ``pca_experiment`` with its parameters given as name/value records.

    Args:
        variables: iterable of dicts, each with a ``"name"`` and a ``"value"``
            key. The recognised names are ``PERCENTAGE_OF_TRAIN_CASES``,
            ``N_NEIGHBORS``, ``N_COMPONENTS``, ``N_ITERATIONS`` and ``EPSILON``.
            When a name appears more than once the last occurrence wins;
            records with any other name are ignored.

    Returns:
        Whatever ``pca_experiment`` returns for those parameters.

    Raises:
        ValueError: if one or more of the five recognised parameters is not
            present in ``variables``. (Previously this surfaced as an opaque
            ``UnboundLocalError`` when the parameters were printed.)
    """
    # Listed in the positional order expected by pca_experiment.
    expected = ("PERCENTAGE_OF_TRAIN_CASES",
                "N_NEIGHBORS",
                "N_COMPONENTS",
                "N_ITERATIONS",
                "EPSILON")

    params = {}
    for variable in variables:
        name = variable["name"]
        value = variable["value"]
        if name in expected:
            # Later records override earlier ones with the same name.
            params[name] = value

    missing = [name for name in expected if name not in params]
    if missing:
        raise ValueError(
            "Missing experiment variable(s): {}".format(", ".join(missing)))

    # One "NAME : value" line per parameter, followed by a blank line.
    print("".join("{} : {}\n".format(name, params[name]) for name in expected))

    return pca_experiment(*[params[name] for name in expected])
            
    
    
    
def pca_experiment(PERCENTAGE_OF_TRAIN_CASES, 
                   N_NEIGHBORS, 
                   N_COMPONENTS, 
                   N_ITERATIONS, 
                   EPSILON,
                   random_state=None):
    """Run one PCA + KNN experiment on ``../data/train.csv`` and score it.

    The first ``PERCENTAGE_OF_TRAIN_CASES`` fraction of the file's rows is
    kept and shuffled; 80% of those rows fit the model and the remaining 20%
    are used for validation.

    Args:
        PERCENTAGE_OF_TRAIN_CASES: fraction of the CSV rows to use.
        N_NEIGHBORS: argument passed to ``mt.KNNClassifier``.
        N_COMPONENTS: first argument passed to ``mt.PCA``.
        N_ITERATIONS: second argument passed to ``mt.PCA``.
        EPSILON: third argument passed to ``mt.PCA``.
        random_state: optional seed forwarded to ``DataFrame.sample`` so the
            shuffle (and therefore the split) can be reproduced. The default
            ``None`` keeps the previous unseeded behaviour.

    Returns:
        The tuple produced by ``calculate_metrics``; its last element is the
        elapsed wall-clock time in seconds, which includes reading the CSV.
    """
    start = time.time()
    df_train = pd.read_csv("../data/train.csv")

    TOTAL_TRAIN_CASES = int(PERCENTAGE_OF_TRAIN_CASES*len(df_train))
    # Keep the leading rows, then shuffle them (seeded when random_state is given).
    df_train = df_train[0: TOTAL_TRAIN_CASES].sample(frac=1, random_state=random_state)

    # Convert to numpy arrays: every column after the first is a feature,
    # and the "label" column becomes a single-column matrix.
    X = df_train[df_train.columns[1:]].values
    y = df_train["label"].values.reshape(-1, 1)

    # 80/20 train/validation split of the shuffled rows.
    limit = int(0.8 * X.shape[0]) 

    X_train, y_train = X[:limit], y[:limit]
    X_val, y_val = X[limit:], y[limit:]
    
    # PCA is fitted on the training rows only and then applied to both splits.
    pca = mt.PCA(N_COMPONENTS, N_ITERATIONS, EPSILON)
    pca.fit(X_train)

    X_train_pca, X_val_pca = pca.transform(X_train), pca.transform(X_val)
    
    clf = mt.KNNClassifier(N_NEIGHBORS)

    clf.fit(X_train_pca, y_train)

    y_pred = clf.predict(X_val_pca[0:])
    
    return calculate_metrics(y_val, y_pred, time.time()-start)



def create_df(variable, experiment_results):
    """Build the results table for an experiment sweep.

    Args:
        variable: name of the swept variable; it becomes the first column.
        experiment_results: list of dicts, one per run, keyed by column name.

    Returns:
        A DataFrame with one row per record. The leading columns are always
        ``variable``, ``ACCURACY``, ``TIME``, then ``PRECISION_0..9``,
        ``RECALL_0..9`` and ``F1_0..9``; any other key found in the records is
        kept after them. Cells with no value in a record are NaN.
    """
    expected = [variable, 'ACCURACY', 'TIME']
    for prefix in ('PRECISION_', 'RECALL_', 'F1_'):
        expected.extend(prefix + str(i) for i in range(10))

    # Built in one shot from the records: DataFrame.append no longer exists in
    # pandas >= 2.0, and reindex fills the absent expected columns with NaN.
    records = pd.DataFrame(experiment_results)
    extras = [column for column in records.columns if column not in expected]

    return records.reindex(columns=expected + extras)


def experiment_generic_move_variable(variable_range, fixed_variables):
    """Sweep one experiment variable while the others stay fixed, and save the results.

    For every value in ``variable_range["range"]`` the experiment is run via
    ``pca_experiment_generic``. Two CSV files are written to the current
    directory, ``test_<NAME>-<time>.csv`` (one row per run) and
    ``test_<NAME>-<time>-metadata.csv`` (timestamp plus the fixed variables),
    and both tables are displayed.

    Args:
        variable_range: dict with ``"name"`` (the variable to sweep) and
            ``"range"`` (iterable of values to try).
        fixed_variables: list of ``{"name": ..., "value": ...}`` dicts for the
            variables that do not change. This list is not modified.
    """
    timestamp = datetime.now().strftime("%m-%d-%H-%M-%S")

    NAME_OF_VARIABLE = variable_range["name"]
    print("variable range: "+str(variable_range)+"\nFixed variables:"+str(fixed_variables))

    experiment_results = []

    for value in variable_range["range"]:
        # Build a fresh list per run. Appending to fixed_variables would mutate
        # the caller's list, grow it on every iteration and leak the swept
        # variable (as duplicated columns) into the metadata below.
        run_variables = list(fixed_variables)
        run_variables.append({"name": NAME_OF_VARIABLE, "value": value})

        (acc, precision, recall,
         f1, kappa, execution_time) = pca_experiment_generic(run_variables)

        data = {
            NAME_OF_VARIABLE : value,
            'ACCURACY' : acc,
            'TIME' : round(execution_time)
        }

        # One column per class and metric; this expects ten per-class scores
        # (presumably digits 0-9 all present in the validation split — verify).
        for i in range(10):
            data['PRECISION_'+str(i)] = precision[i]
            data['RECALL_'+str(i)] = recall[i]
            data['F1_'+str(i)] = f1[i]

        experiment_results.append(data)

    filename = 'test_'+NAME_OF_VARIABLE+'-{}'.format(time.time())

    df_result = create_df(NAME_OF_VARIABLE, experiment_results)
    df_result.to_csv(filename+'.csv', encoding='utf-8', index=False)

    # Single-row metadata table: the timestamp followed by each fixed variable.
    metadata = {'TIMESTAMP' : [timestamp]}
    for fixed_variable in fixed_variables:
        metadata[fixed_variable["name"]] = [fixed_variable["value"]]

    df_metadata = pd.DataFrame(metadata, columns=list(metadata))
    df_metadata.to_csv(filename+'-metadata.csv',encoding = 'utf-8', index=False)

    display(df_metadata)
    display(df_result)



def experiment_percentage_of_train_cases(PERCENTAGE_OF_TRAIN_CASES_RANGE, 
                                     N_NEIGHBORS, N_COMPONENTS, 
                                     N_ITERATIONS, EPSILON):
    """Sweep the fraction of training rows while the other parameters stay fixed.

    Runs ``pca_experiment`` once per value in
    ``PERCENTAGE_OF_TRAIN_CASES_RANGE``, writes
    ``test_PERCENTAGE_OF_TRAIN_CASES-<time>.csv`` (one row per run) and
    ``test_PERCENTAGE_OF_TRAIN_CASES-<time>-metadata.csv`` (the fixed
    parameters plus a timestamp) to the current directory, and displays both
    tables.

    Args:
        PERCENTAGE_OF_TRAIN_CASES_RANGE: iterable of fractions to try.
        N_NEIGHBORS: forwarded to ``pca_experiment``.
        N_COMPONENTS: forwarded to ``pca_experiment``.
        N_ITERATIONS: forwarded to ``pca_experiment``.
        EPSILON: forwarded to ``pca_experiment``.
    """
    timestamp = datetime.now().strftime("%m-%d-%H-%M-%S")
    
    experiment_results = []
    
    for PERCENTAGE_OF_TRAIN_CASES in PERCENTAGE_OF_TRAIN_CASES_RANGE:
        
        (acc, precision, recall, 
         f1, kappa, execution_time) =  pca_experiment(PERCENTAGE_OF_TRAIN_CASES, 
                                                      N_NEIGHBORS, N_COMPONENTS, 
                                                      N_ITERATIONS, EPSILON)
        
        data ={
            'PERCENTAGE_OF_TRAIN_CASES' : PERCENTAGE_OF_TRAIN_CASES,
            'ACCURACY' : acc,
            'TIME' : round(execution_time) 
        }
        
        # One column per class and metric; this expects ten per-class scores.
        for i in range(10):
            data['PRECISION_'+str(i)] = precision[i]
            data['RECALL_'+str(i)] = recall[i]
            data['F1_'+str(i)] = f1[i]
            
        experiment_results.append(data)
    
    # The previous format string had two placeholders but a single argument
    # (IndexError on every call) and a backslash-t that Python reads as a tab.
    # Use the same naming scheme as experiment_generic_move_variable.
    filename='test_PERCENTAGE_OF_TRAIN_CASES-{}'.format(time.time())
    
    df_result = create_df('PERCENTAGE_OF_TRAIN_CASES', experiment_results)  
    df_result.to_csv(filename+'.csv', encoding='utf-8', index=False)
    
    # Single-row table recording the parameters that were held fixed.
    metadata = {
        'N_NEIGHBORS': [N_NEIGHBORS],
        'N_COMPONENTS': [N_COMPONENTS],
        'N_ITERATIONS': [N_ITERATIONS],
        'EPSILON': [EPSILON],
        'TIMESTAMP' : [timestamp]
    } 
    df_metadata = pd.DataFrame(metadata, 
                               columns = ['N_NEIGHBORS', 'N_COMPONENTS',
                                          'N_ITERATIONS', 'EPSILON', 'TIMESTAMP'])
    df_metadata.to_csv(filename+'-metadata.csv',encoding = 'utf-8', index=False)
    
    display(df_metadata)
    display(df_result)

        
def main():
    """Run the parameter sweep currently selected for this notebook.

    The active sweep varies ``N_COMPONENTS`` over ``np.arange(5, 7, 1)`` while
    the remaining parameters keep the values defined below.
    """
    # Fraction of the rows of train.csv used in each run.
    PERCENTAGE_OF_TRAIN_CASES = 0.5
    # Number of neighbours handed to the KNN classifier.
    N_NEIGHBORS = 100
    # Number of components for PCA (only used by the alternative sweeps below).
    N_COMPONENTS = 40
    # Iteration cap for the power iteration that finds eigenvalues/eigenvectors.
    N_ITERATIONS = 5000
    # Epsilon for the power iteration.
    EPSILON = 1e-10

    # Alternative sweeps, kept for reference:
    ##experiment_percentage_of_train_cases(np.linspace(0.1,1,2), N_NEIGHBORS,
    ##                                 N_COMPONENTS, N_ITERATIONS, EPSILON)

    ##experiment_generic_move_variable({"range" : np.linspace(0.1,1,2),
    ##                                  "name" : "PERCENTAGE_OF_TRAIN_CASES"},
    ##                                [{"name":"N_NEIGHBORS","value":N_NEIGHBORS},
    ##                                 {"name":"N_COMPONENTS","value":N_COMPONENTS},
    ##                                 {"name":"N_ITERATIONS","value":N_ITERATIONS},
    ##                                 {"name":"EPSILON","value":EPSILON}])

    swept_variable = {"range": np.arange(5, 7, 1), "name": "N_COMPONENTS"}

    held_fixed = [
        {"name": label, "value": setting}
        for label, setting in (
            ("N_NEIGHBORS", N_NEIGHBORS),
            ("PERCENTAGE_OF_TRAIN_CASES", PERCENTAGE_OF_TRAIN_CASES),
            ("N_ITERATIONS", N_ITERATIONS),
            ("EPSILON", EPSILON),
        )
    ]

    experiment_generic_move_variable(swept_variable, held_fixed)


main()
