# Optical Character Recognition

## A data overview (1ST step)

In [None]:
import pandas as pd
# Load the two comma-separated MNIST CSV files into DataFrames.
# NOTE(review): presumably `all_data` is the labelled evaluation set and
# `test_data` a separate test set -- confirm against the data description.
all_data = pd.read_csv('mnist_avaliacao.csv', sep=',')
test_data = pd.read_csv('mnist_teste.csv', sep=',')

In [None]:
# overview types: count how many columns share each dtype
all_data.dtypes.value_counts()

In [None]:
# take a look at the dtypes of the last columns (tail() shows the final rows
# of the dtypes Series, i.e. the last columns of the DataFrame)
all_data.dtypes.tail()

In [None]:
# let's take a look at the 'class' balance: number of samples per label value.
# The label column is named 'y_class' because `class` is a reserved word in Python.
all_data.y_class.value_counts()

In [None]:
# take a look at the minimum value available: compute the minimum of every
# column except the last one, then count how many columns share each minimum
# NOTE(review): the excluded last column is presumably the 'y_class' label -- confirm
all_data.iloc[:, :-1].min().value_counts()

In [None]:
# take a look at the maximum value available: compute the maximum of every
# column except the last one, then count how many columns share each maximum
# NOTE(review): the excluded last column is presumably the 'y_class' label -- confirm
all_data.iloc[:, :-1].max().value_counts()

In [None]:
# let's overview the values: per-column summary statistics from DataFrame.describe()
all_data.describe()

In [None]:
# Rebuild one sample of the dataset as a matrix and draw it as an annotated heatmap.
import matplotlib.pyplot as plt
import seaborn as sns
from work_methods import matrix_recover

# sample to draw (the original note says: choose a number between 1 and 2050)
# and the side length handed to matrix_recover
to_recover, square_reference = 5, 28

# recover the matrix to plot from the loaded data
matrix = matrix_recover(
    all_data,
    to_recover,
    square_reference,
)

# large canvas so every annotated cell stays readable; values printed as integers
plt.figure(figsize=(28, 28))
sns.heatmap(
    matrix,
    annot=True,
    fmt="d",
    cmap="YlGnBu",
)
plt.show()

## Use the classifier methods without any input filter (2ND step)

### Global checkpoint

In [None]:
# python libraries
import optuna
import pandas as pd

# project-local modules
from resultado import Fold
from avaliacao import Experimento
from metodo import ScikitLearnAprendizadoDeMaquina

file_name = 'mnist_avaliacao.csv'

# define classes: map each integer label 0..9 to its display name '0'..'9'
numbers_values = list(range(10))
numbers_names = [str(value) for value in numbers_values]
numbers = dict(zip(numbers_values, numbers_names))

# recover DataFrame (read_csv already returns a DataFrame, so no extra wrapping)
df_data = pd.read_csv(file_name)

# generate the folds for the experiment, using 'y_class' as the class column
# NOTE(review): val_k / num_folds_validacao / num_repeticoes_validacao are
# presumably the outer fold count and the inner validation split settings --
# confirm in resultado.Fold.gerar_k_folds
folds = Fold.gerar_k_folds(
    df_data,
    val_k=5,
    col_classe='y_class',
    num_repeticoes=1,
    seed=1,
    num_folds_validacao=3,
    num_repeticoes_validacao=2,
)

### Random Forest Classifier 28x28

In [None]:
from sklearn.ensemble import RandomForestClassifier
from avaliacao import OtimizacaoObjetivoRandomForest

# Random forest with a fixed random_state, wrapped in the project's
# scikit-learn adapter class.
clf_rf = RandomForestClassifier(random_state=1)
ml_method = ScikitLearnAprendizadoDeMaquina(clf_rf)
# Build the experiment over the shared `folds` with num_trials=10 and a seeded
# Optuna TPE sampler (3 startup trials), then compute its results.
# NOTE(review): presumably one Optuna study is run per fold -- confirm in avaliacao.
exp = Experimento(folds,ml_method, OtimizacaoObjetivoRandomForest, num_trials=10,
                    sampler=optuna.samplers.TPESampler(seed=1, n_startup_trials=3))
exp.calcula_resultados()

In [None]:
# Show the trials of the study stored at index 0 of exp.studies_per_fold as a
# DataFrame, sorted by the "value" column in descending order.
studdy_fold_0 = exp.studies_per_fold[0]
studdy_fold_0.trials_dataframe().sort_values("value",ascending=False)

In [None]:
from work_methods import parameters_graph
# Pass the trials of the first stored study (index 0) to the project helper.
# NOTE(review): presumably it plots the sampled hyperparameters -- confirm in work_methods.
parameters_graph(exp.studies_per_fold[0].trials)

In [None]:
from work_methods import show_results
# Report the experiment under the given title, passing the label -> name mapping.
# NOTE(review): the exact output of show_results is defined in work_methods -- confirm there.
show_results("Random Forest 28x28", numbers, exp)

### Decision Tree Classifier 28x28

In [None]:
from sklearn.tree import DecisionTreeClassifier
from avaliacao import OtimizacaoObjetivoArvoreDecisao

# Decision tree with a fixed random_state, wrapped in the project's
# scikit-learn adapter class.
clf_dtree = DecisionTreeClassifier(random_state=1)
ml_method = ScikitLearnAprendizadoDeMaquina(clf_dtree)
# Build the experiment over the shared `folds` with num_trials=10 and a seeded
# Optuna TPE sampler (3 startup trials), then compute its results.
# Note: this rebinds `exp`, replacing the previous experiment object.
exp = Experimento(folds,ml_method, OtimizacaoObjetivoArvoreDecisao, num_trials=10,
                    sampler=optuna.samplers.TPESampler(seed=1, n_startup_trials=3))
exp.calcula_resultados()

In [None]:
# Show the trials of the study stored at index 0 of exp.studies_per_fold as a
# DataFrame, sorted by the "value" column in descending order.
studdy_fold_0 = exp.studies_per_fold[0]
studdy_fold_0.trials_dataframe().sort_values("value",ascending=False)

In [None]:
# Pass the trials of the first stored study (index 0) to the project helper.
# NOTE(review): presumably it plots the sampled hyperparameters -- confirm in work_methods.
parameters_graph(exp.studies_per_fold[0].trials)

In [None]:
# Report the current experiment under the given title, passing the label -> name mapping.
show_results("Decision Tree 28x28", numbers, exp)

## Use infogain as input filter (3RD step)

In [None]:
from ganho_informacao import ganho_informacao

# naming adjust: work on the full dataset under the name `database`
database = all_data

# One row per feature column: the column name and the value returned by
# ganho_informacao for that column against 'y_class'.
info_gain_database = pd.DataFrame(columns=['Atributo', 'Ganho de Informação'])
for column in database.columns:
    if column == 'y_class':
        # the class column itself is not scored
        continue
    info_gain_database.loc[len(info_gain_database)] = {
        'Atributo': column,
        'Ganho de Informação': ganho_informacao(database, 'y_class', column),
    }

print(info_gain_database)

In [None]:
from work_methods import info_gain_matrix_recover

# Side length passed to info_gain_matrix_recover. It is set here, under the
# name the call below actually reads, so this cell does not depend on a value
# left over from an earlier cell.
square_reference = 28

info_gain_matrix = info_gain_matrix_recover(info_gain_database, square_reference)

# heatmap of the recovered information-gain matrix, one annotated cell per entry
plt.figure(figsize=(29, 29))
sns.heatmap(info_gain_matrix, annot=True, cmap="YlGnBu")
plt.show()

A informação obtida pelo info_gain sugere que as primeiras 4 linhas, as primeiras 4 colunas, as últimas 4 linhas e as últimas 4 colunas não são tão expressivas na determinação do dígito. Isso acontece porque as amostras coletadas não fizeram bom uso do espaço disponível, tornando os pixels dos cantos irrelevantes. Também poderia ser feito um corte mais interessante arredondando as bordas da matriz, mas isso implicaria em uma matriz deformada; por isso, essa ideia não será considerada. O plano consiste em apenas eliminar as linhas e colunas sugeridas, o que resulta em uma redução de 384 atributos. Antes desse filtro, o total era de 784; com a eliminação, esse número cai para 400. É uma redução de aproximadamente 49% dos atributos, e a matriz continua quadrada.

In [None]:
# remove features
# remove 4 top rows
# remove 4 last rows
# remove 4 left columns
# remove 4 last columns
import numpy as np  # imported here because no earlier cell brings numpy into scope

square_reference = 28
start_rows = 4
start_columns = 4
last_rows = 24      # exclusive upper bound: rows 4..23 are kept
last_columns = 24   # exclusive upper bound: columns 4..23 are kept

# create filter: 1 marks a pixel to keep, 0 a pixel to drop
filtr = np.zeros((square_reference, square_reference))
filtr[start_rows:last_rows, start_columns:last_columns] = 1

# flatten row by row into a list of plain Python ints, one flag per pixel column
filtr_array = filtr.astype(int).ravel().tolist()

# extra flag for the column that follows the pixels: 2 is a distinct marker
# (neither 0 = drop nor 1 = keep)
filtr_array.append(2)

In [None]:
#handling file and filter csv
# Write a reduced copy of the dataset. For every input line, keep the fields
# whose flag in filtr_array is 1 (written followed by a comma) or 2 (written
# as is, which carries the line's trailing newline); fields flagged 0 are dropped.

# a complete output row has exactly one comma per kept (flag 1) field
expected_commas = filtr_array.count(1)

# both files are managed by `with`, so the output is closed even if a line fails
with open('mnist_avaliacao.csv', 'r') as file, \
        open('mnist_sample_selecaoFeature.csv', 'w') as new_file:
    for line_number, line in enumerate(file):
        items = line.split(',')
        if len(items) != len(filtr_array):
            # malformed line: fields and flags would be misaligned, so skip it
            # (an over-long line used to raise IndexError here)
            continue
        row_to_write = ''.join(
            item + ',' if flag == 1 else item
            for item, flag in zip(items, filtr_array)
            if flag in (1, 2)
        )
        # Data rows may only contain letters, digits, commas and newlines; the
        # first line (header) is exempt from this check.
        nan = line_number > 0 and any(
            not (char.isnumeric() or char.isalpha() or char in ',\n')
            for char in row_to_write
        )
        if row_to_write.count(',') == expected_commas and not nan:
            new_file.write(row_to_write)

In [None]:
# reload the reduced dataset written by the feature-selection step
database = pd.read_csv('mnist_sample_selecaoFeature.csv', sep=',')

# One row per feature column: the column name and the value returned by
# ganho_informacao for that column against 'y_class'.
info_gain_database = pd.DataFrame(columns=['Atributo', 'Ganho de Informação'])
for column in database.columns:
    if column == 'y_class':
        # the class column itself is not scored
        continue
    info_gain_database.loc[len(info_gain_database)] = {
        'Atributo': column,
        'Ganho de Informação': ganho_informacao(database, 'y_class', column),
    }

print(info_gain_database)

In [None]:
# Side length of the reduced grid: 28 minus the 4 rows/columns dropped on each
# side. It must be assigned to `square_reference`, the name the call below
# reads; otherwise the previous value (28) would be used for 400 features.
square_reference = 20

info_gain_matrix = info_gain_matrix_recover(info_gain_database, square_reference)

# heatmap of the recovered information-gain matrix, one annotated cell per entry
plt.figure(figsize=(20, 20))
sns.heatmap(info_gain_matrix, annot=True, cmap="YlGnBu")
plt.show()

### Global Checkpoint

In [None]:
file_name = 'mnist_sample_selecaoFeature.csv'

# recover DataFrame (read_csv already returns a DataFrame, so no extra wrapping)
df_data = pd.read_csv(file_name)

# generate the folds for the experiment, using 'y_class' as the class column
# NOTE(review): val_k / num_folds_validacao / num_repeticoes_validacao are
# presumably the outer fold count and the inner validation split settings --
# confirm in resultado.Fold.gerar_k_folds
folds = Fold.gerar_k_folds(
    df_data,
    val_k=5,
    col_classe='y_class',
    num_repeticoes=1,
    seed=1,
    num_folds_validacao=3,
    num_repeticoes_validacao=2,
)

### Random Forest Classifier 20x20

In [None]:
# Random forest with a fixed random_state, wrapped in the project's
# scikit-learn adapter class.
clf_rf = RandomForestClassifier(random_state=1)
ml_method = ScikitLearnAprendizadoDeMaquina(clf_rf)
# Build the experiment over the current `folds` with num_trials=10 and a seeded
# Optuna TPE sampler (3 startup trials), then compute its results.
# Note: this rebinds `exp`, replacing the previous experiment object.
exp = Experimento(folds,ml_method, OtimizacaoObjetivoRandomForest, num_trials=10,
                    sampler=optuna.samplers.TPESampler(seed=1, n_startup_trials=3))
exp.calcula_resultados()

In [None]:
# Show the trials of the study stored at index 0 of exp.studies_per_fold as a
# DataFrame, sorted by the "value" column in descending order.
studdy_fold_0 = exp.studies_per_fold[0]
studdy_fold_0.trials_dataframe().sort_values("value",ascending=False)

In [None]:
# Pass the trials of the first stored study (index 0) to the project helper.
# NOTE(review): presumably it plots the sampled hyperparameters -- confirm in work_methods.
parameters_graph(exp.studies_per_fold[0].trials)

In [None]:
# Report the current experiment under the given title, passing the label -> name mapping.
show_results("Random Forest 20x20", numbers, exp)

### Decision Tree Classifier 20x20

In [None]:
# Decision tree with a fixed random_state, wrapped in the project's
# scikit-learn adapter class.
clf_dtree = DecisionTreeClassifier(random_state=1)
ml_method = ScikitLearnAprendizadoDeMaquina(clf_dtree)
# Build the experiment over the current `folds` with num_trials=10 and a seeded
# Optuna TPE sampler (3 startup trials), then compute its results.
# Note: this rebinds `exp`, replacing the previous experiment object.
exp = Experimento(folds,ml_method, OtimizacaoObjetivoArvoreDecisao, num_trials=10,
                    sampler=optuna.samplers.TPESampler(seed=1, n_startup_trials=3))
exp.calcula_resultados()

In [None]:
# Show the trials of the study stored at index 0 of exp.studies_per_fold as a
# DataFrame, sorted by the "value" column in descending order.
studdy_fold_0 = exp.studies_per_fold[0]
studdy_fold_0.trials_dataframe().sort_values("value",ascending=False)

In [None]:
# Pass the trials of the first stored study (index 0) to the project helper.
# NOTE(review): presumably it plots the sampled hyperparameters -- confirm in work_methods.
parameters_graph(exp.studies_per_fold[0].trials)

In [None]:
# Report the current experiment under the given title, passing the label -> name mapping.
show_results("Decision Tree 20x20", numbers, exp)

## Transform Feature as input filter (4TH step)