# Pontos faciais para reconhecimento

## Bibliotecas necessárias 

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import pathlib
import random

### Diretório do arquivo

In [16]:
data_dir = pathlib.Path(r'..\Data\features_rostos_estevam.csv')

In [18]:
test_dir = pathlib.Path(r'..\Data\turing_faces_distances.csv')

In [17]:
data = pd.read_csv(data_dir)

In [19]:
test_data = pd.read_csv(test_dir)

Vamos ver como é nosso arquivo

In [20]:
data.head()

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,21,22,23,24,25,26,27,28,29,Nome
0,0,0.260142,0.046183,1.0,0.951472,0.854369,0.519104,0.27754,0.280888,0.432679,...,0.179145,0.16271,-0.056749,0.255016,0.239297,0.375422,0.359321,0.654896,0.302772,Aaron_Peirsol
1,1,0.254773,0.058447,1.0,0.95797,0.832973,0.495182,0.283555,0.246511,0.3924,...,0.196968,0.177396,-0.057979,0.23974,0.219459,0.368476,0.328212,0.669961,0.280692,Aaron_Peirsol
2,2,0.253177,0.047306,1.0,0.944648,0.844557,0.52219,0.289724,0.287694,0.450007,...,0.184831,0.179927,-0.061107,0.250068,0.225604,0.363262,0.344334,0.626515,0.295239,Aaron_Peirsol
3,3,0.237653,0.043372,1.0,0.959052,0.829501,0.493206,0.255357,0.292196,0.351102,...,0.212722,0.21085,-0.059389,0.268917,0.256376,0.357886,0.388767,0.751863,0.367562,Aaron_Peirsol
4,4,0.252994,0.034398,1.0,0.986235,0.852079,0.482434,0.385506,0.315833,0.442231,...,0.134067,0.146044,-0.052632,0.258137,0.239913,0.365022,0.321053,0.646611,0.281889,Aaron_Sorkin


In [23]:
len(data.Nome.value_counts())

1680

Vemos que ele é desbalanceado em termos de pessoas

In [22]:
data.shape

(9164, 32)

Temos 9164 imagens de 1680 pessoas diferentes, com 30 características

In [24]:
data.columns

Index(['Unnamed: 0', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
       '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22',
       '23', '24', '25', '26', '27', '28', '29', 'Nome'],
      dtype='object')

In [25]:
data = data.drop('Unnamed: 0', axis=1)

In [26]:
data.isnull().values.any()

True

In [32]:
data = data.dropna()

(8238, 31)

## Abordagem

A ideia inicial aqui será criar novas features, cada nova feature será o módulo da subtração das distâncias

In [28]:
cols=['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
       '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22',
       '23', '24', '25', '26', '27', '28', '29', 'Same']
df = pd.DataFrame(columns=cols)

### Vamos pegar os nomes únicos 

Pegando os nomes únicos podemos transformá los em valores numéricos para usar de Label

In [30]:
names = pd.unique(data['Nome'])

In [34]:
encoded_data = data.copy() 
uniques = len(data.Nome.value_counts()) #numero de nomes únicos
encoded_data['Nome'].replace(names, np.arange(uniques), inplace=True) #Tokenizando os nomes

In [35]:
npdata = encoded_data.to_numpy()
npdata

array([[2.60142002e-01, 4.61829830e-02, 1.00000000e+00, ...,
        6.54895569e-01, 3.02771735e-01, 0.00000000e+00],
       [2.54773110e-01, 5.84466762e-02, 1.00000000e+00, ...,
        6.69961043e-01, 2.80692127e-01, 0.00000000e+00],
       [2.53176901e-01, 4.73057713e-02, 1.00000000e+00, ...,
        6.26514952e-01, 2.95238825e-01, 0.00000000e+00],
       ...,
       [2.47418602e-01, 6.45187672e-02, 1.00000000e+00, ...,
        6.85403163e-01, 2.98218311e-01, 1.66900000e+03],
       [2.49118779e-01, 6.23825972e-02, 1.00000000e+00, ...,
        7.25303734e-01, 2.88254858e-01, 1.66900000e+03],
       [2.40977548e-01, 3.46110289e-02, 1.00000000e+00, ...,
        6.43695094e-01, 2.79411246e-01, 1.66900000e+03]])

In [36]:
#Garantindo que não temos linhas com distâncias nulas, e deletando caso houverem

for i in range(len(npdata)):
    if np.count_nonzero(npdata[i]) < 5:
        npdata = np.delete(npdata, i, 0)
        print("Deleted row", i)

In [37]:
npdata.shape

(8238, 31)

## Gerando o dataset 

A ideia aqui é fazer a diferença das distâncias faciais entre pessoas, e temos como parâmetros o número de pessoas diferentes que usaremos para 

In [40]:
different = 11
equals = 38
shape = (npdata.shape[0]*(different+1), npdata.shape[1])
npdf = np.zeros(shape)
i = 0
for first_row in (range(0, npdata.shape[0])):
    numbers = np.array(random.sample(range(0,npdata.shape[0]),different))
    numbers=numbers[numbers!=first_row]
    equal = first_row+1
    try:
        while (npdata[first_row][-1] == npdata[equal][-1]) and (equal < first_row+equals):
            numbers = np.append(numbers, equal)
            equal+=1
        for second_row in numbers:
            npdf[i] = np.absolute(npdata[first_row] - npdata[second_row])
            npdf[i][30] = int(npdf[i][30] == 0) #1 se corresponder a uma mesma pessoa
            i += 1
    except IndexError:
        pass
print(i)

98856


In [41]:
npdf.shape

(98856, 31)

Garantir que não tem linhas nulas

In [42]:
lenght = len(npdf)
print(lenght)
i=0
while i < lenght:
    if np.count_nonzero(npdf[i]) < 20:
        npdf = np.delete(npdf, i, 0)
        print("Deleted row", i)
        lenght-=1
        i-=1
    i+=1

98856


In [43]:
npdf.shape

(98856, 31)

In [44]:
df = pd.DataFrame(npdf, columns=cols)
print(df.Same.sum()/npdf.shape[0]) #calcula a razão de distâncias entre mesmas pessoas e pessoas diferentes
                                   #o ideal é estar próximo de 50%

0.5251578052925467


In [45]:
df.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,Same
0,0.011618,0.006596,0.0,0.031828,0.030364,0.042268,0.00501,0.019889,0.046643,0.013459,...,0.066967,0.045196,0.022176,0.024266,0.02644,0.014271,0.030155,0.137545,0.007001,0.0
1,0.029422,0.019836,0.0,0.029929,0.001777,0.023209,0.007608,0.033968,0.028381,0.000312,...,0.016039,0.010404,0.000818,0.033833,0.014483,0.039057,0.008691,0.103715,0.073112,0.0
2,0.027715,0.003696,0.0,0.032814,0.008061,0.015054,0.060986,0.005203,0.107435,0.008372,...,0.029074,0.002087,0.003232,0.045191,0.039699,0.04176,0.016461,0.146019,0.092469,0.0
3,0.030032,0.012839,0.0,0.001186,0.032717,0.022119,0.00938,0.031991,0.08272,0.010181,...,0.075512,0.045129,0.009501,0.010834,0.013079,0.1299,0.047161,0.035075,0.09581,0.0
4,0.019439,0.001905,0.0,0.002812,0.048333,0.035985,0.067917,0.026699,0.068736,0.005116,...,0.011556,0.027337,0.005954,0.000767,0.00808,0.027257,0.028173,0.112936,0.081054,0.0
5,0.00955,0.005331,0.0,0.018889,0.052916,0.033659,0.011292,0.042594,0.032687,0.020805,...,0.054443,0.10792,0.022037,0.011406,0.010179,0.029,0.008177,0.080065,0.072329,0.0
6,0.01489,0.005718,0.0,0.026704,0.003878,0.024766,0.07305,0.043173,0.076385,0.000357,...,0.001069,0.027528,0.011568,0.000156,0.000643,0.047539,0.018716,0.014801,0.069034,0.0
7,0.036867,0.004528,0.0,0.009381,0.041931,0.011583,0.052277,0.030605,0.051092,0.008135,...,0.043212,0.039811,0.008708,0.04294,0.028265,0.001403,0.047856,0.128688,0.035186,0.0
8,0.00209,0.017809,0.0,0.019129,0.033702,0.00773,0.061032,0.017562,0.065457,0.006626,...,0.020322,0.039247,0.011038,0.03577,0.011425,0.002717,0.042745,0.166522,0.030239,0.0
9,0.029701,0.010335,0.0,0.050843,0.016142,0.051884,0.044677,0.055251,0.055342,0.017585,...,0.044068,0.040511,0.012645,0.000358,0.009511,0.076452,0.017252,0.006989,0.141375,0.0


In [None]:
df.to_csv('facial_differences.csv', index=False)

## Modelo 

Dividir em treino e teste

In [46]:
from sklearn.model_selection import train_test_split

train_y = df.Same
train_x = df.drop(['Same'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(train_x, train_y, test_size=0.25, random_state=42)

### Tentativa inicial com regressão logística 

In [47]:
from sklearn.linear_model import LogisticRegression

lrc = LogisticRegression().fit(X_train, y_train)
lrc.score(X_test, y_test)

0.7108116856842276

### Tentativa com Gradient Boosting 

In [50]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
stdX_train = scaler.transform(X_train)
stdX_test = scaler.transform(X_test)

In [28]:
from sklearn import ensemble
original_params = {'n_estimators': 1000, 'max_leaf_nodes': 4, 'max_depth': None, 'random_state': 2,
                   'min_samples_split': 5}
params=dict(original_params)
clf = ensemble.GradientBoostingClassifier(**params)
clf.fit(X_train, y_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=None,
                           max_features=None, max_leaf_nodes=4,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=5,
                           min_weight_fraction_leaf=0.0, n_estimators=1000,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=2, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [29]:
clf.score(X_test, y_test)

0.7529675072938732

In [None]:
from sklearn import ensemble
original_params = {'n_estimators': 1000, 'max_leaf_nodes': 4, 'max_depth': None, 'random_state': 2,
                   'min_samples_split': 5}
params=dict(original_params)
std_clf = ensemble.GradientBoostingClassifier(**params)
std_clf.fit(stdX_train, y_train)

In [None]:
std_clf.score(stdX_test, y_test)

### Salvando o modelo em um pickle 

<img src="https://i.ytimg.com/vi/tZp8sY06Qoc/maxresdefault.jpg" width=50%>

In [56]:
import pickle
#save the model to disk
filename = 'gradient_boosting_faces_clf.sav'
pickle.dump(clf, open(filename, 'wb'))

Se quiser carregar o modelo salvo, descomentar o código abaixo

#load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))

## Testando com o dataset do Turing 

In [44]:
test_data.columns

Index(['Unnamed: 0', 'nome', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21',
       '22', '23', '24', '25', '26', '27', '28', '29'],
      dtype='object')

In [45]:
test_data = test_data.drop('Unnamed: 0', axis=1)

In [46]:
test_data.nome.value_counts()

Edu       4
Azank     3
Mura      2
Cola      2
Enzo      2
Paulo     2
Ariel     2
Noel      2
Nelson    2
Fill      2
Luisa     2
Gui       2
Camila    2
Koji      1
Victor    1
Name: nome, dtype: int64

In [47]:
cols=['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
       '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22',
       '23', '24', '25', '26', '27', '28', '29', 'Same']
df2 = pd.DataFrame(columns=cols)

In [48]:
names2 = pd.unique(test_data['nome'])

In [49]:
encoded_data2 = test_data.copy()
encoded_data2['nome'].replace(names2, np.arange(15), inplace=True)

In [50]:
npdata2 = encoded_data2.to_numpy()
npdata2

array([[ 0.        ,  0.25428366,  0.03542707,  1.        ,  0.97753248,
         0.85326208,  0.52004198,  0.34309569,  0.30581136,  0.35610583,
         0.05084665,  0.07344516,  0.28253786,  0.13570871,  0.22040789,
         0.37890446,  0.40906189,  0.40369374,  0.27777036,  0.14169193,
         0.92340848,  0.49338408,  0.28143276,  0.28904331, -0.06497071,
         0.28385856,  0.25124176,  0.29312802,  0.29055787,  0.7035968 ,
         0.29812617],
       [ 1.        ,  0.2751719 ,  0.04408635,  1.        ,  0.90957254,
         0.74695803,  0.43989212,  0.21818622,  0.21160113,  0.39760714,
         0.03614196,  0.07830757,  0.19878075,  0.10240221,  0.25299369,
         0.43693777,  0.40197664,  0.39116553,  0.34987197,  0.17478994,
         0.87144679,  0.53934632,  0.16460125,  0.18674958, -0.06324842,
         0.3058034 ,  0.26858402,  0.38706405,  0.38706405,  0.71651187,
         0.27814753],
       [ 1.        ,  0.28399087,  0.04796744,  1.        ,  0.97422697,
       

In [52]:
different = 2
equals = 5
shape = (npdata2.shape[0]*(different+1), npdata2.shape[1])
npdf = np.zeros(shape)
i = 0
for first_row in (range(0, npdata2.shape[0])):
    numbers = np.array(random.sample(range(0,npdata2.shape[0]),different))
    numbers=numbers[numbers!=first_row]
    equal = first_row+1
    while (npdata2[first_row][-1] == npdata2[equal][-1]) and (equal < first_row+equals):
        numbers = np.append(numbers, equal)
        equal+=1
    for second_row in numbers:
        npdf[i] = np.absolute(npdata2[first_row] - npdata2[second_row])
        npdf[i][30] = int(npdf[i][30] == 0) #1 se corresponder a uma mesma pessoa
        i += 1
print(i)

IndexError: index 31 is out of bounds for axis 0 with size 31

In [None]:
shape = (npdata2.shape[0]*(different+1), npdata2.shape[1])
npdf = np.zeros(shape)
i = 0
for first_row in (range(0, npdata2.shape[0])):
    for second_row in range(first_row, npdata2.shape[0]):
        npdf[i] = np.absolute(npdata2[first_row] - npdata2[second_row])
        npdf[i][30] = int(npdf[i][30] == 0) #1 se corresponder a uma mesma pessoa
        i += 1
print(i)

In [53]:
df = pd.DataFrame(npdf, columns=cols)

In [54]:
test_y = df.Same
test_x = df.drop(['Same'], axis=1)

In [55]:
clf.score(test_x, test_y)

0.8709677419354839