# Estimando a popularidade de raças de cachorros

O site do [American Kennel Club](https://www.akc.org/) lista 277 raças, com diversas características diferentes. O conjunto de dados contendo todas essas informações pode ser acessado [aqui](https://tmfilho.github.io/akcdata/). Uma das variáveis interessantes é a popularidade da raça entre os americanos. Nesta aplicação, iremos predizer a popularidade das raças que não têm essa informação. Vamos começar importando a biblioteca pandas para ler os dados.

In [1]:
import pandas as pd

In [33]:
# Load the AKC breed table; the first CSV column is used as the row index.
data = pd.read_csv(
    'akc-data-latest-no-text.csv',
    index_col=0,
)
data

Unnamed: 0,popularity,min_height,max_height,min_weight,max_weight,min_expectancy,max_expectancy,grooming_frequency_value,shedding_value,energy_level_value,trainability_value,demeanor_value
Affenpinscher,148,22.86,29.21,3.175147,4.535924,12.0,15.0,0.6,0.6,0.6,0.8,1.0
Afghan Hound,113,63.50,68.58,22.679619,27.215542,12.0,15.0,0.8,0.2,0.8,0.2,0.2
Airedale Terrier,60,58.42,58.42,22.679619,31.751466,11.0,14.0,0.6,0.4,0.6,1.0,0.8
Akita,47,60.96,71.12,31.751466,58.967008,10.0,13.0,0.8,0.6,0.8,1.0,0.6
Alaskan Malamute,58,58.42,63.50,34.019428,38.555351,10.0,14.0,0.6,0.6,0.8,0.4,0.8
...,...,...,...,...,...,...,...,...,...,...,...,...
Wirehaired Vizsla,167,54.61,63.50,20.411657,29.483504,12.0,14.0,0.2,0.6,0.8,0.6,0.6
Working Kelpie,,48.26,63.50,12.700586,27.215542,12.0,15.0,0.2,0.6,0.8,0.4,0.6
Xoloitzcuintli,140,25.40,58.42,4.535924,24.947580,13.0,18.0,0.2,0.2,0.8,0.6,0.6
Yakutian Laika,,53.34,58.42,18.143695,24.947580,10.0,12.0,0.4,0.6,0.8,0.2,0.4


## Separando as raças que não têm popularidade

In [34]:
# Rows with no recorded popularity are the ones to predict later.
# The (all-NaN) target column is removed first, then any row that still has a
# missing feature value is discarded.
final_test = (
    data.loc[data['popularity'].isna()]
    .drop(columns='popularity')
    .dropna()
)

final_test

Unnamed: 0,min_height,max_height,min_weight,max_weight,min_expectancy,max_expectancy,grooming_frequency_value,shedding_value,energy_level_value,trainability_value,demeanor_value
American Bulldog,50.8,63.5,27.215542,45.359237,10.0,12.0,0.2,0.6,0.8,0.6,0.6
American Leopard Hound,53.34,68.58,20.411657,31.751466,12.0,15.0,0.4,0.6,0.8,1.0,0.6
Appenzeller Sennenhund,48.26,55.88,21.772434,31.751466,12.0,15.0,0.4,0.8,1.0,0.2,0.4
Australian Stumpy Tail Cattle Dog,43.18,50.8,14.514956,20.411657,12.0,15.0,0.4,0.4,0.8,0.8,0.6
Azawakh,59.69,73.66,14.968548,24.94758,12.0,15.0,0.4,0.4,0.6,0.4,0.8
Barbet,48.26,62.23,15.875733,29.483504,12.0,14.0,0.4,0.2,0.6,0.6,0.8
Basset Fauve de Bretagne,31.75,39.37,12.246994,15.875733,12.0,12.0,0.4,0.6,0.6,0.2,1.0
Bavarian Mountain Scent Hound,43.18,52.07,16.782918,29.937096,12.0,15.0,0.4,0.8,0.6,0.6,0.4
Belgian Laekenois,55.88,66.04,24.94758,29.483504,10.0,12.0,0.4,0.8,0.8,0.8,0.4
Bohemian Shepherd,49.022,55.118,16.782918,27.215542,12.0,15.0,0.2,0.4,0.6,0.8,0.6


In [35]:
# (rows, columns) of the prediction set: rows without popularity, features only.
final_test.shape

(50, 11)

## Ficando com as que têm popularidade

In [36]:
# Keep only rows with no missing value in any column (target included);
# these are the rows used to fit and evaluate the models.
fit_data = data.dropna()
fit_data

Unnamed: 0,popularity,min_height,max_height,min_weight,max_weight,min_expectancy,max_expectancy,grooming_frequency_value,shedding_value,energy_level_value,trainability_value,demeanor_value
Affenpinscher,148,22.86,29.21,3.175147,4.535924,12.0,15.0,0.6,0.6,0.6,0.8,1.0
Afghan Hound,113,63.50,68.58,22.679619,27.215542,12.0,15.0,0.8,0.2,0.8,0.2,0.2
Airedale Terrier,60,58.42,58.42,22.679619,31.751466,11.0,14.0,0.6,0.4,0.6,1.0,0.8
Akita,47,60.96,71.12,31.751466,58.967008,10.0,13.0,0.8,0.6,0.8,1.0,0.6
Alaskan Malamute,58,58.42,63.50,34.019428,38.555351,10.0,14.0,0.6,0.6,0.8,0.4,0.8
...,...,...,...,...,...,...,...,...,...,...,...,...
Wire Fox Terrier,101,38.10,38.10,6.803886,8.164663,12.0,15.0,0.4,0.2,0.6,0.6,0.6
Wirehaired Pointing Griffon,65,50.80,60.96,15.875733,31.751466,12.0,15.0,0.4,0.6,1.0,0.6,0.6
Wirehaired Vizsla,167,54.61,63.50,20.411657,29.483504,12.0,14.0,0.2,0.6,0.8,0.6,0.6
Xoloitzcuintli,140,25.40,58.42,4.535924,24.947580,13.0,18.0,0.2,0.2,0.8,0.6,0.6


## Tratando a popularidade

In [43]:
# Make the target numeric (float) so it can be rescaled arithmetically.
fit_data = fit_data.assign(popularity=fit_data['popularity'].astype(float))

In [47]:
# Preview: min-max scaling of popularity to the [0, 1] interval.
fit_data['popularity'].pipe(
    lambda s: (s - s.min()) / (s.max() - s.min())
)

Affenpinscher                  0.769634
Afghan Hound                   0.586387
Airedale Terrier               0.308901
Akita                          0.240838
Alaskan Malamute               0.298429
                                 ...   
Wire Fox Terrier               0.523560
Wirehaired Pointing Griffon    0.335079
Wirehaired Vizsla              0.869110
Xoloitzcuintli                 0.727749
Yorkshire Terrier              0.047120
Name: popularity, Length: 186, dtype: float64

In [48]:
# Preview: same min-max scaling, inverted so the smallest raw value maps to 1
# and the largest to 0.
fit_data['popularity'].pipe(
    lambda s: 1 - (s - s.min()) / (s.max() - s.min())
)

Affenpinscher                  0.230366
Afghan Hound                   0.413613
Airedale Terrier               0.691099
Akita                          0.759162
Alaskan Malamute               0.701571
                                 ...   
Wire Fox Terrier               0.476440
Wirehaired Pointing Griffon    0.664921
Wirehaired Vizsla              0.130890
Xoloitzcuintli                 0.272251
Yorkshire Terrier              0.952880
Name: popularity, Length: 186, dtype: float64

In [49]:
# Replace the target with its inverted min-max scaling: the smallest raw
# popularity value becomes 1.0 and the largest becomes 0.0
# (presumably the raw value is a rank where 1 is the most popular breed --
# confirm against the dataset description).
#
# The raw values are re-read from `data` instead of from `fit_data` so this
# cell is idempotent: the original version normalised `fit_data['popularity']`
# in place, so executing the cell a second time flipped the scale back.
raw_popularity = data.loc[fit_data.index, 'popularity'].astype(float)
fit_data['popularity'] = 1 - (
    raw_popularity - raw_popularity.min()
) / (
    raw_popularity.max() - raw_popularity.min()
)

## Checando as correlações

In [51]:
# Pairwise Spearman (rank) correlation between all columns, target included.
fit_data.corr(method='spearman')

Unnamed: 0,popularity,min_height,max_height,min_weight,max_weight,min_expectancy,max_expectancy,grooming_frequency_value,shedding_value,energy_level_value,trainability_value,demeanor_value
popularity,1.0,-0.034997,-0.057059,-0.04235,0.002815,-0.159458,-0.084035,0.183683,0.118087,-0.063101,0.256293,0.258926
min_height,-0.034997,1.0,0.962341,0.898838,0.891899,-0.540044,-0.581008,-0.181808,0.299643,0.272699,-0.051638,-0.230024
max_height,-0.057059,0.962341,1.0,0.857562,0.889531,-0.51717,-0.565272,-0.219259,0.314628,0.313358,-0.025432,-0.252771
min_weight,-0.04235,0.898838,0.857562,1.0,0.954402,-0.572098,-0.644762,-0.105255,0.407667,0.186359,-0.103066,-0.234549
max_weight,0.002815,0.891899,0.889531,0.954402,1.0,-0.566763,-0.622508,-0.096181,0.395661,0.207754,-0.036281,-0.232278
min_expectancy,-0.159458,-0.540044,-0.51717,-0.572098,-0.566763,1.0,0.718013,0.009273,-0.263965,-0.005199,0.079176,0.130626
max_expectancy,-0.084035,-0.581008,-0.565272,-0.644762,-0.622508,0.718013,1.0,0.0908,-0.410576,-0.110139,0.112297,0.142305
grooming_frequency_value,0.183683,-0.181808,-0.219259,-0.105255,-0.096181,0.009273,0.0908,1.0,-0.011768,-0.173028,0.106115,0.10131
shedding_value,0.118087,0.299643,0.314628,0.407667,0.395661,-0.263965,-0.410576,-0.011768,1.0,0.212093,0.08373,-0.082835
energy_level_value,-0.063101,0.272699,0.313358,0.186359,0.207754,-0.005199,-0.110139,-0.173028,0.212093,1.0,0.141079,0.033357


## Normalizando os dados

In [72]:
from sklearn.preprocessing import StandardScaler

# Standardise the feature columns (everything except the target).
# `ss` keeps the fitted statistics so the same transform can be reused later.
feature_frame = fit_data.drop(columns='popularity')
ss = StandardScaler()
ss.fit(feature_frame)
fit_data_Z = ss.transform(feature_frame)
fit_data_Z

array([[-1.35302301, -1.27970166, -1.12998304, ..., -0.73120002,
         0.72154105,  1.80427232],
       [ 1.34260643,  1.05166004,  0.44478684, ...,  0.5280889 ,
        -1.79483337, -2.21482531],
       [ 1.00565275,  0.45001831,  0.44478684, ..., -0.73120002,
         1.56033253,  0.79949792],
       ...,
       [ 0.75293749,  0.75083917,  0.26167407, ...,  0.5280889 ,
        -0.11725042, -0.20527649],
       [-1.18454617,  0.45001831, -1.02011538, ...,  0.5280889 ,
        -0.11725042, -0.20527649],
       [-1.68997669, -1.80613818, -1.12998304, ..., -0.73120002,
        -1.79483337,  0.79949792]])

In [73]:
# Design matrix (standardised features) and target vector as NumPy arrays.
X = fit_data_Z
y = fit_data['popularity'].to_numpy()

## Treinando um modelo

Nós poderíamos aqui simplesmente ajustar uma regressão linear e predizer os valores para o conjunto de raças que não têm popularidade registrada:

In [84]:
from sklearn.linear_model import LinearRegression

# Baseline: ordinary least squares fitted on every labelled row.
lr = LinearRegression()
lr.fit(X, y)
# R^2 measured on the same data used for fitting (not a held-out estimate).
lr.score(X, y)

0.19146691886220124

In [85]:
# Apply the scaler fitted on the labelled rows to the unlabelled ones, then
# predict their popularity with the baseline linear model.
final_test_Z = ss.transform(final_test)

predictions = lr.predict(final_test_Z)

# One row per unlabelled breed: its name and the predicted popularity.
pd.DataFrame({
    'raça': final_test.index,
    'popularidade': predictions,
})

Unnamed: 0,raça,popularidade
0,American Bulldog,0.484022
1,American Leopard Hound,0.495058
2,Appenzeller Sennenhund,0.360731
3,Australian Stumpy Tail Cattle Dog,0.457716
4,Azawakh,0.38012
5,Barbet,0.422122
6,Basset Fauve de Bretagne,0.487607
7,Bavarian Mountain Scent Hound,0.499456
8,Belgian Laekenois,0.45149
9,Bohemian Shepherd,0.465707


## Escolhendo o melhor modelo

Mas como saber se o modelo que ajustamos acima é o melhor que podemos fazer? O correto é realizar um experimento, como vimos na aula sobre avaliação de modelos. Aí podemos escolher o melhor método para realizar nossa tarefa.

In [87]:
from tqdm import tqdm
from sklearn.model_selection import KFold
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_absolute_error

In [75]:
# Candidate regressors, keyed by a short label. The values are the estimator
# classes (not instances) so a fresh model can be created for every fold.
models = dict(
    lr=LinearRegression,
    svr=SVR,
    dt=DecisionTreeRegressor,
    rf=RandomForestRegressor,
    ada=AdaBoostRegressor,
)

In [89]:
# Repeated k-fold cross-validation: `mc` repetitions of a shuffled `folds`-fold
# split, recording the mean absolute error of every candidate model on every
# held-out fold.
results = []
mc = 10
folds = 5


def _make_model(model_cls, seed):
    """Instantiate ``model_cls`` with defaults, seeding it when possible.

    Estimators that expose a ``random_state`` parameter get ``seed`` so the
    whole experiment is reproducible; the others are created unchanged.
    """
    model = model_cls()
    if 'random_state' in model.get_params():
        model.set_params(random_state=seed)
    return model


# NOTE(review): X was standardised with statistics computed on all rows before
# this split, so each test fold has influenced the scaling of its training
# fold. The effect is usually small, but fitting the scaler inside each fold
# would give a cleaner estimate.
for mc_iteration in tqdm(range(mc)):
    # A different shuffle per repetition, reproducible through the seed.
    kf = KFold(n_splits=folds, shuffle=True, random_state=mc_iteration)
    for fold, (train_index, test_index) in enumerate(kf.split(X, y)):
        X_train, y_train = X[train_index], y[train_index]
        X_test, y_test = X[test_index], y[test_index]
        for model, model_cls in models.items():
            m = _make_model(model_cls, seed=mc_iteration).fit(X_train, y_train)
            pred = m.predict(X_test)
            err = mean_absolute_error(y_test, pred)
            results.append([mc_iteration, fold, model, err])

# Long-format results: one row per (repetition, fold, model).
res_df = pd.DataFrame(results, columns=['mc', 'fold', 'model', 'MAE'])
res_df

100%|██████████| 10/10 [00:08<00:00,  1.13it/s]


Unnamed: 0,mc,fold,model,MAE
0,0,0,lr,0.245096
1,0,0,svr,0.233300
2,0,0,dt,0.248691
3,0,0,rf,0.225050
4,0,0,ada,0.240927
...,...,...,...,...
245,9,4,lr,0.224716
246,9,4,svr,0.261215
247,9,4,dt,0.332814
248,9,4,rf,0.223269


### Agregando os resultados

Com os resultados agregados, podemos escolher o melhor método.

In [90]:
# Mean and standard deviation of the MAE per model, over all repetitions and folds.
res_df.pivot_table(
    index='model',
    values='MAE',
    aggfunc=['mean', 'std'],
)

Unnamed: 0_level_0,mean,std
Unnamed: 0_level_1,MAE,MAE
model,Unnamed: 1_level_2,Unnamed: 2_level_2
ada,0.234818,0.019608
dt,0.292549,0.038449
lr,0.244212,0.020202
rf,0.232816,0.020639
svr,0.243563,0.023419


Com o melhor modelo escolhido, podemos ajustá-lo com todos os dados.

In [91]:
# Refit the chosen method (random forest) on every labelled row.
# A fixed random_state makes the fitted forest, and therefore the final
# predictions, reproducible across runs.
final_model = RandomForestRegressor(random_state=0).fit(X, y)

Aí usamos nosso melhor método para predizer a popularidade das raças que não têm essa informação.

In [92]:
# Predict popularity for the unlabelled breeds with the final model and show
# the result as a (breed, predicted popularity) table.
final_predictions = final_model.predict(final_test_Z)
pd.DataFrame({
    'raça': final_test.index,
    'popularidade': final_predictions,
})

Unnamed: 0,raça,popularidade
0,American Bulldog,0.299424
1,American Leopard Hound,0.593887
2,Appenzeller Sennenhund,0.437513
3,Australian Stumpy Tail Cattle Dog,0.383822
4,Azawakh,0.38801
5,Barbet,0.588743
6,Basset Fauve de Bretagne,0.518325
7,Bavarian Mountain Scent Hound,0.354319
8,Belgian Laekenois,0.415126
9,Bohemian Shepherd,0.381466
