# Best Kaggle Petfinder model
#### added description features, meaningful name, top rescuer ID

In [77]:
# import the library
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn :: utils
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.preprocessing import StandardScaler

# sklearn :: models
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# sklearn :: evaluation metrics
from sklearn.metrics import cohen_kappa_score

sns.set_style('whitegrid')

## Feature Engineering and Train/Test Split

In [203]:
# load datasets
df_train = pd.read_csv('data/raw/train.csv')
df_test = pd.read_csv('data/raw/test.csv')
print(df_train.shape,'\n',df_test.shape)

# stack test and train on top of eachother
df_all = pd.concat(objs=[df_train, df_test], axis=0)

# separate names with fewer than 3 characters (tend to be meaningless)
df_all['Name'] = df_all['Name'].astype(str).apply(lambda x:1 if len(str(x)) > 3 else 0)

# text feature extraction
df_all['Description'].fillna('', inplace=True)
df_all['desc_length'] = df_all['Description'].apply(lambda x: len(x))
df_all['desc_words'] = df_all['Description'].apply(lambda x: len(x.split()))
df_all['avg_word_length'] = (df_all['desc_length'] / df_all['desc_words']).round(1)
df_all['avg_word_length'].fillna(0, inplace=True)

# Loop to transform the categorical columns to numerical
for col in ['State','Type']:
    df_dummies = pd.get_dummies(df_all[col], prefix=col)
    #df_dummies.columns = [str(col)+'_'+str(c) for c in df_dummies.columns]
    df_all = pd.concat([df_all, df_dummies], axis=1)
    # Remove the original columns
    del df_all[col]

# get dummies for only top rescuers
top_rescuer = list(df_all['RescuerID'].value_counts()[:50].index)
df_dummies = pd.get_dummies(df_all['RescuerID'])[top_rescuer]
df_all = pd.concat([df_all, df_dummies], axis=1)

# split train and test again
df_train = df_all[:10000]
df_test = df_all[10000:]

# select the columns
X_columns = list(df_train.drop(['PetID', 'RescuerID',
                                'Description', 'AdoptionSpeed', 'VideoAmt'], axis=1).columns)
y_column = ['AdoptionSpeed']

# split the data using sklearn
X = df_train[X_columns]
y = df_train[y_column]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=True)

print('X_train', X_train.shape)
print('y_train', y_train.shape)
print('X_test', X_test.shape)
print('y_test', y_test.shape)

(10000, 24) 
 (4993, 23)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  after removing the cwd from sys.path.


X_train (7500, 86)
y_train (7500, 1)
X_test (2500, 86)
y_test (2500, 1)


## Run the model

In [204]:
# train a Random Forest Classifier
#rfr = RandomForestClassifier(n_estimators=10)
rfr = RandomForestClassifier(n_estimators=100)
rfr.fit(X_train, y_train.values.ravel())
y_pred = rfr.predict(X_test)

kappa = cohen_kappa_score(y_test, y_pred, weights ='quadratic')
print('kappa', round(kappa, 4))
print(confusion_matrix(y_test, y_pred))

# get feature importances
feature_importances = pd.DataFrame(rfr.feature_importances_, 
                                   index = X_train.columns, 
                                   columns=['importance']).sort_values('importance', ascending=False)
print(feature_importances)

# cross valildation
k = 10
results = []
kf = KFold(n_splits=k)
for train_index, test_index in kf.split(X):
    X_train, X_test = X.values[train_index], X.values[test_index]
    y_train, y_test = y.values[train_index], y.values[test_index]
    rfr.fit(X_train, y_train.ravel())
    y_pred = rfr.predict(X_test)
    kappa = cohen_kappa_score(y_test, y_pred, weights ='quadratic')
    results.append(round(kappa, 4))

print('Kappa for each fold:', results)
print('AVG(kappa)', round(np.mean(results), 4))
print('STD(kappa)', round(np.std(results), 4))

kappa 0.3677
[[  2  28  18   2  24]
 [  1 203 144  53 122]
 [  0 122 263 103 167]
 [  0  67 156 154 173]
 [  0  64  95  55 484]]
                                  importance
desc_length                         0.100124
desc_words                          0.096493
avg_word_length                     0.084691
Age                                 0.080767
PhotoAmt                            0.076440
Color2                              0.049964
Breed1                              0.049852
Color1                              0.047677
Breed2                              0.032034
Gender                              0.031264
FurLength                           0.029860
MaturitySize                        0.027755
Quantity                            0.027433
Color3                              0.026361
Fee                                 0.024937
Sterilized                          0.024223
Dewormed                            0.024193
Vaccinated                          0.022899
Name            

In [205]:
# make predictions using the model
df_prediction = df_test[X_columns]
df_test['AdoptionSpeed'] = rfr.predict(df_prediction)
df_test[['PetID', 'AdoptionSpeed']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,PetID,AdoptionSpeed
0,f42161740,2.0
1,0118db3a8,4.0
2,e5164d828,2.0
3,5335bfb38,4.0
4,ff2cf88a0,4.0
5,1d13441b9,3.0
6,7d835cf7c,1.0
7,577d15fea,2.0
8,91736f444,4.0
9,db194aec8,1.0


In [206]:
# change datatype to int
df_test['AdoptionSpeed'] = df_test['AdoptionSpeed'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [207]:
df_test[['PetID', 'AdoptionSpeed']].to_csv('data/processed/rfr_6.csv', index=False)