In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot
import matplotlib.pyplot as plt


from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import model_selection
from imblearn.over_sampling import SMOTE
import category_encoders as ce

from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

import warnings
# Suppress every warning category for the remainder of the session.
# NOTE(review): a blanket 'ignore' also hides deprecation and data-quality
# warnings from pandas / scikit-learn / xgboost — consider narrowing it.
warnings.filterwarnings('ignore')

Using TensorFlow backend.


In [2]:
# Read the training CSV and discard the animal identifier column in one step.
train_data = pd.read_csv('train.csv', header=0).drop(columns=['animal_id_outcome'])

# Show (rows, columns) of the training frame.
train_data.shape

(47803, 36)

In [3]:
# Read the test CSV and add a placeholder 'outcome_type' column (constant
# 'Missing') so the frame carries a label column like the training frame.
test_data = pd.read_csv('test.csv', header=0).assign(outcome_type='Missing')

# Pull the identifier column out of the frame; it is kept aside so it can be
# attached to the predictions later on.
animal_id = test_data.pop('animal_id_outcome')

# Show (rows, columns) of the test frame.
test_data.shape

(31869, 36)

In [4]:
# Stack the training rows on top of the test rows so that every preprocessing
# step below is applied to both sets at once. Row order matters: the training
# rows come first and are separated again later by position.
#
# ignore_index=True gives the combined frame a fresh 0..N-1 index. Without it
# the train and test indexes (both starting at 0) are simply glued together,
# leaving duplicated labels that make any label-based lookup or alignment on
# `data` ambiguous.
data = pd.concat([train_data, test_data], ignore_index=True)
data.shape

(79672, 36)

In [5]:
# Discard the raw outcome timestamp column.
data = data.drop(columns=['outcome_datetime'])

In [6]:
# Replace missing values with the literal 'Unknown' in the text (object-dtype)
# columns only. Filling the whole frame would also write the string into any
# numeric column that contains NaN, silently turning it into a mixed
# string/number object column; such a column would then be picked up as
# categorical further down and cannot be sorted by the label encoder.
# Numeric columns are left untouched (their NaNs, if any, stay NaN).
text_columns = data.select_dtypes(include='object').columns
data[text_columns] = data[text_columns].fillna('Unknown')

In [7]:
# Per-column count of missing values still present in the frame.
data.isna().sum()

dob_year                      0
dob_month                     0
age_upon_intake               0
animal_type                   0
breed                         0
color                         0
intake_condition              0
intake_type                   0
sex_upon_intake               0
count                         0
age_upon_intake_(days)        0
age_upon_intake_(years)       0
age_upon_intake_age_group     0
intake_datetime               0
intake_month                  0
intake_year                   0
intake_monthyear              0
intake_weekday                0
intake_hour                   0
intake_number                 0
time_in_shelter               0
time_in_shelter_days          0
age_upon_outcome              0
date_of_birth                 0
sex_upon_outcome              0
age_upon_outcome_(days)       0
age_upon_outcome_(years)      0
age_upon_outcome_age_group    0
outcome_month                 0
outcome_year                  0
outcome_monthyear             0
outcome_

In [8]:
# Remember the distinct outcome labels (as text, in order of first appearance)
# before the column is label-encoded; used later to decode predictions.
targets = data['outcome_type'].unique()

In [9]:
# Columns removed from the working frame at this stage.
dropped_columns = [
    'age_upon_intake',
    'count',
    'age_upon_intake_(years)',
    'intake_datetime',
    'intake_monthyear',
    'time_in_shelter',
    'age_upon_outcome',
    'date_of_birth',
    'age_upon_outcome_(years)',
    'outcome_monthyear',
]
data = data.drop(columns=dropped_columns)

In [10]:
# Partition the column names by dtype: object-dtype columns are treated as
# categorical, everything else as numerical.
categoryColumns = [name for name in data.columns if data[name].dtype == 'object']
numericalColumns = [name for name in data.columns if data[name].dtype != 'object']

print('Category Columns: ',categoryColumns, '\n')
print('Numerical Columns: ',numericalColumns)

Category Columns:  ['animal_type', 'breed', 'color', 'intake_condition', 'intake_type', 'sex_upon_intake', 'age_upon_intake_age_group', 'intake_weekday', 'sex_upon_outcome', 'age_upon_outcome_age_group', 'outcome_weekday', 'outcome_type'] 

Numerical Columns:  ['dob_year', 'dob_month', 'age_upon_intake_(days)', 'intake_month', 'intake_year', 'intake_hour', 'intake_number', 'time_in_shelter_days', 'age_upon_outcome_(days)', 'outcome_month', 'outcome_year', 'outcome_hour', 'outcome_number']


In [11]:
# Integer-encode every categorical column. The same encoder instance is re-fit
# on each column in turn, so once the loop finishes it only holds the classes
# of the last column it processed.
le = preprocessing.LabelEncoder()
for column in categoryColumns:
    data[column] = le.fit_transform(data[column])

# Preview the encoded frame.
data.head()

Unnamed: 0,dob_year,dob_month,animal_type,breed,color,intake_condition,intake_type,sex_upon_intake,age_upon_intake_(days),age_upon_intake_age_group,...,time_in_shelter_days,sex_upon_outcome,age_upon_outcome_(days),age_upon_outcome_age_group,outcome_month,outcome_year,outcome_weekday,outcome_hour,outcome_number,outcome_type
0,2007,7,2,1982,527,3,3,2,3650,9,...,0.588194,2,3650,9,12,2017,4,0,1.0,6
1,2007,7,2,1982,527,3,2,2,2555,8,...,1.259722,2,2555,8,12,2014,2,16,2.0,6
2,2007,7,2,1982,527,3,2,2,2190,8,...,1.113889,2,2190,8,3,2014,2,17,3.0,6
3,2004,4,2,830,449,3,1,2,3650,9,...,4.970139,2,3650,9,4,2014,1,15,1.0,4
4,1997,10,2,1880,183,2,2,2,5840,3,...,0.119444,2,5840,3,11,2013,2,11,1.0,6


In [12]:
# Second round of feature removal.
data = data.drop(columns=[
    'dob_year',
    'age_upon_intake_(days)',
    'intake_month',
    'intake_year',
    'intake_number',
])

In [13]:
# Remove the intake age-group column as well.
data = data.drop(columns=['age_upon_intake_age_group'])

In [14]:
# Columns the author labels as irrelevant; removed from the feature set.
irrelevant = ['dob_month', 'color', 'intake_weekday', 'outcome_month']
data = data.drop(columns=irrelevant)

In [15]:
# Display (rows, columns) of the combined frame after the column drops above.
data.shape

(79672, 15)

In [16]:
# Separate the features from the (encoded) target column.
X = data.drop(columns='outcome_type')
y = data['outcome_type']

# `data` was built by stacking the training rows on top of the test rows, so
# the first len(train_data) rows are the labelled ones. Deriving the boundary
# from the training frame (instead of repeating a literal row count) keeps the
# split correct if the input files change, and .iloc makes every slice
# explicitly positional.
n_train = len(train_data)
X_train, X_test = X.iloc[:n_train, :], X.iloc[n_train:, :]
y_train = y.iloc[:n_train]

# sm = SMOTE(random_state=42)
# X_train, y_train = sm.fit_resample(X_train, y_train)

print('Train Shape', X_train.shape, y_train.shape)
print('Test Shape', X_test.shape)

Train Shape (47803, 14) (47803,)
Test Shape (31869, 14)


In [17]:
# Disabled experiment — nothing in this cell executes. It would M-estimate
# encode the listed features with category_encoders, fitting on the training
# rows and applying the fitted encoder to the test rows.
# feature_list = ['animal_type', 'breed', 'intake_condition', 'intake_type', 'outcome_number']

# MEE_encoder = ce.MEstimateEncoder()
# X_train[feature_list] = MEE_encoder.fit_transform(X_train[feature_list], y_train)
# X_test[feature_list] = MEE_encoder.transform(X_test[feature_list])

In [67]:
# Hyper-parameters for the gradient-boosted tree classifier.
xgb_params = {
    'colsample_bytree': 0.5,
    'max_depth': 10,
    'n_estimators': 100,
}

# Build the classifier and train it on the labelled rows; fit() hands back the
# fitted estimator, which is what `model` ends up bound to.
model = XGBClassifier(**xgb_params).fit(X_train, y_train)

In [68]:
# Predict the encoded outcome class for every test row.
y_pred = model.predict(X_test)

In [69]:
# Wrap the predictions in a single-column DataFrame so they can be joined with
# the animal ids column-wise.
outcome_type = pd.DataFrame(y_pred)

In [70]:
# Place the animal ids and the predicted classes side by side (aligned on
# their indexes) and give the two columns their submission names.
submission_columns = ['animal_id_outcome', 'outcome_type']
prediction = pd.concat([animal_id, outcome_type], axis='columns')
prediction.columns = submission_columns

In [71]:
# Preview the first rows; outcome_type still holds the integer class codes here.
prediction.head()

Unnamed: 0,animal_id_outcome,outcome_type
0,A723133,8
1,A723134,0
2,A723135,0
3,A723136,1
4,A723137,0


In [72]:
# Build a lookup list that maps an encoded class code back to its original
# label: target_names[code] -> label text.
#
# `targets` holds the label strings captured before encoding and
# data.outcome_type.unique() holds the encoded values; both list the distinct
# values in the same first-appearance order, so pairing them positionally
# recovers the code -> label mapping.
encoded_targets = data.outcome_type.unique()

# Size the table from the data rather than a hard-coded class count, so the
# lookup keeps working when the number of distinct outcome labels changes.
target_names = [''] * len(encoded_targets)
for label, code in zip(targets, encoded_targets):
    target_names[code] = label

In [73]:
# Show the distinct encoded outcome codes in order of first appearance.
data['outcome_type'].unique()

array([6, 4, 8, 0, 3, 7, 5, 1, 2])

In [74]:
# Translate each predicted class code back to its label text.
# Running this cell twice fails: after the first run the column holds strings,
# which cannot be used to index target_names.
prediction['outcome_type'] = prediction['outcome_type'].map(lambda code: target_names[code])

In [75]:
# Preview the first ten rows of the prediction frame after decoding.
prediction.head(10)

Unnamed: 0,animal_id_outcome,outcome_type
0,A723133,Transfer
1,A723134,Adoption
2,A723135,Adoption
3,A723136,Died
4,A723137,Adoption
5,A723138,Return to Owner
6,A723139,Transfer
7,A723140,Adoption
8,A723141,Adoption
9,A723142,Transfer


In [76]:
# Write the submission file; the DataFrame index is left out of the CSV.
submission_path = 'prediction15.csv'
prediction.to_csv(submission_path, index=False)