# Shelter Animal Outcomes

https://www.kaggle.com/c/shelter-animal-outcomes

In [None]:
%matplotlib inline

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [None]:
animals = pd.read_csv('data/shelter-train.csv')

In [None]:
animals.head(2)

In [None]:
animals.AgeuponOutcome.value_counts().plot(kind='bar', figsize=(10, 6))

In [None]:
sns.countplot(data=animals, x=animals.AgeuponOutcome)

In [None]:
g = sns.countplot(data=animals, x=animals.AgeuponOutcome)
g.set_xticklabels(g.get_xticklabels(), rotation=90)

In [None]:
# Same count plot as above, drawn on an explicitly sized figure so the
# rotated tick labels have room.
fig, age_ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=animals, x='AgeuponOutcome', ax=age_ax)
age_ax.set_xticklabels(age_ax.get_xticklabels(), rotation=90)

In [None]:
def get_age_in_days(age_upon_outcome):
    """Convert an age description such as ``'2 years'`` into a number of days.

    Parameters
    ----------
    age_upon_outcome : str, float or None
        Text of the form ``'<integer> <unit>'`` where the unit is one of
        day(s), week(s), month(s) or year(s). Missing values (``None`` or
        NaN) are accepted.

    Returns
    -------
    int or None
        Approximate age in days, using 365 days per year, 30 per month and
        7 per week. Missing input yields 0. A unit that is not recognised
        yields ``None``.

    Raises
    ------
    ValueError
        If the text does not consist of exactly two whitespace-separated
        parts, or the amount of a recognised unit is not an integer.
    """
    # A missing cell reaches this function as float NaN (whose str() is 'nan');
    # None is treated the same way instead of crashing on ``None.split``.
    if age_upon_outcome is None or str(age_upon_outcome) == 'nan':
        return 0

    days_per_unit = {
        'year': 365, 'years': 365,
        'month': 30, 'months': 30,
        'week': 7, 'weeks': 7,
        'day': 1, 'days': 1,
    }

    # split() without an argument tolerates leading, trailing and repeated
    # whitespace, which split(' ') does not.
    time_value, unit = age_upon_outcome.split()

    factor = days_per_unit.get(unit)
    if factor is None:
        # Unknown unit: signal "no value" explicitly rather than by falling
        # off the end of the function.
        return None
    return int(time_value) * factor

In [None]:
animals['AgeInDays'] = animals.AgeuponOutcome.map(get_age_in_days)

In [None]:
animals.head(3)

In [None]:
animals.boxplot(column=['AgeInDays'], by='OutcomeType', figsize=(10, 6))

In [None]:
f, ax = plt.subplots(1, 1, figsize=(10, 6))
sns.boxplot(data=animals, x='OutcomeType', y='AgeInDays', ax=ax)

In [None]:
# Bucket ages into 350-day-wide intervals and plot how many animals fall
# into each one.
# NOTE: pd.cut intervals are open on the left, so rows with AgeInDays == 0
# (which includes rows whose age was missing) and rows above the last edge
# (6650 days) belong to no interval and are left out of this chart.
age_bins = pd.cut(animals.AgeInDays, list(range(0, 7000, 350)))

# Despite its name, this frame holds per-interval row counts, not averages.
avg_data_by_age = animals.groupby(age_bins).count()
avg_data_by_age.AgeInDays.plot(kind='bar')

In [None]:
sns.distplot(animals.AgeInDays, bins=20, kde=False)

ดู distribution ของสัตว์แต่ละชนิด

In [None]:
animals.AnimalType.value_counts().plot(kind='bar')

In [None]:
sns.countplot(data=animals, x=animals.AnimalType)

ดู distribution ของ outcome type

In [None]:
animals['OutcomeType'].value_counts().plot(kind='bar')

In [None]:
sns.countplot(data=animals, x=animals.OutcomeType)

เทียบ distribution ของสัตว์แต่ละชนิดโดยแยกตาม outcome type

In [None]:
_, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 4))
animals[['AnimalType', 'OutcomeType']].groupby(['OutcomeType', 'AnimalType']).size().unstack().plot(kind='bar', ax=ax1, rot=0)
animals[['AnimalType', 'OutcomeType']].groupby(['AnimalType', 'OutcomeType']).size().unstack().plot(kind='bar', ax=ax2, rot=0)

In [None]:
_, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 4))
sns.countplot(data=animals, x='OutcomeType', hue='AnimalType', ax=ax1)
sns.countplot(data=animals, x='AnimalType',hue='OutcomeType', ax=ax2)

ดู distribution ของเพศ

In [None]:
animals['SexuponOutcome'].value_counts().plot(kind='bar')

In [None]:
sns.countplot(data=animals, x=animals.SexuponOutcome)

In [None]:
_, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 4))
animals[['SexuponOutcome', 'OutcomeType']].groupby(['OutcomeType', 'SexuponOutcome']).size().unstack().plot(kind='bar', ax=ax1)
animals[['SexuponOutcome', 'OutcomeType']].groupby(['SexuponOutcome', 'OutcomeType']).size().unstack().plot(kind='bar', ax=ax2)

In [None]:
_, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 4))
sns.countplot(data=animals, x='OutcomeType', hue='SexuponOutcome', ax=ax1)
sns.countplot(data=animals, x='SexuponOutcome',hue='OutcomeType', ax=ax2)

In [None]:
def get_sex(x):
    """Reduce a ``SexuponOutcome`` value to ``'male'``, ``'female'`` or ``'unknown'``.

    The value is passed through ``str`` first, so non-string inputs such as
    NaN are accepted. Matching is a case-sensitive substring test, with
    ``'Male'`` checked before ``'Female'``.
    """
    text = str(x)
    for marker, label in (('Male', 'male'), ('Female', 'female')):
        if marker in text:
            return label
    return 'unknown'

In [None]:
animals['Sex'] = animals.SexuponOutcome.apply(get_sex)

In [None]:
animals.Sex.value_counts().plot(kind='bar')

In [None]:
sns.countplot(x=animals.Sex)

In [None]:
def get_neutered(x):
    """Reduce a ``SexuponOutcome`` value to ``'neutered'``, ``'intact'`` or ``'unknown'``.

    Both ``'Spayed'`` and ``'Neutered'`` map to ``'neutered'``. The value is
    passed through ``str`` first, so non-string inputs such as NaN are
    accepted. Matching is a case-sensitive substring test.
    """
    status = str(x)
    if 'Spayed' in status or 'Neutered' in status:
        return 'neutered'
    return 'intact' if 'Intact' in status else 'unknown'

In [None]:
animals['Neutered'] = animals.SexuponOutcome.apply(get_neutered)

In [None]:
animals.Neutered.value_counts().plot(kind='bar')

In [None]:
sns.countplot(x=animals.Neutered)

In [None]:
# 2x2 grid: one row per derived feature (Sex, then Neutered).
# Left column groups bars by outcome, right column groups them by the feature.
_, axes = plt.subplots(2, 2, figsize=(16, 8))
for row, feature in enumerate(['Sex', 'Neutered']):
    pairs = animals[[feature, 'OutcomeType']]
    pairs.groupby(['OutcomeType', feature]).size().unstack().plot(kind='bar', ax=axes[row][0], rot=0)
    pairs.groupby([feature, 'OutcomeType']).size().unstack().plot(kind='bar', ax=axes[row][1], rot=0)

In [None]:
# Seaborn version of the same 2x2 grid: one row per derived feature,
# outcome on the x axis at the left, the feature on the x axis at the right.
_, axes = plt.subplots(2, 2, figsize=(16, 8))
for (left_ax, right_ax), feature in zip(axes, ['Sex', 'Neutered']):
    sns.countplot(data=animals, x='OutcomeType', hue=feature, ax=left_ax)
    sns.countplot(data=animals, x=feature, hue='OutcomeType', ax=right_ax)

ช่วงเวลาก็อาจจะมีส่วนในการตัดสินใจ?

In [None]:
animals['Converted-DateTime'] = pd.to_datetime(animals["DateTime"]).dt.date

In [None]:
animals.head(1)

In [None]:
animals.info()

In [None]:
monthGroup = animals["Converted-DateTime"].groupby(animals["OutcomeType"])

In [None]:
f, ax = plt.subplots(5, 1, figsize=(16, 17))

# One panel per outcome type: number of records on each calendar date.
outcome_names = ['Adoption', 'Died', 'Euthanasia', 'Return_to_owner', 'Transfer']
for panel, outcome_name in zip(ax, outcome_names):
    outcome_rows = animals[animals.OutcomeType == outcome_name]
    outcome_rows.groupby('Converted-DateTime')['Converted-DateTime'].count().plot(ax=panel)

In [None]:
f, ax = plt.subplots(5, 1, figsize=(16, 17))

# Same chart built from a single groupby: the dates are split by outcome
# type, then each outcome's dates are counted per calendar date.
month_group = animals['Converted-DateTime'].groupby(animals['OutcomeType'])
for panel_index, (outcome_name, outcome_dates) in enumerate(month_group):
    daily_counts = outcome_dates.groupby(animals['Converted-DateTime']).count()
    daily_counts.plot(ax=ax[panel_index])

In [None]:
animals.head(3)

In [None]:
X = animals.drop(['AnimalID', 'Name', 'DateTime', 'OutcomeType', 'OutcomeSubtype', 'SexuponOutcome', 'AgeuponOutcome', 'Converted-DateTime'], axis=1)
y = animals['OutcomeType']

In [None]:
X.head()

In [None]:
y.head()

In [None]:
X.isnull().sum()

In [None]:
X.count()

In [None]:
y.count()

In [None]:
animals_test = pd.read_csv("data/shelter-test.csv")

In [None]:
from sklearn import preprocessing

# Encode AnimalType as integer codes.
# The encoder is fitted on, and the codes are computed from, the untouched
# `animals` frame (X was derived from it by dropping columns only). Reading
# from X instead would make a second run of this cell fit the encoder on the
# already-encoded integers, after which it could no longer transform the
# original labels.
le_animal_type = preprocessing.LabelEncoder()
le_animal_type.fit(animals.AnimalType)
X['AnimalType'] = le_animal_type.transform(animals.AnimalType)

In [None]:
# Every encoder below reads its labels from the untouched `animals` frame
# (X was derived from it by dropping columns only), so the cell can be
# re-run without fitting an encoder on already-encoded integers.
le_sex = preprocessing.LabelEncoder()
le_sex.fit(animals.Sex)
X['Sex'] = le_sex.transform(animals.Sex)

le_neutered = preprocessing.LabelEncoder()
le_neutered.fit(animals.Neutered)
X['Neutered'] = le_neutered.transform(animals.Neutered)

# Breed and Color are fitted on train + test values together so the encoders
# can later transform test rows whose value never occurs in the training set.
# pd.concat is used because Series.append no longer exists in pandas 2.x.
le_breed = preprocessing.LabelEncoder()
le_breed.fit(pd.concat([animals.Breed, animals_test.Breed]))
X['Breed'] = le_breed.transform(animals.Breed)

le_color = preprocessing.LabelEncoder()
le_color.fit(pd.concat([animals.Color, animals_test.Color]))
X['Color'] = le_color.transform(animals.Color)

# Target labels -> integer class ids; le_out.inverse_transform maps them back.
le_out = preprocessing.LabelEncoder()
le_out.fit(animals['OutcomeType'])
y = le_out.transform(animals['OutcomeType'])

In [None]:
X.head()

In [None]:
y

In [None]:
le_out.inverse_transform(y)

In [None]:
from sklearn import model_selection, neighbors

# Candidate neighbourhood sizes: every k from 1 to 30.
k_range = list(range(1, 31))
param_grid = {'n_neighbors': k_range}

# Score each k by 3-fold cross-validated accuracy.
knn = neighbors.KNeighborsClassifier()
grid = model_selection.GridSearchCV(
    knn,
    param_grid,
    cv=3,
    scoring='accuracy',
    return_train_score=True,
)
grid.fit(X, y)

# Mean validation accuracy per candidate, in the same order as k_range.
grid_mean_scores = list(grid.cv_results_['mean_test_score'])

In [None]:
print(grid.best_score_)
print(grid.best_params_)
print(grid.best_estimator_)

**Warning:** Cross-validation for the Random Forest classifier takes a long time.

In [None]:
from sklearn import model_selection, ensemble

# Forest sizes to compare.
n_range = [1000, 1500]
param_grid = dict(n_estimators=n_range)

# A random forest is a stochastic model; fixing random_state makes the
# cross-validated scores repeatable from one run of the notebook to the next.
rf = ensemble.RandomForestClassifier(random_state=42)
grid = model_selection.GridSearchCV(rf, param_grid, cv=3, scoring='accuracy', return_train_score=True)
grid.fit(X, y)

# Mean validation accuracy per candidate, in the same order as n_range.
grid_mean_scores = list(grid.cv_results_['mean_test_score'])

In [None]:
plt.plot(n_range, grid_mean_scores)
plt.xlabel('Value of estimators for Random Forest')
plt.ylabel('Cross-Validated Accuracy')

In [None]:
print(grid.best_score_)
print(grid.best_params_)
print(grid.best_estimator_)

## ลองใช้โมเดลกับชุดข้อมูลทดสอบ

In [None]:
animals_test = pd.read_csv('data/shelter-test.csv')
animals_test.head()

In [None]:
animals_test['AgeInDays'] = animals_test.AgeuponOutcome.map(get_age_in_days)
animals_test['Sex'] = animals_test.SexuponOutcome.apply(get_sex)
animals_test['Neutered'] = animals_test.SexuponOutcome.apply(get_neutered)

In [None]:
animals_test.head(3)

In [None]:
X_test = animals_test.drop(['ID', 'Name', 'DateTime', 'SexuponOutcome', 'AgeuponOutcome'], axis=1)

In [None]:
# Encode the test features with the encoders that were fitted earlier in the
# notebook, so each category maps to the same integer code as in training.
X_test.AnimalType = le_animal_type.transform(X_test.AnimalType)
X_test.Sex = le_sex.transform(X_test.Sex)
X_test.Neutered = le_neutered.transform(X_test.Neutered)
# le_breed and le_color were fitted on train + test values combined, so breeds
# and colours that appear only in the test set are known to the encoders.
X_test.Breed = le_breed.transform(X_test.Breed)
X_test.Color = le_color.transform(X_test.Color)

In [None]:
X_test.head()

In [None]:
from sklearn import ensemble

# Final model: a 1000-tree random forest trained on the full training set.
# random_state is fixed so the predictions (and the submission built from
# them) are the same on every run.
rf = ensemble.RandomForestClassifier(n_estimators=1000, random_state=42)

rf.fit(X, y)

# Select the test columns in the training column order so each feature lines
# up with the one the model was fitted on, however X_test was assembled.
y_pred_class = rf.predict(X_test[X.columns])

In [None]:
rf.feature_importances_

In [None]:
y_pred_class

In [None]:
y_pred_class = le_out.inverse_transform(y_pred_class)
y_pred_class

สร้างไฟล์ submission เตรียมส่ง Kaggle

In [None]:
import csv

def create_submission(y_pred_class, path='output/submission.csv'):
    """Write predicted outcome labels as a one-hot encoded submission CSV.

    The file has a header row ``ID, Adoption, Died, Euthanasia,
    Return_to_owner, Transfer`` followed by one row per prediction. Each row
    holds the 1-based position of the prediction as its ID and a 1 in the
    column matching the predicted label (0 elsewhere; a label that matches no
    column produces an all-zero row).

    Parameters
    ----------
    y_pred_class : iterable of str
        Predicted outcome label for each test row, in test-set order.
    path : str, optional
        Destination file. Defaults to ``'output/submission.csv'``; missing
        parent directories are created.

    Notes
    -----
    NOTE(review): every row is a hard 0/1 indicator. If the competition
    scores predicted probabilities, writing ``predict_proba`` output instead
    would likely score better - confirm against the competition rules.
    """
    import os

    outcome_columns = ['Adoption', 'Died', 'Euthanasia', 'Return_to_owner', 'Transfer']

    # Create the target directory on demand so a fresh checkout does not fail.
    target_dir = os.path.dirname(path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # newline='' is what the csv module expects for files it writes to;
    # the with-block closes the file even if a write raises.
    with open(path, 'w', newline='') as f:
        filewrite = csv.writer(f)
        filewrite.writerow(['ID'] + outcome_columns)
        for index, each in enumerate(y_pred_class):
            indicators = [int(each == column) for column in outcome_columns]
            filewrite.writerow([str(index + 1)] + indicators)

In [None]:
create_submission(y_pred_class)