In [2]:
import pandas as pd
import sklearn
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import math
import gzip, csv

matplotlib.style.use('ggplot')

%matplotlib inline

In [3]:
# Load the labelled training rows; the path is relative to the notebook's
# working directory.
train_sets = pd.read_csv(filepath_or_buffer="./train.csv")
# Preview the first two rows.
train_sets.iloc[:2]

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby


In [30]:
# Test-set colour strings that mention "Gray Tiger".
# NOTE(review): `test_sets` is only created by a cell further down the notebook
# (this cell's execution count is 30, the loading cell's is 4), so this cell
# fails on Restart & Run All unless it is moved below the test-set load.
[color for color in test_sets.Color if "Gray Tiger" in color]

['White/Gray Tiger']

In [4]:
# Load the unlabelled test rows; the path is relative to the notebook's
# working directory.
test_sets = pd.read_csv(filepath_or_buffer="./test.csv")
# Preview the first two rows.
test_sets.iloc[:2]

Unnamed: 0,ID,Name,DateTime,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,1,Summer,2015-10-12 12:15:00,Dog,Intact Female,10 months,Labrador Retriever Mix,Red/White
1,2,Cheyenne,2014-07-26 17:59:00,Dog,Spayed Female,2 years,German Shepherd/Siberian Husky,Black/Tan


In [5]:
def logloss(act, pred, score, classes=None):
    """Score class-probability predictions with a log-based loss.

    Parameters
    ----------
    act : sequence
        True label of each sample.
    pred : sequence
        Predicted label of each sample (only used when ``classes`` is None).
    score : sequence of sequences
        Per-sample class probabilities, one row per sample.
    classes : sequence, optional
        Label of each column of ``score``. When given, the standard
        multi-class log loss is returned: the negated mean log-probability
        assigned to the *true* class, with probabilities clipped to
        ``[1e-15, 1]`` so a zero probability does not raise.

    Returns
    -------
    float
        ``0.0`` for empty input. Without ``classes`` this is the legacy
        metric: the negated sum of ``log(max(row))`` over the samples whose
        prediction equals the true label, divided by the total sample count.

    Notes
    -----
    The legacy metric adds nothing for misclassified samples, so it is NOT
    the usual log loss and rewards wrong predictions with a lower value.
    Pass ``classes`` (e.g. a fitted estimator's ``classes_``) to get the
    standard metric.

    Raises
    ------
    KeyError
        If ``classes`` is given and a true label is not one of them.
    """
    sample_size = len(act)
    if sample_size == 0:
        # Nothing to average over; avoid dividing by zero.
        return 0.0

    total = 0.0
    if classes is None:
        # Legacy behaviour: only correctly predicted samples contribute.
        for truth, guess, probs in zip(act, pred, score):
            if truth == guess:
                total += math.log(max(probs))
    else:
        eps = 1e-15
        column_of = dict((label, idx) for idx, label in enumerate(classes))
        for truth, probs in zip(act, score):
            prob_true = probs[column_of[truth]]
            total += math.log(min(max(prob_true, eps), 1.0))
    return -total / sample_size

In [67]:
from sklearn.preprocessing import OneHotEncoder

# Target labels: the outcome type of every training row, as a plain array.
Y_train = train_sets["OutcomeType"].values

def get_id_map(data_sets, name):
    """Map each distinct value of column ``name`` to a consecutive integer id.

    Ids are handed out in order of first appearance, so the mapping is the
    same on every run. The previous version took its order from iterating a
    ``set``, which is arbitrary and can change between interpreter sessions.

    Parameters
    ----------
    data_sets : mapping of column name -> iterable
        Anything that supports ``data_sets[name]`` and iteration over the
        result (the notebook passes a DataFrame).
    name : str
        Column to index.

    Returns
    -------
    dict
        ``{value: id}`` with ids ``0 .. n_distinct - 1``.
    """
    id_map = {}
    for value in data_sets[name]:
        if value not in id_map:
            # The next free id equals the number of values seen so far.
            id_map[value] = len(id_map)
    return id_map
    
# Value -> integer id lookups for the columns that are used verbatim.
animal_map = get_id_map(train_sets, "AnimalType")
sexupon_map = get_id_map(train_sets, "SexuponOutcome")
age_map = get_id_map(train_sets, "AgeuponOutcome")

# Colours are reduced to the part before the first '/', then numbered.
color_map = list(set(full_color.split('/')[0] for full_color in train_sets.Color))
color_map = dict((color_name, idx) for idx, color_name in enumerate(color_map))

def is_pure(breed):
    """Return 1 when ``breed`` names a single breed, 0 when it is a mix.

    A breed string counts as mixed if it contains "Mix" or lists more than
    one breed separated by "/".

    The previous implementation had the flag inverted (1 for mixes), which
    contradicted the function name. Any encoder or model fitted with the old
    flag has to be refitted after this change.
    """
    mixed = "Mix" in breed or "/" in breed
    return 0 if mixed else 1

def get_breed(breed):
    """Return the primary breed name contained in a breed description.

    * "<name> Mix"        -> "<name>"   (text before "Mix", trailing
                                         whitespace removed)
    * "<first>/<second>"  -> "<first>"  (text before the first "/")
    * anything else       -> returned unchanged

    The "Mix" check runs first, so a string containing both is cut at "Mix".

    The text before "Mix" is right-stripped instead of dropping exactly one
    character. The old ``pos - 1`` slice assumed a single separating space
    and turned "Mix" into "Mi" (slice end of -1) or "FooMix" into "Fo".
    """
    pos = breed.find("Mix")
    if pos >= 0:
        return breed[:pos].rstrip()
    pos = breed.find("/")
    if pos >= 0:
        return breed[:pos]
    return breed

# Distinct primary breeds seen in training, then numbered 0..n-1.
breed_map = list(set(get_breed(full_breed) for full_breed in train_sets.Breed))
breed_map = dict((breed_name, idx) for idx, breed_name in enumerate(breed_map))

# One integer-coded row per training animal, columns in this order:
# animal type, sex, age, primary colour, is_pure flag, primary breed.
X_train_raw = [
    [
        animal_map[animal],
        sexupon_map[sex],
        age_map[age],
        color_map[color.split('/')[0]],
        is_pure(breed),
        breed_map[get_breed(breed)],
    ]
    for animal, sex, age, color, breed in zip(
        train_sets.AnimalType,
        train_sets.SexuponOutcome,
        train_sets.AgeuponOutcome,
        train_sets.Color,
        train_sets.Breed,
    )
]

# Fit the one-hot encoder on the integer-coded training rows; `enc` is kept
# because the test rows are encoded with the same fitted encoder later on.
enc = OneHotEncoder().fit(X_train_raw)

# Encoded training matrix; print its (rows, columns) shape as a sanity check.
X_train = enc.transform(X_train_raw)
print(X_train.shape)

(26729, 333)


In [72]:
from sklearn import linear_model
try:
    # Current location of train_test_split.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older scikit-learn releases only ship the legacy module.
    from sklearn.cross_validation import train_test_split

# Set to True to score on a 30% hold-out split instead of on the training data.
USE_HOLDOUT = False

logreg = linear_model.LogisticRegression(C=1e5, solver="lbfgs", multi_class="multinomial")
if USE_HOLDOUT:
    # Use separate names for the split: X_train / Y_train are reused by later
    # cells and must keep describing the full training set.
    X_fit, X_val, Y_fit, Y_val = train_test_split(X_train, Y_train, test_size = 0.3)
    logreg.fit(X_fit, Y_fit)
    print(logloss(Y_val, logreg.predict(X_val), logreg.predict_proba(X_val)))
else:
    # Fit on everything; the printed score is measured on the training data itself.
    logreg.fit(X_train, Y_train)
    print(logloss(Y_train, logreg.predict(X_train), logreg.predict_proba(X_train)))

0.250019109359


In [74]:
# Integer-code the test rows with the lookups built from the training data.
# Ages missing from age_map reuse the id of "20 years"; breeds missing from
# breed_map reuse id 0.
X_test_raw = [
    [
        animal_map[animal],
        sexupon_map[sex],
        age_map.get(age, age_map["20 years"]),
        color_map[color.split('/')[0]],
        is_pure(breed),
        breed_map.get(get_breed(breed), 0),
    ]
    for animal, sex, age, color, breed in zip(
        test_sets.AnimalType,
        test_sets.SexuponOutcome,
        test_sets.AgeuponOutcome,
        test_sets.Color,
        test_sets.Breed,
    )
]

# Class probabilities from the logistic regression on the one-hot encoded rows.
predicted = np.array(logreg.predict_proba(enc.transform(X_test_raw)))

# Header row: 'Id' followed by one column per class, in the model's class order.
labels = ['Id'] + list(logreg.classes_)

with gzip.open('./animal_lr.csv.gz', 'wt') as outf:
    fo = csv.writer(outf, lineterminator='\n')
    fo.writerow(labels)

    # Row ids in the output start at 1.
    for row_id, probs in enumerate(predicted, 1):
        fo.writerow([row_id] + list(probs))

In [57]:
from sklearn.ensemble import RandomForestClassifier

# Fixed seeds make the 70/30 split and the forest, and therefore the score
# shown below, reproducible across kernel restarts.
X_train_s, X_val, Y_train_s, Y_val = train_test_split(
    X_train_raw, Y_train, test_size = 0.3, random_state=42)

# The forest is trained on the integer-coded rows directly (no one-hot encoding).
rf = RandomForestClassifier(n_estimators=50, random_state=42)
rf.fit(X_train_s, Y_train_s)
# Score on the held-out 30%.
logloss(Y_val, rf.predict(X_val), rf.predict_proba(X_val))

0.21220774829451083

In [59]:
from sklearn.ensemble import GradientBoostingClassifier

# Fixed seeds make the 70/30 split and the boosted model, and therefore the
# score shown below, reproducible across kernel restarts.
X_train_s, X_val, Y_train_s, Y_val = train_test_split(
    X_train_raw, Y_train, test_size = 0.3, random_state=42)

# Gradient boosting on the integer-coded rows directly (no one-hot encoding).
gbdt = GradientBoostingClassifier(n_estimators = 200, random_state=42)
gbdt.fit(X_train_s, Y_train_s)
# Score on the held-out 30%.
logloss(Y_val, gbdt.predict(X_val), gbdt.predict_proba(X_val))

0.2581453286149607

In [58]:
# Integer-code the test rows with the same six columns, in the same order, as
# X_train_raw, which `rf` is fitted on: animal type, sex, age, primary colour,
# is_pure flag, primary breed. The breed id used to be missing here, so the
# forest was handed five features instead of six.
# NOTE(review): unseen ages reuse the id of "20 years" and unseen breeds reuse
# id 0, both of which alias a real training category - confirm this is intended.
X_test_raw = [[animal_map[x[0]], sexupon_map[x[1]], age_map.get(x[2], age_map["20 years"]),
                color_map[x[3].split('/')[0]], is_pure(x[4]), breed_map.get(get_breed(x[4]), 0) ]
                for x in zip(test_sets.AnimalType, test_sets.SexuponOutcome,
                             test_sets.AgeuponOutcome, test_sets.Color, test_sets.Breed)]

# Class probabilities from the random forest (it takes the integer codes directly).
predicted = np.array(rf.predict_proba(X_test_raw))

# Header row: 'Id' followed by one column per class, in the model's class order.
labels = ['Id'] + list(rf.classes_)

with gzip.open('./animal_rf.csv.gz', 'wt') as outf:
    fo = csv.writer(outf, lineterminator='\n')
    fo.writerow(labels)

    # Row ids in the output start at 1.
    for i, pred in enumerate(predicted):
        fo.writerow([i + 1] + list(pred))