In [3]:
import pandas as pd
import sklearn
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import math
import gzip, csv

matplotlib.style.use('ggplot')

%matplotlib inline

In [4]:
train_sets = pd.read_csv("./train.csv")
train_sets.head(2)

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby


In [95]:
train_sets.groupby("Breed").size()

Breed
Abyssinian Mix                                       2
Affenpinscher Mix                                    6
Afghan Hound Mix                                     1
Airedale Terrier                                     1
Airedale Terrier Mix                                 5
Airedale Terrier/Labrador Retriever                  1
Airedale Terrier/Miniature Schnauzer                 1
Akita                                                3
Akita Mix                                           11
Akita/Australian Cattle Dog                          1
Akita/Chow Chow                                      1
Akita/German Shepherd                                1
Akita/Labrador Retriever                             1
Akita/Pit Bull                                       1
Akita/Siberian Husky                                 2
Alaskan Husky                                        2
Alaskan Husky Mix                                   10
Alaskan Husky/Australian Shepherd                    2
Alas

In [58]:
test_sets = pd.read_csv("./test.csv")
test_sets.head(2)

Unnamed: 0,ID,Name,DateTime,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,1,Summer,2015-10-12 12:15:00,Dog,Intact Female,10 months,Labrador Retriever Mix,Red/White
1,2,Cheyenne,2014-07-26 17:59:00,Dog,Spayed Female,2 years,German Shepherd/Siberian Husky,Black/Tan


In [15]:
def logloss(act, pred, score):
    res = 0
    sample_size = len(act)
    for i in xrange(sample_size):
        res += math.log(max(score[i])) if act[i] == pred[i] else 0
    return - res / sample_size

In [93]:
from sklearn.preprocessing import OneHotEncoder

Y_train = train_sets.OutcomeType.values

def get_id_map(data_sets, name):
    ids = list(set(data_sets[name]))
    return dict([ x for x in zip(ids, xrange(len(ids))) ])
    
animal_map = get_id_map(train_sets, "AnimalType")
sexupon_map = get_id_map(train_sets, "SexuponOutcome")
age_map = get_id_map(train_sets, "AgeuponOutcome")

color_map = list(set([ x.split('/')[-1] for x in train_sets.Color]))
color_map = dict([ x for x in zip(color_map, xrange(len(color_map)))])

X_train_raw = [[animal_map[x[0]], sexupon_map[x[1]], age_map[x[2]], color_map[x[3].split('/')[-1]] ]
                for x in zip(train_sets.AnimalType, train_sets.SexuponOutcome, 
                             train_sets.AgeuponOutcome, train_sets.Color)]

enc = OneHotEncoder()
enc.fit(X_train_raw)

X_train = enc.transform(X_train_raw)
print X_train.shape

(26729, 108)


In [94]:
from sklearn import linear_model

logreg = linear_model.LogisticRegression(C=1e5, solver="lbfgs", multi_class="multinomial")
logreg.fit(X_train, Y_train)
logloss(Y_train, logreg.predict(X_train), logreg.predict_proba(X_train))

0.259757423986945

In [62]:
X_test_raw = [[animal_map[x[0]], sexupon_map[x[1]], age_map.get(x[2], age_map["20 years"])] 
                for x in zip(test_sets.AnimalType, test_sets.SexuponOutcome, test_sets.AgeuponOutcome)]

predicted = np.array(logreg.predict_proba(enc.transform(X_test_raw)))
labels = ['Id']
for i in logreg.classes_:
    labels.append(i)
with gzip.open('./animal.csv.gz', 'wt') as outf:
    fo = csv.writer(outf, lineterminator='\n')
    fo.writerow(labels)

    for i, pred in enumerate(predicted):
        fo.writerow([i + 1] + list(pred))