In [1]:
import pandas

import sklearn

import matplotlib

import matplotlib.pyplot as plt

import numpy as np

import math

from sklearn.metrics import roc_auc_score

import gzip, csv

matplotlib.style.use('ggplot')

%matplotlib inline

In [2]:
train_sets = pandas.read_csv("/Users/zhouyu/Documents/workspace/train.csv")
train_sets.head(2)

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599


In [3]:
test_sets = pandas.read_csv("/Users/zhouyu/Documents/workspace/test.csv")
test_sets.head(2)

Unnamed: 0,Id,Dates,DayOfWeek,PdDistrict,Address,X,Y
0,0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.399588,37.735051
1,1,2015-05-10 23:51:00,Sunday,BAYVIEW,3RD ST / REVERE AV,-122.391523,37.732432


In [4]:
def llfun(act, pred):
    """ Logloss function for 1/0 probability
    """
    return (-(~(act == pred)).astype(int) * math.log(1e-15)).sum() / len(act)

In [4]:
def logloss(act, pred, score):
    res = 0
    sample_size = len(act)
    for i in xrange(sample_size):
        res += math.log(max(score[i])) if act[i] == pred[i] else 0
    return - res / sample_size

In [5]:
from sklearn.cluster import KMeans

train_xy = train_sets[["X", "Y"]]
clu = KMeans(n_clusters = 40)
clu.fit(train_xy)
clu_index = clu.predict(train_xy)

len(clu_index)

878049

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder

Y_train = train_sets.Category.values

districts = list(set(train_sets.PdDistrict))
dist_map = dict(zip(districts, xrange(len(districts))))

days = list(set(train_sets.DayOfWeek))
day_map = dict([x for x in zip(days, xrange(len(days)))])

def get_feats(data_set, cindex, dist_map, day_map, b_encode):
    data_raw = [ [dist_map[x[0]], day_map[x[3]], 
                  x[2], int(x[1][0:4]), int(x[1][5:7]), int(x[1][11:13])]
                for x in zip(data_set.PdDistrict, data_set.Dates, cindex, data_set.DayOfWeek)]

    data_raw = [x + [x[1] * 100 + x[5], x[0] * 100 + x[1], x[3] * 100 + x[4], 
                     x[0] * 100 + x[4], x[0] * 100 + x[5], x[0] * 10000 + x[3]] 
                for x in data_raw]
    
    if b_encode:
        enc = OneHotEncoder()
        return enc.fit_transform(data_raw)
    else:
        return data_raw

X_train = get_feats(train_sets, clu_index, dist_map, day_map, True)
print X_train.shape[0], len(X_train.toarray()[0])

878049 983


In [29]:
from sklearn import linear_model

logreg = linear_model.LogisticRegression(C=1e5, solver="lbfgs", multi_class="multinomial")
logreg.fit(X_train, Y_train)

outcome = logreg.predict(X_train)
logloss(Y_train, outcome, logreg.predict_proba(X_train))

0.33152503409915046

In [30]:
test_index = clu.predict(test_sets[["X", "Y"]])

X_test = get_feats(test_sets, test_index, dist_map, day_map, True)

predicted = np.array(logreg.predict_proba(X_test))
labels = ['Id']
for i in logreg.classes_:
    labels.append(i)
with gzip.open('logis_reg.csv.gz', 'wt') as outf:
    fo = csv.writer(outf, lineterminator='\n')
    fo.writerow(labels)

    for i, pred in enumerate(predicted):
        fo.writerow([i] + list(pred))

In [8]:
from sklearn.ensemble import RandomForestClassifier

X_train_raw = get_feats(train_sets, clu_index, dist_map, day_map, False)
print len(X_train_raw)

rf = RandomForestClassifier(n_estimators = 60)
rf.fit(X_train_raw, Y_train)
logloss(Y_train, rf.predict(X_train_raw), rf.predict_proba(X_train_raw))

878049


0.3326544452242097

In [None]:
from sklearn.svm import LinearSVC

svc = LinearSVC()
svc.fit(X_train, Y_train)
logloss(Y_train, svc.predict(X_train), logreg.predict_proba(X_train))