In [35]:
import pandas as pd
import datetime
from dateutil import parser

from sklearn import linear_model
from sklearn import cross_validation
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics

import numpy as np

In [2]:
def minute_of_day(row):
    """Return the number of minutes elapsed since midnight for a row.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``"Dates"`` (e.g. a DataFrame row or a dict).
        The value may be a date/time string or an object that is already a
        ``datetime.datetime`` / ``datetime.time`` (``pandas.Timestamp`` is a
        ``datetime.datetime`` subclass).

    Returns
    -------
    int
        ``hour * 60 + minute`` of the timestamp, i.e. a value in 0..1439.
    """
    stamp = row["Dates"]
    # dateutil's parser only accepts strings/streams and raises TypeError on
    # datetime objects, so parse only when the value is not already parsed
    # (e.g. when the CSV was loaded with parse_dates).
    if not isinstance(stamp, (datetime.datetime, datetime.time)):
        stamp = parser.parse(stamp)
    return stamp.hour * 60 + stamp.minute

def hour_of_day(row):
    """Return the hour (0..23) of the row's ``"Dates"`` value.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``"Dates"`` (e.g. a DataFrame row or a dict).
        The value may be a date/time string or an object that is already a
        ``datetime.datetime`` / ``datetime.time`` (``pandas.Timestamp`` is a
        ``datetime.datetime`` subclass).

    Returns
    -------
    int
        The ``hour`` component of the timestamp.
    """
    stamp = row["Dates"]
    # dateutil's parser only accepts strings/streams and raises TypeError on
    # datetime objects, so parse only when the value is not already parsed
    # (e.g. when the CSV was loaded with parse_dates).
    if not isinstance(stamp, (datetime.datetime, datetime.time)):
        stamp = parser.parse(stamp)
    return stamp.hour


In [23]:
def preprocess(filename):
    """Read a CSV file and return ``(X, y)`` suitable for learning.

    Parameters
    ----------
    filename : str
        Path of a CSV file containing at least the ``PdDistrict`` and
        ``DayOfWeek`` columns, and optionally a ``Category`` column.

    Returns
    -------
    X : numpy.ndarray
        Dense one-hot encoding of ``PdDistrict`` and ``DayOfWeek`` produced
        by ``DictVectorizer(sparse=False)``, one row per CSV row.
    y : numpy.ndarray or None
        Integer labels produced by ``LabelEncoder`` from ``Category``, or
        ``None`` when the file has no ``Category`` column.

    Notes
    -----
    A fresh ``DictVectorizer`` / ``LabelEncoder`` is fitted on every call,
    so the column layout of ``X`` (and the meaning of the integers in ``y``)
    depends on the values present in *this* file.
    NOTE(review): training and test files are presumed to contain the same
    set of districts and weekdays; otherwise their ``X`` columns will not
    line up — confirm against the data.
    """
    t = pd.read_csv(filename)

    # One dict per row, in row order. This avoids transposing the whole
    # frame (``.T.to_dict().values()``), which is slow on large files, keys
    # rows by index label (duplicates would collapse) and relies on dict
    # ordering to keep rows aligned with ``y``.
    records = t[["PdDistrict", "DayOfWeek"]].to_dict(orient="records")
    X = DictVectorizer(sparse=False).fit_transform(records)

    y = None
    if 'Category' in t.columns:
        y = LabelEncoder().fit_transform(t["Category"].values)
    return X, y

In [4]:
# Load the training file: X holds the encoded features, y the encoded labels.
X, y = preprocess(filename="train.csv")

In [22]:
# Hold out 30% of the training data for validation. The split is seeded so
# that the validation score below is reproducible across kernel restarts;
# without random_state every run shuffles differently.
X_train, X_valid, y_train, y_valid = cross_validation.train_test_split(
    X, y, test_size=0.3, random_state=42)

# Multinomial (softmax) logistic regression fitted with L-BFGS.
lg = linear_model.LogisticRegression(multi_class='multinomial', solver='lbfgs')
# fit() returns the estimator itself, so `model` and `lg` are the same object.
model = lg.fit(X_train, y_train)

In [24]:
# Per-class probabilities for every held-out validation row.
y_predict = model.predict_proba(X=X_valid)

In [25]:
# Multi-class log loss on the validation split (lower is better).
# NOTE(review): this presumes every class occurs in both y_train and
# y_valid so the probability columns line up with the labels — confirm.
metrics.log_loss(y_true=y_valid, y_pred=y_predict)

2.6133194529465631

In [26]:
# Encode the test file with the same helper; the label slot of the returned
# pair is discarded.
X_test, _ = preprocess(filename="test.csv")

In [31]:
# Per-class probabilities for every test row; these become the submission.
y_pred = model.predict_proba(X=X_test)

In [51]:
# Row identifiers for the submission: 0 .. n_rows-1, in prediction order.
# NOTE(review): presumes the test file's own ids are exactly this sequence
# in file order — confirm against test.csv.
ids = np.arange(0, y_pred.shape[0])

# One probability column per crime category.
# NOTE(review): this hard-coded list is expected to be in the same order as
# the columns of y_pred (i.e. model.classes_) — verify, since the fitted
# label encoder is not available here.
category_names = ['ARSON', 'ASSAULT', 'BAD CHECKS', 'BRIBERY', 'BURGLARY',
       'DISORDERLY CONDUCT', 'DRIVING UNDER THE INFLUENCE', 'DRUG/NARCOTIC',
       'DRUNKENNESS', 'EMBEZZLEMENT', 'EXTORTION', 'FAMILY OFFENSES',
       'FORGERY/COUNTERFEITING', 'FRAUD', 'GAMBLING', 'KIDNAPPING',
       'LARCENY/THEFT', 'LIQUOR LAWS', 'LOITERING', 'MISSING PERSON',
       'NON-CRIMINAL', 'OTHER OFFENSES', 'PORNOGRAPHY/OBSCENE MAT',
       'PROSTITUTION', 'RECOVERED VEHICLE', 'ROBBERY', 'RUNAWAY',
       'SECONDARY CODES', 'SEX OFFENSES FORCIBLE', 'SEX OFFENSES NON FORCIBLE',
       'STOLEN PROPERTY', 'SUICIDE', 'SUSPICIOUS OCC', 'TREA', 'TRESPASS',
       'VANDALISM', 'VEHICLE THEFT', 'WARRANTS', 'WEAPON LAWS']

# Build the frame from the probabilities first and add Id as its own column.
# Stacking ids and y_pred into a single array would promote the integer ids
# to float, so the CSV would contain "0.0", "1.0", ... instead of "0", "1".
# A mismatch between len(category_names) and y_pred.shape[1] raises here.
submission = pd.DataFrame(y_pred, columns=category_names)
submission.insert(0, 'Id', ids)

In [56]:
# Write the submission file; index=False keeps the DataFrame index out of it.
submission.to_csv(path_or_buf="submission-1.csv", index=False)