In [34]:
import pandas as pd
import datetime
from dateutil import parser

from sklearn import linear_model
from sklearn.neighbors import KNeighborsClassifier
from sklearn import cross_validation
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import LabelEncoder, scale
from sklearn import metrics

import sklearn.utils

import numpy as np
import pickle

In [16]:
def minute_of_day(row):
    d = parser.parse(row["Dates"])
    return d.hour*60 + d.minute

def day_number(row):
    "absolute number of days since 1/1/2000"
    d = parser.parse(row["Dates"])
    return (d.year - 2000)*365 + d.month * 30 + d.day

In [14]:
def preprocess(filename):
    "Read the filename and return X and y suitable for learning"
    t = pd.read_csv(filename)
    v = DictVectorizer(sparse=False)

    district = v.fit_transform(t[["PdDistrict"]].T.to_dict().values())
    day_of_week = v.fit_transform(t[["DayOfWeek"]].T.to_dict().values())
    min_of_day = t.apply(minute_of_day, axis='columns')
    day = t.apply(day_number, axis='columns')
    x = t[["X"]]
    y = t[["Y"]]
    
    category = None
    if 'Category' in t.columns:
        le = LabelEncoder()
        category = le.fit_transform(t[["Category"]].values.ravel())
        
    dataset = {
        'district': district,
        'day_of_week': day_of_week,
        'min_of_day': min_of_day,
        'day': day,
        'x': x,
        'y': y,
        'category': category
    }
    return dataset

In [17]:
train = preprocess("train.csv")

In [24]:
with open("train.pickle", "wb") as f:
    pickle.dump(train, f, protocol=pickle.HIGHEST_PROTOCOL)

# Model training

Now that we have the data in a convenient format, let's train the model.

First, let's load the data:

In [25]:
train = None
with open("train.pickle", "rb") as f:
    train = pickle.load(f)

Now let's choose the features we want and construct the X and y matrices.

In [37]:
X = np.column_stack((scale(train['day']), scale(train['min_of_day']), train['district'], train['day_of_week']))
y = train['category']



Let's take a subset of the data to play around with:

In [78]:
train_size = 100000
X, y = sklearn.utils.shuffle(X, y)
X_small = X[0:train_size,]
y_small = y[0:train_size,]

In [42]:
X[0:train_size,].shape

(30000, 19)

In [79]:
X_train, X_valid, y_train, y_valid = cross_validation.train_test_split(X_small, y_small, test_size=0.3, random_state=4)

lg = linear_model.LogisticRegression(multi_class='multinomial', solver='lbfgs')
model = lg.fit(X_train, y_train)

In [80]:
y_train_predict = model.predict_proba(X_train)
y_valid_predict = model.predict_proba(X_valid)

In [81]:
len(np.unique(y_valid)), y_valid_predict.shape[1]

(38, 38)

In [82]:
print('Training loss: {0:.5}'.format(metrics.log_loss(y_train, y_train_predict)))
print('Validation loss: {0:.5}'.format(metrics.log_loss(y_valid, y_valid_predict)))

Training loss: 2.5873
Validation loss: 2.5976


# Result submission

Let's use the model we trained to make predictions on the test data and prepare it for submission.

In [26]:
X_test, _ = preprocess("test.csv")

In [31]:
y_pred = model.predict_proba(X_test)

In [64]:
ids = np.arange(0, y_pred.shape[0], dtype='int32')
submission = pd.concat([pd.DataFrame(ids, dtype='int32', columns=['Id']),
                        pd.DataFrame(y_pred, columns=[
            'ARSON', 'ASSAULT', 'BAD CHECKS', 'BRIBERY', 'BURGLARY',
            'DISORDERLY CONDUCT', 'DRIVING UNDER THE INFLUENCE', 'DRUG/NARCOTIC',
            'DRUNKENNESS', 'EMBEZZLEMENT', 'EXTORTION', 'FAMILY OFFENSES',
            'FORGERY/COUNTERFEITING', 'FRAUD', 'GAMBLING', 'KIDNAPPING',
            'LARCENY/THEFT', 'LIQUOR LAWS', 'LOITERING', 'MISSING PERSON',
            'NON-CRIMINAL', 'OTHER OFFENSES', 'PORNOGRAPHY/OBSCENE MAT',
            'PROSTITUTION', 'RECOVERED VEHICLE', 'ROBBERY', 'RUNAWAY',
            'SECONDARY CODES', 'SEX OFFENSES FORCIBLE', 'SEX OFFENSES NON FORCIBLE',
            'STOLEN PROPERTY', 'SUICIDE', 'SUSPICIOUS OCC', 'TREA', 'TRESPASS',
            'VANDALISM', 'VEHICLE THEFT', 'WARRANTS', 'WEAPON LAWS'])], axis=1)

In [67]:
submission.to_csv("submission-1.csv", index=False)

dtype('float64')