In [6]:
import pandas as pd
import datetime
from dateutil import parser

from sklearn import linear_model
from sklearn.neighbors import KNeighborsClassifier
from sklearn import cross_validation
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import LabelEncoder, scale
from sklearn import metrics

import numpy as np

In [2]:
def minute_of_day(row):
    """Return the number of minutes since midnight for the row's ``Dates`` value.

    Parameters
    ----------
    row : mapping-like
        Anything supporting ``row["Dates"]`` (e.g. a DataFrame row). The value
        may be a date/time string, or an already-parsed ``datetime.datetime``
        (which includes ``pandas.Timestamp``).

    Returns
    -------
    int
        ``hour * 60 + minute``; seconds are ignored.
    """
    d = row["Dates"]
    # Only strings need parsing. Already-parsed timestamps (e.g. from
    # read_csv(parse_dates=...)) are used as-is, since dateutil's parser
    # rejects non-string input.
    if not isinstance(d, datetime.datetime):
        d = parser.parse(d)
    return d.hour*60 + d.minute

In [3]:
def preprocess(filename):
    """Read the CSV at ``filename`` and return ``(X, y)`` suitable for learning.

    Features are the one-hot encoding of the ``PdDistrict`` and ``DayOfWeek``
    columns (via ``DictVectorizer``) followed by one standardized
    minute-of-day column derived from ``Dates``.

    Parameters
    ----------
    filename : str
        Path handed to ``pd.read_csv``.

    Returns
    -------
    X : numpy.ndarray
        2-D feature matrix, one row per CSV row.
    y : numpy.ndarray or None
        Integer-encoded ``Category`` labels, or ``None`` when the file has no
        ``Category`` column.

    Notes
    -----
    The vectorizer, the scaling and the label encoder are all fitted afresh on
    every call, so outputs from two different files are only comparable when
    both files contain the same set of district/day values (and, for the
    time column, a similar distribution).
    """
    t = pd.read_csv(filename)
    v = DictVectorizer(sparse=False)
    # One dict per row, in row order. The previous `.T.to_dict().values()`
    # transposed the frame into one column per row (very costly on large
    # files) and relied on index uniqueness and dict ordering to keep rows
    # aligned with the other features and the labels.
    records = t[["PdDistrict", "DayOfWeek"]].to_dict(orient="records")
    X = v.fit_transform(records)
    min_of_day = scale(t.apply(minute_of_day, axis='columns'))
    X = np.column_stack((X, min_of_day))
    y = None
    if 'Category' in t.columns:
        le = LabelEncoder()
        y = le.fit_transform(t[["Category"]].values.ravel())
    return X, y

In [81]:
# Load the raw training table into `t` for interactive inspection.
# preprocess() does not use this frame; it reads its own copy from disk.
t = pd.read_csv("train.csv")

In [110]:
# Scratch check: does the standardized minute-of-day column stack onto the
# feature matrix without a shape mismatch?
# NOTE(review): `min_of_day` and `X` are not assigned by any earlier cell as
# shown here — this cell appears to rely on leftover kernel state and would
# likely raise NameError on Restart & Run All; confirm, or drop the cell.
scaled_min = scale(min_of_day)
X.shape, scaled_min.shape  # not the cell's last expression, so it is not displayed
np.column_stack((X, scaled_min)).shape



(878049, 20)

In [4]:
# Build the feature matrix and the integer-encoded labels from the training CSV.
X, y = preprocess("train.csv")



In [7]:
# Hold out 30% of the rows for validation; the fixed seed makes the split repeatable.
split = cross_validation.train_test_split(X, y, test_size=0.3, random_state=4)
X_train, X_valid, y_train, y_valid = split

# Disabled alternative: multinomial logistic regression.
#lg = linear_model.LogisticRegression(multi_class='multinomial', solver='lbfgs')
#model = lg.fit(X_train, y_train)

# Nearest-neighbours classifier with the library's default settings.
nn = KNeighborsClassifier()
model = nn.fit(X_train, y_train)

In [8]:
# Per-class probability estimates on both splits; these feed the log-loss
# report in the next cell.
y_train_predict = model.predict_proba(X_train)
y_valid_predict = model.predict_proba(X_valid)

In [115]:
# Report log loss on the training and validation splits, printed with
# 5 significant digits.
# NOTE(review): no `labels=` is passed, so log_loss has to infer the class set
# from y_true; if a rare category is absent from one split this may not line
# up with the columns of predict_proba — confirm.
print('Training loss: {0:.5}'.format(metrics.log_loss(y_train, y_train_predict)))
print('Validation loss: {0:.5}'.format(metrics.log_loss(y_valid, y_valid_predict)))

Training loss: 2.6036
Validation loss: 2.6057


In [26]:
# Featurize the test CSV. The second return value is discarded (preprocess
# returns y=None when the file has no 'Category' column).
# NOTE(review): preprocess() fits a fresh DictVectorizer and re-standardizes
# the time column on every call, so these columns match the training matrix
# only if both files contain the same district/day values — confirm.
X_test, _ = preprocess("test.csv")

In [31]:
# Class-probability predictions for every test row, from the model fitted on
# the training split.
y_pred = model.predict_proba(X_test)

In [64]:
# Probability column headers, listed alphabetically.
# NOTE(review): presumably this matches the sorted class order LabelEncoder
# assigned inside preprocess(), and hence the column order of y_pred — verify.
category_columns = [
    'ARSON', 'ASSAULT', 'BAD CHECKS', 'BRIBERY', 'BURGLARY',
    'DISORDERLY CONDUCT', 'DRIVING UNDER THE INFLUENCE', 'DRUG/NARCOTIC',
    'DRUNKENNESS', 'EMBEZZLEMENT', 'EXTORTION', 'FAMILY OFFENSES',
    'FORGERY/COUNTERFEITING', 'FRAUD', 'GAMBLING', 'KIDNAPPING',
    'LARCENY/THEFT', 'LIQUOR LAWS', 'LOITERING', 'MISSING PERSON',
    'NON-CRIMINAL', 'OTHER OFFENSES', 'PORNOGRAPHY/OBSCENE MAT',
    'PROSTITUTION', 'RECOVERED VEHICLE', 'ROBBERY', 'RUNAWAY',
    'SECONDARY CODES', 'SEX OFFENSES FORCIBLE', 'SEX OFFENSES NON FORCIBLE',
    'STOLEN PROPERTY', 'SUICIDE', 'SUSPICIOUS OCC', 'TREA', 'TRESPASS',
    'VANDALISM', 'VEHICLE THEFT', 'WARRANTS', 'WEAPON LAWS',
]

# Row ids are simply 0..n-1, one per prediction row.
ids = np.arange(0, y_pred.shape[0], dtype='int32')
id_frame = pd.DataFrame(ids, dtype='int32', columns=['Id'])
proba_frame = pd.DataFrame(y_pred, columns=category_columns)

# Side-by-side: the Id column first, then one probability column per category.
submission = pd.concat([id_frame, proba_frame], axis=1)

In [67]:
# Write the submission table to disk; index=False keeps the DataFrame's row
# index out of the CSV.
submission.to_csv("submission-1.csv", index=False)

dtype('float64')