In [171]:
%matplotlib inline

# General libraries.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
# GridSearchCV lives in sklearn.model_selection; the old sklearn.grid_search
# module was deprecated in 0.18 and removed in 0.20, so importing from it fails
# on current scikit-learn releases.
from sklearn.model_selection import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

In [150]:
# Load the training data; the first CSV column is parsed as datetimes.
df = pd.read_csv(
    'train.csv',
    parse_dates=[0],
)
# Preview the first five rows.
df.head(5)

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [151]:
# Remove the Descript and Resolution columns from the working frame.
# errors='ignore' keeps this cell re-runnable: `df` is rebound here, so a second
# execution would otherwise raise KeyError because the columns are already gone.
df = df.drop(['Descript', 'Resolution'], axis=1, errors='ignore')

In [152]:
# Confirm that the Dates column was parsed as a datetime dtype.
df['Dates'].dtype

dtype('<M8[ns]')

In [153]:
# Summary statistics for the target column, followed by per-class frequencies.
print(df['Category'].describe(), '\n')
print(df['Category'].value_counts())

count            878049
unique               39
top       LARCENY/THEFT
freq             174900
Name: Category, dtype: object 

LARCENY/THEFT                  174900
OTHER OFFENSES                 126182
NON-CRIMINAL                    92304
ASSAULT                         76876
DRUG/NARCOTIC                   53971
VEHICLE THEFT                   53781
VANDALISM                       44725
WARRANTS                        42214
BURGLARY                        36755
SUSPICIOUS OCC                  31414
MISSING PERSON                  25989
ROBBERY                         23000
FRAUD                           16679
FORGERY/COUNTERFEITING          10609
SECONDARY CODES                  9985
WEAPON LAWS                      8555
PROSTITUTION                     7484
TRESPASS                         7326
STOLEN PROPERTY                  4540
SEX OFFENSES FORCIBLE            4388
DISORDERLY CONDUCT               4320
DRUNKENNESS                      4280
RECOVERED VEHICLE                313

In [154]:
# Number of records per police district, most frequent first.
df['PdDistrict'].value_counts()

SOUTHERN      157182
MISSION       119908
NORTHERN      105296
BAYVIEW        89431
CENTRAL        85460
TENDERLOIN     81809
INGLESIDE      78845
TARAVAL        65596
PARK           49313
RICHMOND       45209
Name: PdDistrict, dtype: int64

In [155]:
# Count of non-null PdDistrict values (compare with the total row count to spot gaps).
df['PdDistrict'].count()

878049

In [156]:
# Number of records per address string, most frequent first.
df['Address'].value_counts()

800 Block of BRYANT ST            26533
800 Block of MARKET ST             6581
2000 Block of MISSION ST           5097
1000 Block of POTRERO AV           4063
900 Block of MARKET ST             3251
0 Block of TURK ST                 3228
0 Block of 6TH ST                  2884
300 Block of ELLIS ST              2703
400 Block of ELLIS ST              2590
16TH ST / MISSION ST               2504
1000 Block of MARKET ST            2489
1100 Block of MARKET ST            2319
2000 Block of MARKET ST            2168
100 Block of OFARRELL ST           2140
700 Block of MARKET ST             2081
3200 Block of 20TH AV              2035
100 Block of 6TH ST                1887
500 Block of JOHNFKENNEDY DR       1824
TURK ST / TAYLOR ST                1810
200 Block of TURK ST               1800
0 Block of PHELAN AV               1791
0 Block of UNITEDNATIONS PZ        1789
0 Block of POWELL ST               1717
100 Block of EDDY ST               1681
1400 Block of PHELPS ST            1629


In [157]:
# Restrict to the single most frequent address and inspect its category mix.
dfnew = df.loc[df['Address'].eq('800 Block of BRYANT ST')]
dfnew['Category'].value_counts()

LARCENY/THEFT                  6144
NON-CRIMINAL                   5583
OTHER OFFENSES                 3019
ASSAULT                        1926
WARRANTS                       1719
VANDALISM                      1155
SUSPICIOUS OCC                 1153
FRAUD                           942
DRUG/NARCOTIC                   787
MISSING PERSON                  777
FORGERY/COUNTERFEITING          614
ROBBERY                         591
BURGLARY                        384
SECONDARY CODES                 312
SEX OFFENSES FORCIBLE           296
WEAPON LAWS                     156
TRESPASS                        150
VEHICLE THEFT                   128
DRUNKENNESS                     100
KIDNAPPING                       97
STOLEN PROPERTY                  93
DISORDERLY CONDUCT               76
PROSTITUTION                     55
DRIVING UNDER THE INFLUENCE      41
ARSON                            41
RUNAWAY                          36
LIQUOR LAWS                      29
EMBEZZLEMENT                

In [195]:
# Distribution of the X coordinate.
# NOTE(review): the reported max (-120.5) sits far outside the quartile range;
# presumably placeholder coordinates - confirm before using X as a feature.
df['X'].describe()

count    878049.000000
mean       -122.422616
std           0.030354
min        -122.513642
25%        -122.432952
50%        -122.416420
75%        -122.406959
max        -120.500000
Name: X, dtype: float64

In [202]:
# Day of week as an integer (Monday=0 ... Sunday=6), derived from the timestamp.
df['DOW'] = df['Dates'].dt.dayofweek

In [203]:
# Day of month taken from the timestamp.
df['DOM'] = df['Dates'].dt.day

In [204]:
# Calendar month (1-12) taken from the timestamp.
df['MONTH'] = df['Dates'].dt.month

In [205]:
# Hour of day (0-23) taken from the timestamp.
df['Hour'] = df['Dates'].dt.hour

In [206]:
# Preview the frame with the newly derived date-part columns.
df.head(5)

Unnamed: 0,Dates,Category,DayOfWeek,PdDistrict,Address,X,Y,DOW,DOM,MONTH,Hour
0,2015-05-13 23:53:00,WARRANTS,Wednesday,NORTHERN,OAK ST / LAGUNA ST,-122.425892,37.774599,2,13,5,23
1,2015-05-13 23:53:00,OTHER OFFENSES,Wednesday,NORTHERN,OAK ST / LAGUNA ST,-122.425892,37.774599,2,13,5,23
2,2015-05-13 23:33:00,OTHER OFFENSES,Wednesday,NORTHERN,VANNESS AV / GREENWICH ST,-122.424363,37.800414,2,13,5,23
3,2015-05-13 23:30:00,LARCENY/THEFT,Wednesday,NORTHERN,1500 Block of LOMBARD ST,-122.426995,37.800873,2,13,5,23
4,2015-05-13 23:30:00,LARCENY/THEFT,Wednesday,PARK,100 Block of BRODERICK ST,-122.438738,37.771541,2,13,5,23


In [208]:
# Narrow the frame to the district and date-part columns.
df_abbrev_train = df.loc[:, ['PdDistrict', 'DOM', 'DOW', 'MONTH', 'Hour']]

In [209]:
# Target labels: the Category column.
df_label = df['Category']

In [210]:
# Preview the reduced feature frame.
df_abbrev_train.head(5)

Unnamed: 0,PdDistrict,DOM,DOW,MONTH,Hour
0,NORTHERN,13,2,5,23
1,NORTHERN,13,2,5,23
2,NORTHERN,13,2,5,23
3,NORTHERN,13,2,5,23
4,PARK,13,2,5,23


In [211]:
# One-hot encode every feature separately so each level becomes its own
# indicator column; the prefixes keep the integer-valued date parts apart.
df_PdD = pd.get_dummies(df['PdDistrict'])
df_DOM = pd.get_dummies(df['DOM'], prefix='DofM')
df_DOW = pd.get_dummies(df['DOW'], prefix='DofW')
df_MONTH = pd.get_dummies(df['MONTH'], prefix='Month')
df_Hour = pd.get_dummies(df['Hour'], prefix='Hour')

In [212]:
# Assemble the model's feature matrix by placing the one-hot blocks side by side
# (district, day of month, day of week, month, hour - same column order as before).
# A single concat avoids re-copying the growing frame once per block, and the
# former `newdf.head()` line is gone because its result was never displayed
# (only a cell's last expression is rendered).
newdf = pd.concat([df_PdD, df_DOM, df_DOW, df_MONTH, df_Hour], axis=1)
newdf.shape

(878049, 84)

In [213]:
# Hold out 20% of the rows as a dev set; the fixed seed keeps the split reproducible.
X_train, X_dev, y_train, y_dev = train_test_split(
    newdf,
    df_label,
    test_size=0.2,
    random_state=42,
)

In [214]:
# Sanity-check the feature split sizes (train first, then dev), one per line.
print(X_train.shape, X_dev.shape, sep='\n')

(702439, 84)
(175610, 84)


In [215]:
# Sanity-check the label split sizes (train first, then dev), one per line.
print(y_train.shape, y_dev.shape, sep='\n')

(702439,)
(175610,)


In [219]:
# Baseline classifier: logistic regression with default hyper-parameters.
# fit() returns the estimator itself, so construction and training can be chained.
lr = LogisticRegression().fit(X_train, y_train)
# Display the predicted categories for the dev set.
lr.predict(X_dev)

array(['DRUG/NARCOTIC', 'LARCENY/THEFT', 'LARCENY/THEFT', ...,
       'LARCENY/THEFT', 'OTHER OFFENSES', 'DRUG/NARCOTIC'], dtype=object)

In [220]:
# Spot-check the first five dev predictions against the true labels.
# Only the first five rows are passed to predict() instead of scoring the whole
# dev set and discarding all but five results; .iloc makes the positional
# slicing explicit for both the features and the labels.
print(lr.predict(X_dev.iloc[:5]), y_dev.iloc[:5])


['DRUG/NARCOTIC' 'LARCENY/THEFT' 'LARCENY/THEFT' 'LARCENY/THEFT'
 'LARCENY/THEFT'] 349598     LARCENY/THEFT
766313           ASSAULT
169887    OTHER OFFENSES
594704       WEAPON LAWS
47900      LARCENY/THEFT
Name: Category, dtype: object


In [221]:
# Dev-set accuracy of the logistic regression model; for classifiers,
# score() is the mean accuracy of predict(X) against the given labels.
print(lr.score(X_dev, y_dev))

0.22399066112408178


In [216]:
# Fit a multinomial Naive Bayes model (additive smoothing alpha=2) on the
# training split; fit() returns the estimator, so the calls are chained.
nb1 = MultinomialNB(alpha=2).fit(X_train, y_train)
# Show the fitted estimator's configuration.
nb1

MultinomialNB(alpha=2, class_prior=None, fit_prior=True)

In [217]:
# Predicted category for every dev-set row from the Naive Bayes model;
# kept in y_pred so later cells can reuse it without predicting again.
y_pred = nb1.predict(X_dev)

In [218]:
# Spot-check the first five Naive Bayes predictions against the true labels,
# then report dev-set accuracy.
# y_pred (already used for the accuracy below) is reused instead of calling
# predict() on the full dev set again, and the former predict_proba() call is
# removed: its result was neither displayed nor stored, so it only cost time.
print(y_pred[:5], y_dev.iloc[:5])

print(metrics.accuracy_score(y_dev, y_pred))

['DRUG/NARCOTIC' 'LARCENY/THEFT' 'LARCENY/THEFT' 'LARCENY/THEFT'
 'LARCENY/THEFT'] 349598     LARCENY/THEFT
766313           ASSAULT
169887    OTHER OFFENSES
594704       WEAPON LAWS
47900      LARCENY/THEFT
Name: Category, dtype: object
0.22430954957007004
