In [74]:
%matplotlib inline

# General libraries.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn import metrics


In [75]:
# Read the training CSV (first column parsed as datetimes) and drop the
# 'Descript' and 'Resolution' columns in one chained expression.
df = (
    pd.read_csv('train.csv', parse_dates=[0])
      .drop(['Descript', 'Resolution'], axis=1)
)

In [76]:
# Print the frame's dimensions, its first rows and its column dtypes,
# separated by blank lines.
print(df.shape, df.head(), df.dtypes, sep=' \n\n')

(878049, 7) 

                Dates        Category  DayOfWeek PdDistrict  \
0 2015-05-13 23:53:00        WARRANTS  Wednesday   NORTHERN   
1 2015-05-13 23:53:00  OTHER OFFENSES  Wednesday   NORTHERN   
2 2015-05-13 23:33:00  OTHER OFFENSES  Wednesday   NORTHERN   
3 2015-05-13 23:30:00   LARCENY/THEFT  Wednesday   NORTHERN   
4 2015-05-13 23:30:00   LARCENY/THEFT  Wednesday       PARK   

                     Address           X          Y  
0         OAK ST / LAGUNA ST -122.425892  37.774599  
1         OAK ST / LAGUNA ST -122.425892  37.774599  
2  VANNESS AV / GREENWICH ST -122.424363  37.800414  
3   1500 Block of LOMBARD ST -122.426995  37.800873  
4  100 Block of BRODERICK ST -122.438738  37.771541   

Dates         datetime64[ns]
Category              object
DayOfWeek             object
PdDistrict            object
Address               object
X                    float64
Y                    float64
dtype: object


In [77]:
# Summary statistics of the target column followed by its per-class counts.
print(df['Category'].describe(), df['Category'].value_counts(), sep=' \n\n')

count            878049
unique               39
top       LARCENY/THEFT
freq             174900
Name: Category, dtype: object 

LARCENY/THEFT                  174900
OTHER OFFENSES                 126182
NON-CRIMINAL                    92304
ASSAULT                         76876
DRUG/NARCOTIC                   53971
VEHICLE THEFT                   53781
VANDALISM                       44725
WARRANTS                        42214
BURGLARY                        36755
SUSPICIOUS OCC                  31414
MISSING PERSON                  25989
ROBBERY                         23000
FRAUD                           16679
FORGERY/COUNTERFEITING          10609
SECONDARY CODES                  9985
WEAPON LAWS                      8555
PROSTITUTION                     7484
TRESPASS                         7326
STOLEN PROPERTY                  4540
SEX OFFENSES FORCIBLE            4388
DISORDERLY CONDUCT               4320
DRUNKENNESS                      4280
RECOVERED VEHICLE                313

In [78]:
# Number of rows per police district.
df['PdDistrict'].value_counts()

SOUTHERN      157182
MISSION       119908
NORTHERN      105296
BAYVIEW        89431
CENTRAL        85460
TENDERLOIN     81809
INGLESIDE      78845
TARAVAL        65596
PARK           49313
RICHMOND       45209
Name: PdDistrict, dtype: int64

In [79]:
# Count / unique / top / freq summary of the district column.
df['PdDistrict'].describe()

count       878049
unique          10
top       SOUTHERN
freq        157182
Name: PdDistrict, dtype: object

In [80]:
# Count / unique / top / freq summary of the address column.
df['Address'].describe()

count                     878049
unique                     23228
top       800 Block of BRYANT ST
freq                       26533
Name: Address, dtype: object

In [81]:
# Category breakdown for the single most frequent address.
dfnew = df.loc[df['Address'] == '800 Block of BRYANT ST']
dfnew['Category'].value_counts()

LARCENY/THEFT                  6144
NON-CRIMINAL                   5583
OTHER OFFENSES                 3019
ASSAULT                        1926
WARRANTS                       1719
VANDALISM                      1155
SUSPICIOUS OCC                 1153
FRAUD                           942
DRUG/NARCOTIC                   787
MISSING PERSON                  777
FORGERY/COUNTERFEITING          614
ROBBERY                         591
BURGLARY                        384
SECONDARY CODES                 312
SEX OFFENSES FORCIBLE           296
WEAPON LAWS                     156
TRESPASS                        150
VEHICLE THEFT                   128
DRUNKENNESS                     100
KIDNAPPING                       97
STOLEN PROPERTY                  93
DISORDERLY CONDUCT               76
PROSTITUTION                     55
DRIVING UNDER THE INFLUENCE      41
ARSON                            41
RUNAWAY                          36
LIQUOR LAWS                      29
EMBEZZLEMENT                

In [82]:
# Distribution of the X coordinate.
df['X'].describe()

count    878049.000000
mean       -122.422616
std           0.030354
min        -122.513642
25%        -122.432952
50%        -122.416420
75%        -122.406959
max        -120.500000
Name: X, dtype: float64

In [83]:
# Distribution of the Y coordinate.
df['Y'].describe()

count    878049.000000
mean         37.771020
std           0.456893
min          37.707879
25%          37.752427
50%          37.775421
75%          37.784369
max          90.000000
Name: Y, dtype: float64

In [84]:
# Remove rows with out-of-range X,Y values (the describe() outputs above show
# outliers at X = -120.5 and Y = 90.0).
#
# Both conditions are evaluated on `df` and combined into one mask, so the
# mask always has the same index as the frame it filters (previously the Y
# mask built from `df` was applied to the already-filtered `df2`).
# `.copy()` makes `df2` an independent frame, so the column assignments in the
# following cells do not operate on a slice of `df`.
df2 = df[(df.X < -121) & (df.Y < 80)].copy()
df2.shape



(877982, 7)

In [85]:
# Re-check the coordinate distributions after filtering: X is printed,
# Y is shown as the cell's result.
print(df2['X'].describe())
df2['Y'].describe()

count    877982.000000
mean       -122.422763
std           0.025285
min        -122.513642
25%        -122.432952
50%        -122.416420
75%        -122.406959
max        -122.364937
Name: X, dtype: float64


count    877982.000000
mean         37.767035
std           0.024165
min          37.707879
25%          37.752427
50%          37.775421
75%          37.784368
max          37.819975
Name: Y, dtype: float64

In [86]:
# Day of week as an integer (Monday = 0).
df2['DOW'] = df2['Dates'].dt.dayofweek

In [87]:
# Day of the month taken from the timestamp.
df2['DOM'] = df2['Dates'].dt.day

In [88]:
# Calendar month taken from the timestamp.
df2['MONTH'] = df2['Dates'].dt.month

In [89]:
# Hour of day taken from the timestamp.
df2['HOUR'] = df2['Dates'].dt.hour

In [90]:
# Calendar year taken from the timestamp.
df2['YEAR'] = df2['Dates'].dt.year

In [91]:
# Preview the frame with the new date-part columns.
df2.head(n=5)

Unnamed: 0,Dates,Category,DayOfWeek,PdDistrict,Address,X,Y,DOW,DOM,MONTH,HOUR,YEAR
0,2015-05-13 23:53:00,WARRANTS,Wednesday,NORTHERN,OAK ST / LAGUNA ST,-122.425892,37.774599,2,13,5,23,2015
1,2015-05-13 23:53:00,OTHER OFFENSES,Wednesday,NORTHERN,OAK ST / LAGUNA ST,-122.425892,37.774599,2,13,5,23,2015
2,2015-05-13 23:33:00,OTHER OFFENSES,Wednesday,NORTHERN,VANNESS AV / GREENWICH ST,-122.424363,37.800414,2,13,5,23,2015
3,2015-05-13 23:30:00,LARCENY/THEFT,Wednesday,NORTHERN,1500 Block of LOMBARD ST,-122.426995,37.800873,2,13,5,23,2015
4,2015-05-13 23:30:00,LARCENY/THEFT,Wednesday,PARK,100 Block of BRODERICK ST,-122.438738,37.771541,2,13,5,23,2015


In [92]:
# Keep only the district and the derived date-part columns.
df_abbrev_train = df2.loc[:, ['PdDistrict', 'DOM', 'DOW', 'MONTH', 'HOUR', 'YEAR']]

In [93]:
# Target labels: the crime category of each row.
df_label = df2['Category']

In [94]:
# Preview the reduced feature frame.
df_abbrev_train.head(n=5)

Unnamed: 0,PdDistrict,DOM,DOW,MONTH,HOUR,YEAR
0,NORTHERN,13,2,5,23,2015
1,NORTHERN,13,2,5,23,2015
2,NORTHERN,13,2,5,23,2015
3,NORTHERN,13,2,5,23,2015
4,PARK,13,2,5,23,2015


In [95]:
# One-hot encode the district and each date part; the date parts get a
# prefix so their integer values become distinct column names.
df_PdD = pd.get_dummies(df2['PdDistrict'])
df_DOM = pd.get_dummies(df2['DOM'], prefix='DofM')
df_DOW = pd.get_dummies(df2['DOW'], prefix='DofW')
df_MONTH = pd.get_dummies(df2['MONTH'], prefix='Month')
df_HOUR = pd.get_dummies(df2['HOUR'], prefix='Hour')

In [96]:
# Assemble the feature matrix by placing the one-hot blocks side by side.
# A single concat produces the same columns in the same order as chaining
# pairwise concats, without copying the growing frame at every step.
newdf = pd.concat([df_PdD, df_DOM, df_DOW, df_MONTH, df_HOUR], axis=1)
newdf.shape

(877982, 84)

In [97]:
# Hold out 30% of the rows as a dev set; the fixed seed keeps the split
# repeatable.
X_train, X_dev, y_train, y_dev = train_test_split(
    newdf,
    df_label,
    test_size=0.3,
    random_state=42,
)

In [98]:
# Shapes of the train and dev feature matrices, one per line.
print(X_train.shape, X_dev.shape, sep='\n')

(614587, 84)
(263395, 84)


In [99]:
# Shapes of the train and dev label vectors, one per line.
print(y_train.shape, y_dev.shape, sep='\n')

(614587,)
(263395,)


In [70]:
# Logistic-regression baseline on the one-hot features, scored on the dev set.
# The dev set is predicted once, inside the report call; the earlier stand-alone
# predict() whose result was discarded has been removed.
lr = LogisticRegression()
lr.fit(X_train, y_train)
print(metrics.classification_report(y_dev, lr.predict(X_dev)))

                             precision    recall  f1-score   support

                      ARSON       0.00      0.00      0.00       477
                    ASSAULT       0.16      0.03      0.05     22944
                 BAD CHECKS       0.00      0.00      0.00       124
                    BRIBERY       0.00      0.00      0.00        92
                   BURGLARY       0.16      0.00      0.00     11080
         DISORDERLY CONDUCT       0.00      0.00      0.00      1253
DRIVING UNDER THE INFLUENCE       0.00      0.00      0.00       661
              DRUG/NARCOTIC       0.23      0.29      0.26     16297
                DRUNKENNESS       0.00      0.00      0.00      1288
               EMBEZZLEMENT       0.00      0.00      0.00       339
                  EXTORTION       0.00      0.00      0.00        81
            FAMILY OFFENSES       0.00      0.00      0.00       152
     FORGERY/COUNTERFEITING       0.00      0.00      0.00      3120
                      FRAUD      

  'precision', 'predicted', average, warn_for)


In [60]:
# Train a multinomial naive Bayes classifier on the one-hot features.
# Fit and predict are each wrapped in %time so their wall time is reported;
# the dev-set predictions are then summarised per class.
nb1 = MultinomialNB()
%time nb1.fit(X_train, y_train)
%time y_pred = nb1.predict(X_dev)
print (metrics.classification_report(y_dev, y_pred))

Wall time: 5.76 s
Wall time: 156 ms
                             precision    recall  f1-score   support

                      ARSON       0.00      0.00      0.00       326
                    ASSAULT       0.16      0.03      0.06     15272
                 BAD CHECKS       0.00      0.00      0.00        87
                    BRIBERY       0.00      0.00      0.00        64
                   BURGLARY       0.18      0.00      0.00      7388
         DISORDERLY CONDUCT       0.00      0.00      0.00       853
DRIVING UNDER THE INFLUENCE       0.00      0.00      0.00       409
              DRUG/NARCOTIC       0.24      0.28      0.26     10938
                DRUNKENNESS       0.00      0.00      0.00       847
               EMBEZZLEMENT       0.00      0.00      0.00       223
                  EXTORTION       0.00      0.00      0.00        50
            FAMILY OFFENSES       0.00      0.00      0.00        93
     FORGERY/COUNTERFEITING       0.00      0.00      0.00      20

  'precision', 'predicted', average, warn_for)


In [63]:
# bg1 = BaggingClassifier(n_estimators = 20)
# %time bg1.fit(X_train, y_train)
# %time y_pred = bg1.predict(X_dev)
# print (metrics.classification_report(y_dev, y_pred))

In [65]:
rf1 = RandomForestClassifier()
%time rf1.fit(X_train, y_train)
%time y_pred = rf1.predict(X_dev)
print (metrics.classification_report(y_dev, y_pred))
print (metrics.log_loss(y_dev, y_pred))

Wall time: 31.8 s
Wall time: 1.53 s
                             precision    recall  f1-score   support

                      ARSON       0.04      0.02      0.03       326
                    ASSAULT       0.14      0.15      0.14     15272
                 BAD CHECKS       0.00      0.00      0.00        87
                    BRIBERY       0.00      0.00      0.00        64
                   BURGLARY       0.07      0.06      0.06      7388
         DISORDERLY CONDUCT       0.02      0.01      0.02       853
DRIVING UNDER THE INFLUENCE       0.02      0.01      0.02       409
              DRUG/NARCOTIC       0.24      0.27      0.25     10938
                DRUNKENNESS       0.01      0.01      0.01       847
               EMBEZZLEMENT       0.02      0.01      0.01       223
                  EXTORTION       0.00      0.00      0.00        50
            FAMILY OFFENSES       0.03      0.02      0.03        93
     FORGERY/COUNTERFEITING       0.06      0.04      0.05      20

  'precision', 'predicted', average, warn_for)


In [51]:
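# NOTE: rf2 is fit on the full data set (newdf, df_label), which contains the
# X_dev rows, so the report retained below is scored on data the model was
# trained on and overstates its performance.
# rf2 = RandomForestClassifier()
# %time rf2.fit(newdf, df_label)
# y_pred = rf2.predict(X_dev)
# print (metrics.classification_report(y_dev, y_pred))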

Wall time: 30.5 s
                             precision    recall  f1-score   support

                      ARSON       0.59      0.41      0.48       321
                    ASSAULT       0.44      0.44      0.44     15364
                 BAD CHECKS       0.38      0.18      0.25        72
                    BRIBERY       0.37      0.12      0.18        58
                   BURGLARY       0.50      0.36      0.41      7389
         DISORDERLY CONDUCT       0.36      0.23      0.28       828
DRIVING UNDER THE INFLUENCE       0.48      0.27      0.35       459
              DRUG/NARCOTIC       0.44      0.53      0.49     10723
                DRUNKENNESS       0.35      0.18      0.24       857
               EMBEZZLEMENT       0.32      0.18      0.23       222
                  EXTORTION       0.27      0.16      0.20        51
            FAMILY OFFENSES       0.44      0.27      0.34        92
     FORGERY/COUNTERFEITING       0.39      0.26      0.31      2092
               

  'precision', 'predicted', average, warn_for)


## TRY SPLITTING ADDRESS BY GAUSSIAN MIXTURE

In [100]:
# Reminder of the current columns before building address features.
df2.head(n=5)

Unnamed: 0,Dates,Category,DayOfWeek,PdDistrict,Address,X,Y,DOW,DOM,MONTH,HOUR,YEAR
0,2015-05-13 23:53:00,WARRANTS,Wednesday,NORTHERN,OAK ST / LAGUNA ST,-122.425892,37.774599,2,13,5,23,2015
1,2015-05-13 23:53:00,OTHER OFFENSES,Wednesday,NORTHERN,OAK ST / LAGUNA ST,-122.425892,37.774599,2,13,5,23,2015
2,2015-05-13 23:33:00,OTHER OFFENSES,Wednesday,NORTHERN,VANNESS AV / GREENWICH ST,-122.424363,37.800414,2,13,5,23,2015
3,2015-05-13 23:30:00,LARCENY/THEFT,Wednesday,NORTHERN,1500 Block of LOMBARD ST,-122.426995,37.800873,2,13,5,23,2015
4,2015-05-13 23:30:00,LARCENY/THEFT,Wednesday,PARK,100 Block of BRODERICK ST,-122.438738,37.771541,2,13,5,23,2015


In [101]:
# Build an address-by-category count table: one row per address, one column
# per crime category, zero where an address has no incidents of a category.
# The grouping key is taken from df2 itself (not the unfiltered df), so the
# values and the key come from the same frame and no index alignment is needed.
#
# NOTE(review): the counts use Category for every row of df2, including rows
# that a later train/dev split assigns to the dev set, so features derived
# from this table carry label information into the evaluation.
gmm2_df = (
    df2.Category
       .groupby(df2.Address)
       .value_counts()
       .unstack(level=0)
       .fillna(value=0)
       .T
)

gmm2_df.shape

(23191, 39)

In [102]:
# One (Address, X, Y) row per distinct address, ordered alphabetically.
addrxy = (
    df2[['Address', 'X', 'Y']]
    .drop_duplicates(subset='Address')
    .sort_values(['Address'])
)
addrxy.tail(10)

Unnamed: 0,Address,X,Y
150190,YOSEMITE AV / LANE ST,-122.393531,37.727914
66568,YOSEMITE AV / MENDELL ST,-122.39544,37.728999
67224,YUKON ST / 19TH ST,-122.442177,37.75894
215794,YUKON ST / EAGLE ST,-122.442238,37.758104
151910,ZAMPA LN / GEARY BL,-122.430876,37.784609
685364,ZENO PL / FOLSOM ST,-122.393307,37.788226
284,ZOE ST / BRANNAN ST,-122.395309,37.779297
19639,ZOE ST / BRYANT ST,-122.396849,37.780528
21579,ZOE ST / FREELON ST,-122.395839,37.77972
102726,ZOE ST / WELSH ST,-122.396308,37.780097


In [103]:
from sklearn.mixture import GaussianMixture

# Cluster addresses by their category-count profile.
# Only the count columns are used for fitting: the two columns this cell adds
# below are excluded, so the cell can be re-run without feeding the string
# 'Address' column (or a previous 'Addr_Group') back into the model.
gmm_features = gmm2_df.drop(['Addr_Group', 'Address'], axis=1, errors='ignore')

# A fixed seed makes the component assignment repeatable between runs.
gm1 = GaussianMixture(n_components=12, n_init=5, random_state=42)

newgmm = gm1.fit(gmm_features).predict(gmm_features)
gmm2_df['Addr_Group'] = newgmm
# Expose the index (the address) as a regular column for later lookups.
gmm2_df['Address'] = gmm2_df.index

In [107]:
# Address -> group table, then two sanity checks: the groups assigned to the
# first ten LOMBARD addresses, and the size of every group.
newgroup = gmm2_df.loc[:, ['Address', 'Addr_Group']]

print(
    newgroup[newgroup['Address'].str.contains('LOMBARD')].head(10),
    newgroup['Addr_Group'].value_counts(),
    sep=' \n\n',
)

Category                                   Address  Addr_Group
Address                                                       
0 Block of LOMBARD ST        0 Block of LOMBARD ST          11
100 Block of LOMBARD ST    100 Block of LOMBARD ST           4
1000 Block of LOMBARD ST  1000 Block of LOMBARD ST          11
1100 Block of LOMBARD ST  1100 Block of LOMBARD ST           4
1200 Block of LOMBARD ST  1200 Block of LOMBARD ST          11
1300 Block of LOMBARD ST  1300 Block of LOMBARD ST          11
1400 Block of LOMBARD ST  1400 Block of LOMBARD ST           6
1500 Block of LOMBARD ST  1500 Block of LOMBARD ST           6
1600 Block of LOMBARD ST  1600 Block of LOMBARD ST           4
1700 Block of LOMBARD ST  1700 Block of LOMBARD ST           6 

11    18367
6      2013
4      1979
10      458
0       347
3        12
5        10
9         1
8         1
7         1
2         1
1         1
Name: Addr_Group, dtype: int64


In [108]:
# Lookup Series keyed by address, used to attach each row's address group.
mapper = newgroup.set_index('Address').Addr_Group
df2['Addr_Group'] = df2.Address.map(mapper)

In [109]:
# Confirm the Addr_Group column was attached.
df2.head(n=5)

Unnamed: 0,Dates,Category,DayOfWeek,PdDistrict,Address,X,Y,DOW,DOM,MONTH,HOUR,YEAR,Addr_Group
0,2015-05-13 23:53:00,WARRANTS,Wednesday,NORTHERN,OAK ST / LAGUNA ST,-122.425892,37.774599,2,13,5,23,2015,11
1,2015-05-13 23:53:00,OTHER OFFENSES,Wednesday,NORTHERN,OAK ST / LAGUNA ST,-122.425892,37.774599,2,13,5,23,2015,11
2,2015-05-13 23:33:00,OTHER OFFENSES,Wednesday,NORTHERN,VANNESS AV / GREENWICH ST,-122.424363,37.800414,2,13,5,23,2015,11
3,2015-05-13 23:30:00,LARCENY/THEFT,Wednesday,NORTHERN,1500 Block of LOMBARD ST,-122.426995,37.800873,2,13,5,23,2015,6
4,2015-05-13 23:30:00,LARCENY/THEFT,Wednesday,PARK,100 Block of BRODERICK ST,-122.438738,37.771541,2,13,5,23,2015,4


In [110]:
# One-hot encode every categorical feature, now including the year and the
# address group.
df_PdD = pd.get_dummies(df2.PdDistrict)
df_DOM = pd.get_dummies(df2.DOM, prefix = 'DofM')
df_DOW = pd.get_dummies(df2.DOW, prefix = 'DofW')
df_MONTH = pd.get_dummies(df2.MONTH, prefix = 'Month')
df_HOUR = pd.get_dummies(df2.HOUR, prefix = 'Hour')
# The year columns get their own 'Year' prefix (they were previously labelled
# 'Hour', the same prefix as the hour-of-day columns).
df_YEAR = pd.get_dummies(df2.YEAR, prefix = 'Year')
df_Addr_Gr = pd.get_dummies(df2.Addr_Group, prefix = 'Addr')

# Place all encoded blocks side by side in a single concat; the column order
# matches the order of the list.
new = pd.concat(
    [df_PdD, df_DOM, df_DOW, df_MONTH, df_HOUR, df_YEAR, df_Addr_Gr],
    axis = 1,
)

new.head()

Unnamed: 0,BAYVIEW,CENTRAL,INGLESIDE,MISSION,NORTHERN,PARK,RICHMOND,SOUTHERN,TARAVAL,TENDERLOIN,...,Addr_2,Addr_3,Addr_4,Addr_5,Addr_6,Addr_7,Addr_8,Addr_9,Addr_10,Addr_11
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [111]:
# Same 70/30 split and seed as before, applied to the extended feature matrix.
X_train, X_dev, y_train, y_dev = train_test_split(
    new,
    df2.Category,
    test_size=0.3,
    random_state=42,
)

In [112]:
rf1 = RandomForestClassifier()
%time rf1.fit(X_train, y_train)
%time y_pred = rf1.predict(X_dev)
print (metrics.classification_report(y_dev, y_pred))
# print (metrics.log_loss(y_dev, y_pred))

Wall time: 36.5 s
Wall time: 2.63 s
                             precision    recall  f1-score   support

                      ARSON       0.05      0.03      0.03       477
                    ASSAULT       0.17      0.19      0.18     22944
                 BAD CHECKS       0.00      0.00      0.00       124
                    BRIBERY       0.00      0.00      0.00        92
                   BURGLARY       0.09      0.07      0.08     11080
         DISORDERLY CONDUCT       0.03      0.03      0.03      1253
DRIVING UNDER THE INFLUENCE       0.03      0.02      0.02       661
              DRUG/NARCOTIC       0.31      0.36      0.33     16297
                DRUNKENNESS       0.01      0.01      0.01      1288
               EMBEZZLEMENT       0.00      0.00      0.00       339
                  EXTORTION       0.00      0.00      0.00        81
            FAMILY OFFENSES       0.06      0.04      0.05       152
     FORGERY/COUNTERFEITING       0.10      0.08      0.09      31

  'precision', 'predicted', average, warn_for)


In [113]:
# Logistic regression on the current train/dev split, scored on the dev set.
# The dev set is predicted once, inside the report call; the earlier stand-alone
# predict() whose result was discarded has been removed.
lr = LogisticRegression()
lr.fit(X_train, y_train)
print(metrics.classification_report(y_dev, lr.predict(X_dev)))

                             precision    recall  f1-score   support

                      ARSON       0.00      0.00      0.00       477
                    ASSAULT       0.18      0.06      0.09     22944
                 BAD CHECKS       0.00      0.00      0.00       124
                    BRIBERY       0.00      0.00      0.00        92
                   BURGLARY       0.18      0.00      0.01     11080
         DISORDERLY CONDUCT       0.00      0.00      0.00      1253
DRIVING UNDER THE INFLUENCE       0.00      0.00      0.00       661
              DRUG/NARCOTIC       0.28      0.34      0.31     16297
                DRUNKENNESS       0.00      0.00      0.00      1288
               EMBEZZLEMENT       0.00      0.00      0.00       339
                  EXTORTION       0.00      0.00      0.00        81
            FAMILY OFFENSES       0.00      0.00      0.00       152
     FORGERY/COUNTERFEITING       0.20      0.00      0.00      3120
                      FRAUD      

  'precision', 'predicted', average, warn_for)


## CREATE OUTPUT FOR KAGGLE

In [115]:
# def process_data(df):
#     df['DOW'] = df.Dates.dt.weekday
#     df['DOM'] = df.Dates.dt.day
#     df['MONTH'] = df.Dates.dt.month
#     df['HOUR'] = df.Dates.dt.hour
    
#     df_PdD = pd.get_dummies(df.PdDistrict)
#     df_DOM = pd.get_dummies(df.DOM, prefix = 'DofM')
#     df_DOW = pd.get_dummies(df.DOW, prefix = 'DofW')
#     df_MONTH = pd.get_dummies(df.MONTH, prefix = 'Month')
#     df_HOUR = pd.get_dummies(df.HOUR, prefix = 'Hour')

#     new = pd.concat([df_PdD, df_DOM], axis = 1)
#     new = pd.concat([new, df_DOW], axis = 1)
#     new = pd.concat([new, df_MONTH], axis = 1)
#     new = pd.concat([new, df_HOUR], axis = 1)
    
#     return new

In [116]:
# df_test = pd.read_csv('test.csv', parse_dates=['Dates'])
# df_test.head()

In [117]:
# Process test data and predict outcomes

# test = process_data(df_test)
# results = rf2.predict(test)

# results[0:5]

In [118]:
# df_results = pd.get_dummies(results)
# df_results.to_csv('submission1_1_vhe.csv')