In [1]:
import pandas as pd
import numpy as np

In [2]:
crime = pd.read_csv("data/train.csv", parse_dates = ['Dates'])

# Feature engineering

In [3]:
crime.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [4]:
labels = ['Night', 'Morning', 'Afternoon', 'Evening']
crime['Daypart'] = pd.cut(crime.Dates.dt.hour, bins=4, labels=labels)
crime.Daypart.value_counts()

Afternoon    291251
Evening      281135
Morning      179815
Night        125848
Name: Daypart, dtype: int64

In [8]:
import holidays

In [9]:
us_holidays = [date.strftime('%m-%d-%Y') for date in 
               list(holidays.US(years=range(2003, 2016)).keys())]
us_holidays[:5]

['01-01-2003', '01-20-2003', '02-17-2003', '05-26-2003', '07-04-2003']

In [10]:
# Did the crime occur on a holiday?
crime['Dt'] = pd.to_datetime(crime['Dates'].dt.date)
crime['Is_Holiday'] = np.where(crime['Dt'].isin(us_holidays), 1, 0)

In [11]:
crime.Is_Holiday.value_counts()

0    850015
1     28034
Name: Is_Holiday, dtype: int64

In [12]:
# Did the crime occur at a street corner?
crime['Is_Corner'] = np.where(crime['Address'].str.contains("/"), 1, 0)

In [13]:
crime.groupby(crime['Is_Corner']).size()

Is_Corner
0    617231
1    260818
dtype: int64

In [14]:
# Was the crime resolved?
crime['Is_Resolved'] = np.where(crime['Resolution'] != "NONE", 1, 0)

In [15]:
crime.groupby(crime['Is_Resolved']).size()

Is_Resolved
0    526790
1    351259
dtype: int64

In [16]:
crime.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,Daypart,Dt,Is_Holiday,Is_Corner,Is_Resolved
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,Evening,2015-05-13,0,1,1
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,Evening,2015-05-13,0,1,1
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414,Evening,2015-05-13,0,1,1
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873,Evening,2015-05-13,0,0,0
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541,Evening,2015-05-13,0,0,0


# Target engineering

In [17]:
crime['Category'].replace('TREA', 'TRESPASS', inplace=True)

In [18]:
# Delete most categories.

drop = ["NON-CRIMINAL", "MISSING PERSON",
        "SUSPICIOUS OCC", "SUICIDE", "RECOVERED VEHICLE", "OTHER OFFENSES",
        "BAD CHECKS", "BRIBERY", "EMBEZZLEMENT", "EXTORTION", "FAMILY OFFENSES", "GAMBLING",
        "PORNOGRAPHY/OBSCENE MAT", "SECONDARY CODES", "SEX OFFENSES NON FORCIBLE", "ARSON",
        "DISORDERLY CONDUCT", "DRIVING UNDER THE INFLUENCE", "WEAPON LAWS", "LIQUOR LAWS",
        "LOITERING","RUNAWAY","WARRANTS","FORGERY/COUNTERFEITING","KIDNAPPING","FRAUD","VANDALISM" ]
crime = crime[~crime['Category'].isin(drop)]

In [19]:
# Three categories only: assault, drug and theft-related.

crime.replace(["ASSAULT",'SEX OFFENSES FORCIBLE'], "violent",inplace=True)

crime.replace(["DRUG/NARCOTIC",'DRUNKENNESS', "PROSTITUTION"], "deviant", inplace=True)

crime.replace(["BURGLARY",'LARCENY/THEFT', "STOLEN PROPERTY", "TRESPASS", "ROBBERY",
              "THEFT", "VEHICLE THEFT"], "theft", inplace=True)

In [20]:
crime.Category.value_counts()

theft      300308
violent     81264
deviant     65735
Name: Category, dtype: int64

In [21]:
dow = pd.get_dummies(crime['DayOfWeek'])
district = pd.get_dummies(crime['PdDistrict'])
dummies = pd.concat((dow, district), axis=1)

In [22]:
crime = pd.concat((crime, dummies), axis=1)
X = crime.iloc[:, np.r_[7:10, 11:31]]
y = crime.Category

# Fit models

In [23]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

In [24]:
enc = LabelEncoder()
X.Daypart = enc.fit_transform(X.Daypart)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, 
                                                    random_state=42)

In [26]:
clf_tree = DecisionTreeClassifier()
clf_tree = clf_tree.fit(X_train, y_train)
y_predicted = clf_tree.predict(X_test)

In [27]:
accuracy_score(y_predicted, y_test)

0.7497037848471977

In [28]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings('ignore')


In [29]:
# I am not able to run pipe_svm. My laptop keeps freezing.

pipe_dt = Pipeline([('scl', StandardScaler()),          
                    ('pca', PCA()),      
                    ('clf', DecisionTreeClassifier())])

pipe_lr = Pipeline([('scl', StandardScaler()),
                    ('pca', PCA(n_components=2)),
                    ('clf', LogisticRegression())])

pipe_svm = Pipeline([('scl', StandardScaler()),
                     ('pca', PCA(n_components=2)),
                     ('clf', SVC())])

In [30]:
# The grid search becomes extremely slow as you add parameters.

dt_grid_params = dict(pca__n_components=[4,5])

gs = GridSearchCV(estimator=pipe_dt,  
                  param_grid=dt_grid_params,
                  scoring='accuracy',
                  cv=4)

gs.fit(X_train, y_train)
f"{gs.score(X_test, y_test):.4f}"     ### wierd it shows accuracy 0.6033 and best n_compo = 4 on my laptop I'm pretty sure it's in ml environment

'0.6159'

In [35]:
gs.best_params_ 

{'pca__n_components': 4}

In [32]:
from sklearn.ensemble import RandomForestClassifier

pipe_rf = Pipeline([('scl', StandardScaler()),
                    ('pca', PCA()),
                    ('clf', RandomForestClassifier())])
rf_grid_params = dict(pca__n_components=range(4,7))    ### tried range(5,8) -> n=5 accuracy 0.70

gs_rf = GridSearchCV(estimator = pipe_rf,  
                  param_grid = rf_grid_params,
                  scoring = 'accuracy',
                  cv=4)
gs_rf.fit(X_train, y_train)

GridSearchCV(cv=4, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('scl', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('clf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            ...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'pca__n_components': range(4, 7)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [33]:
gs_rf.score(X_test, y_test)

0.6994701661040442

In [34]:
gs_rf.best_params_ 

{'pca__n_components': 4}

In [58]:
## Random Forest for features selection

In [36]:
rf =  RandomForestClassifier()
rf.fit(X_train, y_train)
rf_acc = rf.score(X_test, y_test)
print(f"{rf_acc:.4f}")


0.7611


In [37]:
rf.feature_importances_

array([0.30468821, 0.31074491, 0.05208621, 0.00521597, 0.01970703,
       0.23733026, 0.00403098, 0.00417822, 0.0039242 , 0.00360324,
       0.00429182, 0.00394575, 0.00363107, 0.00249783, 0.00231136,
       0.00237845, 0.00449866, 0.00225799, 0.00121754, 0.00100784,
       0.00355474, 0.00108464, 0.02181308])

In [75]:
rf.feature_importances_

array([0.30468821, 0.31074491, 0.05208621, 0.00521597, 0.01970703,
       0.23733026, 0.00403098, 0.00417822, 0.0039242 , 0.00360324,
       0.00429182, 0.00394575, 0.00363107, 0.00249783, 0.00231136,
       0.00237845, 0.00449866, 0.00225799, 0.00121754, 0.00100784,
       0.00355474, 0.00108464, 0.02181308])

In [76]:
d=dict()
for name,n in zip(list(crime.columns),rf.feature_importances_):
    d[name]=n
features_list=list(d.items())      
features_list.sort(key=lambda x: -x[1])
features_most_important_to_least = [f[0] for f in features_list]