In [1]:
import pandas as pd
import numpy as np

In [2]:
crime = pd.read_csv("train.csv", parse_dates = ['Dates'])

In [3]:
crime['Category'].replace('TREA', 'TRESPASS', inplace=True)

In [3]:
# Delete most categories.

drop = ["NON-CRIMINAL", "MISSING PERSON",
        "SUSPICIOUS OCC", "SUICIDE", "RECOVERED VEHICLE", "OTHER OFFENSES",
        "BAD CHECKS", "BRIBERY", "EMBEZZLEMENT", "EXTORTION", "FAMILY OFFENSES", "GAMBLING",
        "PORNOGRAPHY/OBSCENE MAT", "SECONDARY CODES", "SEX OFFENSES NON FORCIBLE", "ARSON",
        "DISORDERLY CONDUCT", "DRIVING UNDER THE INFLUENCE", "WEAPON LAWS", "LIQUOR LAWS",
        "LOITERING","RUNAWAY","WARRANTS","FORGERY/COUNTERFEITING","KIDNAPPING","FRAUD","VANDALISM" ]
crime = crime[~crime['Category'].isin(drop)]

In [4]:
# Three categories only: assault, drug and theft-related.

crime.replace(["ASSAULT",'SEX OFFENSES FORCIBLE'],"assault",inplace=True)

crime.replace(["DRUG/NARCOTIC",'DRUNKENNESS', "PROSTITUTION"],"drug/drunkenness/prostitution", inplace=True)

crime.replace(["BURGLARY",'LARCENY/THEFT', "STOLEN PROPERTY", "TRESPASS", "ROBBERY",
              "THEFT", "VEHICLE THEFT"],"theft", inplace=True)


In [5]:
crime.groupby(crime['Category']).size()

Category
TREA                                  6
assault                           81264
drug/drunkenness/prostitution     65735
theft                            300302
dtype: int64

In [6]:
dow = pd.get_dummies(crime['DayOfWeek'])
district = pd.get_dummies(crime['PdDistrict'])

X = pd.concat((dow, district), axis=1)
#X = district
y = crime['Category']

In [7]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, 
                                                    random_state=42)

In [9]:
clf_tree = DecisionTreeClassifier()
clf_tree = clf_tree.fit(X_train, y_train)
y_predicted = clf_tree.predict(X_test)

In [10]:
accuracy_score(y_predicted, y_test)


0.6799087880887975

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings('ignore')


In [12]:
# I am not able to run pipe_svm. My laptop keeps freezing.

pipe_dt = Pipeline([('scl', StandardScaler()),          
                    ('pca', PCA()),      
                    ('clf', DecisionTreeClassifier())])

pipe_lr = Pipeline([('scl', StandardScaler()),
                    ('pca', PCA(n_components=2)),
                    ('clf', LogisticRegression())])

pipe_svm = Pipeline([('scl', StandardScaler()),
                     ('pca', PCA(n_components=2)),
                     ('clf', SVC())])

In [13]:
# The grid search becomes extremely slow as you add parameters.

dt_grid_params = dict(pca__n_components=[4,5])

gs = GridSearchCV(estimator=pipe_dt,  
                  param_grid=dt_grid_params,
                  scoring='accuracy',
                  cv=4)

gs.fit(X_train, y_train)
f"{gs.score(X_test, y_test):.4f}"

'0.6799'

In [14]:
gs.best_params_ 

{'pca__n_components': 5}