In [65]:
import os

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 10)
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_columns', 10)

import statsmodels.api as sm
import statsmodels.formula.api as smf
import datetime 

from sklearn import feature_selection, linear_model, metrics, preprocessing
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn import cross_validation

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

import seaborn as sns
import seaborn as sb


In [66]:
df_12 = pd.read_csv('NIJ2012.csv')
df_13 = pd.read_csv('NIJ2013.csv')
df_14 = pd.read_csv('NIJ2014.csv')
df_15 = pd.read_csv('NIJ2015.csv')
df_16_01_07 = pd.read_csv('NIJ2016_01_07.csv')
df_16_08 = pd.read_csv('NIJ2016_08.csv')
df_16_09 = pd.read_csv('NIJ2016_09.csv')
df_16_10 = pd.read_csv('NIJ2016_10.csv')

In [67]:
df = pd.concat([df_12, df_13, df_14, df_15, df_16_01_07, df_16_08, df_16_09, df_16_10],ignore_index = True)

In [68]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 893954 entries, 0 to 893953
Data columns (total 8 columns):
CATEGORY           893954 non-null object
CALL GROUPS        893954 non-null object
final_case_type    893954 non-null object
CASE DESC          893954 non-null object
occ_date           893954 non-null object
x_coordinate       893954 non-null int64
y_coordinate       893954 non-null int64
census_tract       841870 non-null float64
dtypes: float64(1), int64(2), object(5)
memory usage: 54.6+ MB


In [69]:
df = df.drop('census_tract', 1)

In [70]:
df.rename(columns = {'CASE DESC': 'CASE_DESC', 'final_case_type': 'FINAL_CASE_TYPE','occ_date': 'OCC_DATE','y_coordinate': 'Y_COORDINATE','x_coordinate': 'X_COORDINATE','CALL GROUPS': 'CALL_GROUPS'}, inplace = True)

In [73]:
np.sort(df.CALL_GROUPS.unique())

array([' SUSPICIOUS', 'DISORDER', 'NON CRIMINAL/ADMIN', 'PERSON CRIME',
       'PROPERTY CRIME', 'TRAFFIC'], dtype=object)

In [74]:
df.OCC_DATE = pd.to_datetime(df.OCC_DATE)

In [75]:
df.set_index('OCC_DATE', inplace = True)
df['YEAR'] = df.index.year
df['MONTH'] = df.index.month
df.reset_index(drop=True, inplace=True)
df.head()

Unnamed: 0,CATEGORY,CALL_GROUPS,FINAL_CASE_TYPE,CASE_DESC,X_COORDINATE,Y_COORDINATE,YEAR,MONTH
0,STREET CRIMES,DISORDER,DISTP,DISTURBANCE - PRIORITY ...,7641076,684831,2012,3
1,STREET CRIMES,DISORDER,DISTP,DISTURBANCE - PRIORITY ...,7642640,683167,2012,3
2,STREET CRIMES,DISORDER,DISTP,DISTURBANCE - PRIORITY ...,7643599,683216,2012,3
3,STREET CRIMES,DISORDER,DISTP,DISTURBANCE - PRIORITY ...,7644359,693642,2012,3
4,STREET CRIMES,DISORDER,DISTP,DISTURBANCE - PRIORITY ...,7644771,683859,2012,3


In [76]:
df.drop(['FINAL_CASE_TYPE', 'CASE_DESC'],
    axis = 1,
    inplace = True)
df.head()

Unnamed: 0,CATEGORY,CALL_GROUPS,X_COORDINATE,Y_COORDINATE,YEAR,MONTH
0,STREET CRIMES,DISORDER,7641076,684831,2012,3
1,STREET CRIMES,DISORDER,7642640,683167,2012,3
2,STREET CRIMES,DISORDER,7643599,683216,2012,3
3,STREET CRIMES,DISORDER,7644359,693642,2012,3
4,STREET CRIMES,DISORDER,7644771,683859,2012,3


In [77]:
# save progress to csv
df.to_csv('2012-16.csv', sep=',', index=False)

In [78]:
# load saved csv
df = pd.read_csv('2012-16.csv')

In [79]:
df.columns

Index([u'CATEGORY', u'CALL_GROUPS', u'X_COORDINATE', u'Y_COORDINATE', u'YEAR',
       u'MONTH'],
      dtype='object')

In [80]:
# encoded labels
labeldict = dict(zip(df['CATEGORY'].unique(), 
                     range(len(df['CATEGORY'].unique()))))
labeldict

{'BURGLARY': 3, 'MOTOR VEHICLE THEFT': 2, 'OTHER': 1, 'STREET CRIMES': 0}

In [81]:
labeldict = dict(zip(df['CALL_GROUPS'].unique(), 
                    range(len(df['CALL_GROUPS'].unique()))))
labeldict
df['CALL_GROUPS_LBL'] = df['CALL_GROUPS'].map(labeldict)

In [82]:
YEAR_df = pd.get_dummies(df.YEAR, prefix = 'YEAR')
MONTH_df = pd.get_dummies(df.MONTH, prefix = 'MONTH')
CALL_GROUPS_df = pd.get_dummies(df.CALL_GROUPS, prefix = 'CALL_GROUPS')

In [83]:
df = df.join([YEAR_df, MONTH_df, CALL_GROUPS_df])
df.head()

Unnamed: 0,CATEGORY,CALL_GROUPS,X_COORDINATE,Y_COORDINATE,YEAR,...,CALL_GROUPS_DISORDER,CALL_GROUPS_NON CRIMINAL/ADMIN,CALL_GROUPS_PERSON CRIME,CALL_GROUPS_PROPERTY CRIME,CALL_GROUPS_TRAFFIC
0,STREET CRIMES,DISORDER,7641076,684831,2012,...,1.0,0.0,0.0,0.0,0.0
1,STREET CRIMES,DISORDER,7642640,683167,2012,...,1.0,0.0,0.0,0.0,0.0
2,STREET CRIMES,DISORDER,7643599,683216,2012,...,1.0,0.0,0.0,0.0,0.0
3,STREET CRIMES,DISORDER,7644359,693642,2012,...,1.0,0.0,0.0,0.0,0.0
4,STREET CRIMES,DISORDER,7644771,683859,2012,...,1.0,0.0,0.0,0.0,0.0


In [84]:
df.drop(['YEAR', 'MONTH','CALL_GROUPS'], axis=1, inplace=True)
df.columns

Index([u'CATEGORY', u'X_COORDINATE', u'Y_COORDINATE', u'CALL_GROUPS_LBL',
       u'YEAR_2012', u'YEAR_2013', u'YEAR_2014', u'YEAR_2015', u'YEAR_2016',
       u'MONTH_1', u'MONTH_2', u'MONTH_3', u'MONTH_4', u'MONTH_5', u'MONTH_6',
       u'MONTH_7', u'MONTH_8', u'MONTH_9', u'MONTH_10', u'MONTH_11',
       u'MONTH_12', u'CALL_GROUPS_ SUSPICIOUS', u'CALL_GROUPS_DISORDER',
       u'CALL_GROUPS_NON CRIMINAL/ADMIN', u'CALL_GROUPS_PERSON CRIME',
       u'CALL_GROUPS_PROPERTY CRIME', u'CALL_GROUPS_TRAFFIC'],
      dtype='object')

In [63]:
#df.info()

In [85]:
df['CATEGORY'].loc[df['CATEGORY']=='OTHER'].value_counts()

OTHER    732618
Name: CATEGORY, dtype: int64

In [86]:
train_X = df[df['YEAR_2016']==0].drop('CATEGORY', axis=1, inplace=False)
train_Y = df[df['YEAR_2016']==0]['CATEGORY']
test_X = df[df['YEAR_2016']==1].drop('CATEGORY', axis=1, inplace=False)
test_Y = df[df['YEAR_2016']==1]['CATEGORY']

RANDOM FOREST

In [87]:
model = RandomForestClassifier(n_estimators=1000)

FITTING DATA

In [88]:
model.fit(train_X, train_Y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

TRAINING DATA

In [89]:
metrics.accuracy_score(train_Y, model.predict(train_X))

0.9714156383186161

In [90]:
metrics.accuracy_score(test_Y, model.predict(test_X))

0.84819348312818943

In [36]:
print test_X.iloc[0]
print model.predict(test_X.iloc[0].values.reshape(1,-1))

X_COORDINATE                      7624068.0
Y_COORDINATE                       710192.0
CALL_GROUPS_LBL                         0.0
YEAR_2012                               0.0
YEAR_2013                               0.0
                                    ...    
CALL_GROUPS_DISORDER                    1.0
CALL_GROUPS_NON CRIMINAL/ADMIN          0.0
CALL_GROUPS_PERSON CRIME                0.0
CALL_GROUPS_PROPERTY CRIME              0.0
CALL_GROUPS_TRAFFIC                     0.0
Name: 717980, dtype: float64
['STREET CRIMES']
