In [1]:
import pandas as pd
import numpy as np
import sklearn

In [2]:
from sklearn import preprocessing

In [3]:
from sklearn.ensemble import RandomForestClassifier

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin

In [5]:
from sklearn.pipeline import Pipeline, FeatureUnion

In [6]:
def format_column_name(df):
    """ replace ' ' with '_' in all column names.
    param:
    -----
        df: DataFrame.columns
        
    returnL
        column_names_format: DataFrame.columns, formated column names.
    """
    column_names_format = df.columns.str.replace(' ', '_')
    return column_names_format

In [7]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    
    def fit(self, X):
        return self
    
    def transform(self, X):
        return X[self.attribute_names].values

In [8]:
train = pd.read_csv('NYPD_7_Major_Felony_Incidents_train.csv')

In [9]:
test = pd.read_csv('NYPD_7_Major_Felony_Incidents_test.csv')

In [11]:
train.columns = format_column_name(train)

In [12]:
test.columns = format_column_name(test)

In [13]:
cat_attributes = ['Day_of_Week', 'Occurrence_Month', 'Sector', 'Borough']

In [14]:
num_attributes = ['Occurrence_Day', 'Occurrence_Year', 'Occurrence_Hour', 'CompStat_Month', 'CompStat_Day']

In [150]:
train.head(5)

Unnamed: 0,Identifier,Occurrence_Datetime,Day_of_Week,Occurrence_Month,Occurrence_Day,Occurrence_Year,Occurrence_Hour,CompStat_Month,CompStat_Day,CompStat_Year,Offense,Sector,Precinct,Borough,Jurisdiction,XCoordinate,YCoordinate,Location_1,Occurrence_Date
0,4eaf2b62,02/13/2013 12:00:00 AM,Wednesday,Feb,13,2013,0,2,14,2013,GRAND LARCENY,H,13,MANHATTAN,N.Y. POLICE DEPT,985716,209911,"(40.7428419120001, -73.9947109889999)",2013-02-13
1,cacec67c,02/13/2013 12:00:00 AM,Wednesday,Feb,13,2013,0,5,20,2013,GRAND LARCENY,I,52,BRONX,N.Y. POLICE DEPT,1016552,260706,"(40.88220104, -73.88318653)",2013-02-13
2,ca4bc93e,02/13/2013 12:01:00 AM,Wednesday,Feb,13,2013,0,2,13,2013,FELONY ASSAULT,A,100,QUEENS,N.Y. POLICE DEPT,1041165,155066,"(40.592122008, -73.795071599)",2013-02-13
3,19f9636c,02/13/2013 12:05:00 AM,Wednesday,Feb,13,2013,0,2,13,2013,FELONY ASSAULT,H,62,BROOKLYN,N.Y. POLICE DEPT,984298,163775,"(40.61620917, -73.999828549)",2013-02-13
4,629302ce,02/13/2013 12:05:00 AM,Wednesday,Feb,13,2013,0,2,13,2013,FELONY ASSAULT,D,52,BRONX,N.Y. POLICE DEPT,1013228,253750,"(40.863120505, -73.8952373069999)",2013-02-13


In [15]:
import numpy as np

#### Dataset Processing

1. Imbalanced data set

In [225]:
class_stat = pd.DataFrame({'crime_type': train['Offense'].unique(), 
                   'num': train['Offense'].value_counts().tolist()})

In [226]:
class_stat['ratio'] = class_stat.num/train.shape[0]

In [227]:
max_class_size = class_stat.num.max()

In [236]:
minor_class = class_stat.crime_type.loc[class_stat.num < max_class_size].tolist()

In [265]:
minor_df_list = []

In [239]:
from sklearn.utils import resample

In [267]:
for item in minor_class:
    temp_minor_upsample_df = resample(train[train.Offense == item],
                                      replace=True,
                                      n_samples = max_class_size,
                                      random_state = 666)
    minor_df_list.append(temp_minor_upsample_df)

In [269]:
df_major = train[train.Offense == class_stat.crime_type[class_stat.num == max_class_size].item()]

In [270]:
minor_df_list.append(df_major)

In [272]:
df_upsampled = pd.concat(minor_df_list)

In [273]:
df_upsampled.reset_index(drop=True, inplace=True)

__note:__ After using resampling technique, the training set is balanced.

In [274]:
df_upsampled.Offense.value_counts()

GRAND LARCENY                     87261
RAPE                              87261
ROBBERY                           87261
GRAND LARCENY OF MOTOR VEHICLE    87261
FELONY ASSAULT                    87261
MURDER & NON-NEGL. MANSLAUGHTE    87261
BURGLARY                          87261
Name: Offense, dtype: int64

#### Feature Engineering

In [275]:
class MultiColumnEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, encoder):
        self.encoder = encoder
        
    def fit(self, X):
        return self
    
    def transform(self, X):
        feature_list = []
        for i in range(X.shape[1]):
            feature_list.append(self.encoder.fit_transform(X[:, i]))
        
        if isinstance(self.encoder, preprocessing.LabelBinarizer):
            return np.concatenate(feature_list, axis = 1)
        else:
            return np.mat(feature_list).transpose()

In [276]:
cat_pipeline = Pipeline([('selector', DataFrameSelector(cat_attributes)),
                         ('encoder', MultiColumnEncoder(preprocessing.LabelEncoder()))])

In [156]:
num_pipeline = Pipeline([('selector', DataFrameSelector(num_attributes)),
                         ('scaler', preprocessing.StandardScaler())])

In [277]:
num_pipeline = Pipeline([('selector', DataFrameSelector(num_attributes)),])

In [278]:
full_pipeline = FeatureUnion(transformer_list=[('cat_pipeline', cat_pipeline),
                                               ('num_pipeline', num_pipeline)])

In [279]:
X = full_pipeline.fit_transform(df_upsampled)

In [280]:
label_encoder = preprocessing.LabelEncoder()

In [282]:
y = label_encoder.fit_transform(df_upsampled['Offense'].values)

In [286]:
test_X = full_pipeline.fit_transform(test)

In [287]:
test_y = label_encoder.fit_transform(test['Offense'].values)

In [312]:
random_forest = RandomForestClassifier(max_features=0.8)

In [313]:
random_forest.fit(X=X, y=y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=0.8, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [314]:
y_train_pred = random_forest.predict(X=X)
sklearn.metrics.accuracy_score(y_pred=y_train_pred, y_true=y)

0.97827044318604117

In [315]:
y_test_pred = random_forest.predict(X=test_X) 

In [316]:
pred_series = pd.DataFrame({'y_pred': y_test_pred, 'y_true': test_y})

In [317]:
pred_series['is_right'] = (pred_series.y_pred == pred_series.y_true)

In [320]:
sklearn.metrics.accuracy_score(y_pred=y_test_pred, y_true=test_y)

0.30359148699291244

In [50]:
from sklearn.model_selection import GridSearchCV

In [None]:
range(np.sqrt(17))

In [2]:
import numpy as np

In [9]:
list(range(np.int(np.sqrt(17)), 17))

[4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]

In [99]:
param_grid = [{'n_estimators': [3, 10, 30],
               'max_features': [2, 4, 6, 8],
               'n_jobs': [-1]}，
              {'bootstrap': [False], 
               'n_estimators': [3, 10], 
               'max_features': [5, 7, 9],
               'n_jobs': [-1]}]

In [94]:
random_forest = RandomForestClassifier()

In [100]:
grid_search = GridSearchCV(estimator=random_forest, param_grid=param_grid, cv=10, scoring='accuracy')

In [101]:
grid_search.fit(X, y)

GridSearchCV(cv=10, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8], 'n_jobs': [-1]}, {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [5, 7, 9], 'n_jobs': [-1]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=0)

In [82]:
sample_estimator.fit(X=X, y=y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [151]:
best_est = grid_search.best_estimator_

In [113]:
best_est.feature_importances_

array([ 0.08460287,  0.07717192,  0.19706878,  0.03416678,  0.15171369,
        0.03840877,  0.15050104,  0.07888611,  0.18748004])

In [148]:
res = pd.DataFrame(grid_search.cv_results_)

In [166]:
grid_search.best_score_

0.38083382860137865

In [163]:
y_test_pred = best_est.predict(test_X)

In [165]:
sklearn.metrics.accuracy_score(y_pred=y_test_pred, y_true=test_y)

0.35203133231940675