In [34]:
def get_one_hot_encodings(df, cols):
    result = pd.DataFrame()
    i = 0
    for col in cols:
        dummies = pd.get_dummies(df[col],prefix=col)
        if( i == 0 ):
            result = dummies.copy()
        else:
            result = pd.concat((result, dummies), axis=1)
        i+=1
    return result

def reduce_features(df, verbose = False):
    # calculate the correlation matrix
    corr_matrix  = df.corr().abs()

    # Select upper triangle of correlation matrix
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

    # Find index of feature columns with correlation greater than 0.95
    to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
    
    #Get all of the correlation values > 95%
    x = np.where(upper > 0.95)

    #Display all field combinations with > 95% correlation
    cf = pd.DataFrame()
    cf['Field1'] = upper.columns[x[1]]
    cf['Field2'] = upper.index[x[0]]

    #Get the correlation values for every field combination. (There must be a more pythonic way to do this!)
    corr = [0] * len(cf)
    for i in range(0, len(cf)):
        corr[i] =  upper[cf['Field1'][i]][cf['Field2'][i]] 

    cf['Correlation'] = corr

    if( verbose ):
        print('There are ', str(len(cf['Field1'])), ' field correlations > 95%.')
        display(cf)
        
        print('Dropping the following ', str(len(to_drop)), ' highly correlated fields.')
        to_drop
        
    #Check columns before drop 
    if( verbose ):
        print('\r\n*********Before: Dropping Highly Correlated Fields*************************************')
        display(df.info(verbose=False))

    # Drop the highly correlated features from our training data 
    df = df.drop(to_drop, axis=1)

    #Check columns after drop 
    if( verbose ):
        print('\r\n*********After: Dropping Highly Correlated Fields**************************************')
        df.info(verbose=False)
    
    return df


In [26]:
import numpy as np
import pandas as pd

df = pd.read_csv("../data/chicagoCrimes10k.csv")
df = df.dropna().copy()
strs = [
#     "Block",
    "IUCR",
    "Primary Type",
#     "Description",
#     "Location Description",

    "Beat",
    "District",
    "Ward",
    "Community Area",
    "FBI Code"
]

#     "Domestic",
#     "X Coordinate",
#     "Y Coordinate",
#     "Latitude",
#     "Longitude"

crimes = get_one_hot_encodings(df, strs)

print("columns before reducer",crimes.shape)
crimes = reduce_features(crimes)
print("columns after reducer",crimes.shape)

crimes = pd.concat([
        crimes,
        df[["Domestic","X Coordinate","Y Coordinate","Latitude","Longitude"]]
    ], axis=1)

print("Final Size", crimes.shape)
arrests = df["Arrest"].values
print("Response",arrests[0:10])

columns before reducer (9832, 688)
columns after reducer (9832, 667)
Final Size (9832, 672)
Response [ True  True  True False False False  True False False False]


Response [ True  True  True  True False False False  True False False]


In [31]:
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold

X = crimes.values
y = arrests

# Creat eth Cross Validation Objected used for all tests
num_cv_iterations = 5
random_st = 42
kfold_cv = KFold(
    n_splits=num_cv_iterations,
    random_state = random_st
)

clf = RandomForestClassifier(random_state=42)

grid_params = [{
    "max_features" : ["auto","log2",0.20, 0.30],
    "n_estimators" : [10,50,100],
    "min_samples_leaf" : [25, 50, 100]
}]

grid_clf = GridSearchCV(
    estimator = clf, 
    param_grid=grid_params, 
    cv=kfold_cv,
    refit='auc',
    n_jobs=-2,verbose=1,return_train_score=False)

grid_clf.fit(X, y)

# Save Model
filename = '../model/HasDetections_GridSearch_RF_final.pkl'
pickle.dump(grid_clf, open(filename, 'wb'))

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 11 concurrent workers.
[Parallel(n_jobs=-2)]: Done  28 tasks      | elapsed:   12.6s
[Parallel(n_jobs=-2)]: Done 180 out of 180 | elapsed:  2.5min finished


In [41]:
import pickle
grid_clf = pickle.load(open("../model/HasDetections_GridSearch_RF_final.pkl", 'rb'))

print("GridSearchCV for "+ str( len(grid_clf.cv_results_['params']) ) +" candidate parameter settings.")

print( grid_clf.best_params_ )
# print( grid_clf.cv_results_ )

print( "Accuracy_Mean: ",grid_clf.cv_results_["mean_test_score"][grid_clf.best_index_] )
print( "Accuracy_Std: ",grid_clf.cv_results_["std_test_score"][grid_clf.best_index_] )


GridSearchCV for 36 candidate parameter settings.
{'max_features': 0.2, 'min_samples_leaf': 25, 'n_estimators': 10}
Accuracy_Mean:  0.8807973962571196
Accuracy_Std:  0.001719669028223565
