In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
import random
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')

In [None]:
# load data
# pd.read_csv replaces DataFrame.from_csv, which is deprecated and no longer
# available in current pandas releases.
# index_col=False keeps the default integer row index rather than using the
# first CSV column as the index, matching the original call.
train_set = pd.read_csv('sqf_train_cpw.csv', index_col=False)
test_set = pd.read_csv('sqf_test_cpw.csv', index_col=False)


In [None]:
# Explore the data a bit
# feature names and number of rows/cols
# (single-argument print(...) behaves the same on Python 2 and Python 3)
print(train_set.columns)
print(train_set.shape)

# distribution of various features in the training set
# A bare expression in the middle of a cell is evaluated and then discarded,
# so each table is printed explicitly.
for col in ['year', 'found.weapon', 'precinct', 'suspect.race']:
    print(train_set[col].value_counts())

# proportion of features in the training set
# normalize=True divides each count by the total of the counts.
for col in ['suspect.race', 'suspect.sex']:
    print(train_set[col].value_counts(normalize=True))

# Warmup question:
# which precinct in the training set has the highest percentage of successful stops, i.e. stops where found.weapon==True?

# One (precinct, mean of found.weapon) pair per precinct, printed from the
# highest mean to the lowest.
by_precinct = train_set.groupby('precinct')
precinct_list = [(name, np.mean(group['found.weapon'])) for name, group in by_precinct]

print(sorted(precinct_list, reverse=True, key=lambda x: x[1]))



In [None]:
# join and re-split data to get all categories
# Tag every row with the set it came from so the combined frame can be split
# back into train and test after encoding.
train_set['set'] = 'train'
test_set['set'] = 'test'
# pd.concat replaces DataFrame.append, which current pandas no longer provides.
# Row labels are left as they are (no ignore_index), as with the original append.
joined_data = pd.concat([train_set, test_set])

# select all non-real-valued columns (besides 'set' and 'id') and convert to one-hot encoding
col_names = joined_data.columns
col_names = col_names.difference(['id', 'set', 'suspect.age', 'suspect.weight', 'suspect.height', 'observation.period'])
joined_data = pd.get_dummies(data=joined_data, columns=col_names, sparse=True)


In [None]:
# inspect the column labels of the frame produced by the one-hot encoding step
joined_data.columns.tolist()

In [None]:
# remove redundant columns (binary columns of the form 'variable_False')
# Match on the '_False' suffix only: a plain substring test ("False" in name)
# would also drop any column whose name merely contains the text "False".
redundant_cols = [name for name in joined_data.columns if name.endswith('_False')]
# Rebind instead of dropping in place, so the frame is never mutated under
# another reference to it.
joined_data = joined_data.drop(redundant_cols, axis=1)

# verify that redundant columns have been removed
list(joined_data.columns)

In [None]:
# separate the combined frame back into its train and test parts;
# the 'set' marker column is dropped from both once the rows are selected
is_train_row = joined_data['set'] == 'train'
is_test_row = joined_data['set'] == 'test'
train = joined_data.loc[is_train_row].drop(['set'], axis=1)
test = joined_data.loc[is_test_row].drop(['set'], axis=1)

In [None]:
# build the sklearn inputs for training: a flat outcome array and a feature matrix
label_train = train[['found.weapon_True']].values.ravel()

# columns excluded from the feature matrix (the id plus the three *_True columns below)
non_feature_cols = ['id', 'arrested_True', 'found.weapon_True', 'found.gun_True']
pred_train = train.drop(non_feature_cols, axis=1)

# peek at the remaining feature columns before converting to a numpy array
print(pred_train.head())
pred_train = pred_train.values


In [None]:
# prepare the test set: keep a full copy for later use, then build the
# outcome array, the feature names, and the feature matrix
results = test.copy()
label_test = test[['found.weapon_True']].values.ravel()

test_features = test.drop(['id', 'found.weapon_True', 'arrested_True', 'found.gun_True'], axis=1)
feature_names = list(test_features.columns.values)
pred_test = test_features.values

In [None]:
# train the model with 500 trees, 4 parallel processes, and 10 min samples to split a node
num_trees = 500
# A fixed seed makes the forest, and therefore the AUC, accuracy and feature
# importances below, reproducible from run to run.
rf_seed = 0
rf = RandomForestClassifier(n_estimators=num_trees, n_jobs=4, min_samples_split=10,
                            verbose=2, oob_score=True, random_state=rf_seed)
rf.fit(X=pred_train, y=label_train)

# oob_score=True was requested above; report the resulting out-of-bag score
# instead of computing it and never reading it.
print(rf.oob_score_)

# generate predictions and add them to 'results'
# column 1 of predict_proba is the second entry of rf.classes_
rf_predictions = rf.predict_proba(pred_test)[:, 1]
results['preds'] = rf_predictions

# get AUC score (produce probabilistic predictions)
print(roc_auc_score(label_test, rf_predictions))

# get accuracy (predict the class)
rf_predictions_class = rf.predict(pred_test)
print(accuracy_score(label_test, rf_predictions_class, normalize=True))

# Questions:
# What happens to AUC if I change the target variable to found.gun_True?
# What happens to AUC and accuracy if I forgot to take out found.weapon_True from the features?  Why?
# How does AUC change if I forgot to remove 'arrested_True'?  Does that mean I should remove it?
# What are the five most important features?

# Pair each feature name with its importance and print the pairs from most to
# least important.
feature_importances = list(rf.feature_importances_)
feature_list = list(zip(feature_names, feature_importances))
print(sorted(feature_list, reverse=True, key=lambda x: x[1]))



In [None]:
# Plotting question:
# Make a recovery plot: if you used the RF model to rank stops by model-predicted likelihood of weapon recovery, 
# from highest to lowest, what percent of weapons would you recover if you made the best x percent of stops?
# The plot should have percent of stops on the x axis and percent weapons recovered on the y axis

# HINTS:
# 1) order results by column 'preds'
# (sort_values replaces DataFrame.sort, which current pandas no longer provides)
results = results.sort_values('preds', ascending=False)

# 2) add a column to results which is the cumulative sum of found.weapon_True
# .copy() gives an independent frame, so the columns added below are not
# written onto a slice of `results`.
plot_data = results[['found.weapon_True', 'preds']].copy()
plot_data['weap_sum'] = plot_data['found.weapon_True'].cumsum()

# 3) use the above cumulative sum to make a column which shows percent weapons recovered
plot_data['weap_perc'] = 100.0 * plot_data['weap_sum'] / plot_data['found.weapon_True'].sum()

# 4) add a column which counts the stops
# Derive the count from the frame itself instead of hard-coding the number of
# rows, so the cell works for a test set of any size.
n_stops = len(plot_data)
plot_data['nstop'] = np.arange(1, n_stops + 1)

# 5) use the above stop count column to make a column which shows percent of all stops
plot_data['stop_perc'] = 100.0 * plot_data['nstop'] / n_stops

# 6) restrict to just the columns from 3) and 5), downsample to maybe 1000 rows
# DataFrame.sample replaces random.sample(index) + .ix; the seed makes the
# plotted subset reproducible, and min() keeps the cell working when there
# are fewer than 1000 rows.
n_plot_rows = min(1000, n_stops)
plot_data = plot_data[['stop_perc', 'weap_perc']].sample(n=n_plot_rows, random_state=0)

# 7) sort everything in ascending order by the column from 5), then plot.
plot_data = plot_data.sort_values('stop_perc', ascending=True)

# Draw on an explicitly created axes: calling plt.figure() and then
# DataFrame.plot() without ax= leaves an extra, empty figure behind.
fig, ax = plt.subplots()
plot_data.plot(x='stop_perc', y='weap_perc', ax=ax)
ax.set_xlabel('percent of stops')
ax.set_ylabel('percent of weapons recovered')



