In [1]:
from pandas import Series, DataFrame
import pandas as pd
from patsy import dmatrices
import calendar
%pylab inline

import warnings
warnings.filterwarnings('ignore')

Populating the interactive namespace from numpy and matplotlib


# Setup

In [2]:
# Read the 2014 and 2015 Austin crime extracts (every column loaded as text),
# stack them into one frame, and strip the "GO " prefix from the column names.
crime2014 = pd.read_csv("https://raw.githubusercontent.com/sxzhu/mis381_project/master/Austin_Crime_2014.csv", dtype=unicode, encoding='utf-8')
crime2015 = pd.read_csv("https://raw.githubusercontent.com/sxzhu/mis381_project/master/Austin_Crime_2015.csv", dtype=unicode, encoding='utf-8')

# pd.concat is the supported way to stack frames; DataFrame.append is deprecated.
crime = pd.concat([crime2014, crime2015], ignore_index=True)

# NOTE(review): index=str also turns the row labels into strings; kept as-is
# because later cells may rely on it -- confirm whether it is intentional.
crime = crime.rename(index=str, columns={
    "Highest NIBRS/UCR Offense Description": "General Offense Description",
    "GO Highest Offense Desc": "Detailed Offense Description",
    "GO Location": "Location",
    "GO Report Date": "Report Date",
    "GO Location Zip": "Zipcode",
    "GO District": "District",
    "GO Census Tract": "Census Tract",
    "GO X Coordinate": "X Coordinate",
    "GO Y Coordinate": "Y Coordinate",
    "GO Primary Key": "Primary Key",
})

crime = crime.drop('Location_1', axis=1)

# Parse the report date once and derive every date-based column from it.
report_dates = pd.DatetimeIndex(crime['Report Date'])
crime['Date'] = report_dates
# Month is stored as its abbreviated name (calendar.month_abbr), not a number.
crime['Month'] = [calendar.month_abbr[month] for month in report_dates.month]
crime['Year'] = report_dates.year

## Merge Crimes

In [3]:
def merge_crimes(s):
    """Return the canonical offense name for ``s``.

    Labels listed in the lookup table below are replaced by the name they
    map to; any other value is returned unchanged.
    """
    # Variant label -> label it is merged into
    canonical_names = {
        'Burglary / \nBreaking & Entering': 'Burglary',
        'Auto Theft': 'Theft: Auto Parts',
        'Homicide: Murder & Nonnegligent Manslaughter': 'Murder',
        'Agg Assault': 'Aggravated Assault',
    }
    return canonical_names.get(s, s)
    
# Run every general offense label through merge_crimes and store the result
# back in the same column.
offense_column = 'General Offense Description'
crime[offense_column] = crime[offense_column].map(merge_crimes)

def merge_theft(s):
    """Collapse any offense label that starts with 'Theft' into 'Theft'.

    Parameters
    ----------
    s : object
        Offense label. Values that cannot be sliced (for example the NaN or
        None that ``Series.map`` passes for a missing description) are
        returned unchanged instead of raising ``TypeError``.

    Returns
    -------
    The first five characters of ``s`` when they equal 'Theft', otherwise
    ``s`` itself.
    """
    try:
        prefix = s[:5]
    except TypeError:
        # Not sliceable (missing value): nothing to merge.
        return s

    if prefix == 'Theft':
        return prefix
    return s
    
# Fold every "Theft..." label into a single 'Theft' category.
general_offense = crime['General Offense Description'].map(merge_theft)
crime['General Offense Description'] = general_offense

# The counts below double as a sanity check: no raw label such as
# 'Agg Assault' should remain after the merges.
crime['General Offense Description'].value_counts()

Theft                 61908
Burglary              10597
Aggravated Assault     3803
Robbery                1809
Rape                   1042
Murder                   55
Name: General Offense Description, dtype: int64

# Final Dataframe

In [4]:
# Add a 'Streetname' column holding only the street-name part of 'Location'
# (the text before the first newline), plus 'PrimaryStreet' and
# 'SecondaryStreet' for locations written as "A / B".
#
# The clean-up rules are applied in order. Patterns are raw strings so the
# regex escapes (\s, \d) are not read as invalid Python string escapes.
# NOTE: this relies on Series.str.replace treating the pattern as a regular
# expression by default; on pandas versions where that default is literal
# matching, pass regex=True.
street_cleanup_rules = [
    (r'^[/\s\d-]+/?\d*\s+', ''),  # leading numbers, incl. hyphenated or "/" numbers
    (r'^\d+[A-Z]\s+', ''),         # leading digits followed by a single capital letter
    (r'^BLOCK\s+', ''),            # leading 'BLOCK'
    (r'\s[NSEW]B\s*$', ''),        # trailing 'NB', 'SB', 'EB', 'WB'
    (r'\sSVRD\s*$', ''),           # trailing 'SVRD'
    (r'^\s*[NSEW]\s+', ''),        # leading 'N', 'S', 'E', 'W'
    (' HALF ST', ' ST'),
]

streetname = crime['Location'].str.rsplit('\n', -1).str[0]
for pattern, replacement in street_cleanup_rules:
    streetname = streetname.str.replace(pattern, replacement)
crime['Streetname'] = streetname.str.rstrip()

# Split intersections once; the second part is missing when there is no " / ".
street_parts = crime['Streetname'].str.rsplit(' / ', -1)
crime['PrimaryStreet'] = street_parts.str[0]
crime['SecondaryStreet'] = street_parts.str[1]

In [21]:
# Binary modelling target: 1.0 where 'Clearance Status' equals 'C', else 0.0.
# Built in one vectorised assignment; the chained form
# `crime['target'][mask] = 1.0` can write to a temporary copy and leave the
# frame unchanged.
mask = (crime['Clearance Status'] == 'C')
crime['target'] = mask.astype(float)

crime['target'].value_counts()

0.0    67939
1.0    11275
Name: target, dtype: int64

In [22]:
# Exclude records whose Zipcode is 1. The CSVs are read with every column as
# text, so comparing against the integer 1 alone never excludes a row; match
# both the integer and its text form. Rows with a missing Zipcode are kept.
df = crime[~crime['Zipcode'].isin([1, '1'])]

In [23]:
# Columns to one-hot encode; each dummy column is named "<column>_<value>".
categorical_columns = [
    'Council District',
    'District',
    'Detailed Offense Description',
    'Zipcode',
    'General Offense Description',
    'Month',
]
categorical_frame = df[categorical_columns]
df_dummies = pd.get_dummies(categorical_frame,
                            columns=categorical_columns,
                            prefix=categorical_columns)

# Keep the generated column names and preview the first ten.
dummy_column_names = df_dummies.columns.values
dummy_column_names[:10]

array([u'Council District_1', u'Council District_10',
       u'Council District_2', u'Council District_3', u'Council District_4',
       u'Council District_5', u'Council District_6', u'Council District_7',
       u'Council District_8', u'Council District_9'], dtype=object)

In [24]:
# Place the dummy columns alongside the original columns (axis=1) and preview
# the first rows of the combined frame.
df2 = pd.concat([df, df_dummies], axis=1)
df2.head()

Unnamed: 0,Clearance Date,Clearance Status,Council District,Census Tract,District,Detailed Offense Description,Location,Zipcode,Primary Key,Report Date,...,Month_Dec,Month_Feb,Month_Jan,Month_Jul,Month_Jun,Month_Mar,Month_May,Month_Nov,Month_Oct,Month_Sep
0,04/28/2014 12:00:00 AM,N,1,18.35,E,AGG ROBBERY/DEADLY WEAPON,12151 N IH 35 SVRD NB ...,78753,20141061920,04/17/2014 12:00:00 AM,...,0,0,0,0,0,0,0,0,0,0
1,05/20/2014 12:00:00 AM,N,1,21.13,I,ROBBERY BY ASSAULT,3300 BLOCK ROCKHURST LN ...,78723,20141150937,04/25/2014 12:00:00 AM,...,0,0,0,0,0,0,0,0,0,0
2,05/13/2014 12:00:00 AM,N,3,9.02,C,ROBBERY BY THREAT,E 7TH ST / CHICON ST ...,78702,20141310316,05/11/2014 12:00:00 AM,...,0,0,0,0,0,0,1,0,0,0
3,03/24/2015 12:00:00 AM,C,1,21.04,I,AGG ROBBERY/DEADLY WEAPON,WHELESS LN / BERKMAN DR ...,78723,20141670098,06/16/2014 12:00:00 AM,...,0,0,0,0,1,0,0,0,0,0
4,10/02/2014 12:00:00 AM,N,3,9.02,G,AGG ROBBERY/DEADLY WEAPON,WALLER ST / E 2ND ST ...,78702,20142070292,07/26/2014 12:00:00 AM,...,0,0,0,1,0,0,0,0,0,0


In [25]:
# Wrap every dummy column name in Q("...") so names containing spaces or
# punctuation can be used as formula terms, then join the terms with " + ".
quoted_terms = ['Q("{}")'.format(name) for name in dummy_column_names]
formula = 'target ~ 0 + {}'.format(' + '.join(quoted_terms))
print(formula)

target ~ 0 + Q("Council District_1") + Q("Council District_10") + Q("Council District_2") + Q("Council District_3") + Q("Council District_4") + Q("Council District_5") + Q("Council District_6") + Q("Council District_7") + Q("Council District_8") + Q("Council District_9") + Q("District_A") + Q("District_AP") + Q("District_B") + Q("District_C") + Q("District_D") + Q("District_E") + Q("District_F") + Q("District_G") + Q("District_H") + Q("District_I") + Q("District_UK") + Q("Detailed Offense Description_AGG ASLT ENHANC STRANGL/SUFFOC") + Q("Detailed Offense Description_AGG ASLT STRANGLE/SUFFOCATE   ") + Q("Detailed Offense Description_AGG ASLT W/MOTOR VEH FAM/DAT V") + Q("Detailed Offense Description_AGG ASSAULT                   ") + Q("Detailed Offense Description_AGG ASSAULT FAM/DATE VIOLENCE ") + Q("Detailed Offense Description_AGG ASSAULT ON PUBLIC SERVANT ") + Q("Detailed Offense Description_AGG ASSAULT WITH MOTOR VEH    ") + Q("Detailed Offense Description_AGG RAPE                 

In [26]:
# Evaluate the formula against df2, returning the left-hand side (Y) and the
# right-hand side (X) as DataFrames.
Y, X = dmatrices(formula, df2, return_type='dataframe')

In [27]:
# Pull the target column out of Y as a plain array and preview the first
# ten labels.
y = Y['target'].values
y[:10]

array([ 0.,  0.,  0.,  1.,  0.,  1.,  0.,  1.,  0.,  0.])

In [28]:
from sklearn import naive_bayes
# Multinomial naive Bayes created with no arguments (library defaults).
model = naive_bayes.MultinomialNB()

# Fit on the full design matrix X and labels y.
model.fit(X, y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [29]:
from sklearn import metrics

# Accuracy measured on X and y themselves, i.e. on rows the model has seen.
prediction_train = model.predict(X)
train_accuracy = metrics.accuracy_score(y, prediction_train)
print(train_accuracy)

0.865680309036


In [30]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing (test_size=0.3); random_state=1 fixes
# the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [31]:
# Fit the existing `model` object on the training split only.
model.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [32]:
# Accuracy on the held-out test rows.
prediction_test = model.predict(X_test)
test_accuracy = metrics.accuracy_score(y_test, prediction_test)
print(test_accuracy)

0.868041237113


In [33]:
# class_log_prior_ stores log-probabilities; exponentiate to report each
# class prior as a probability, one "<message> <value>" line per class.
prior_reports = [
    ('Prior probability for the negative class is', model.class_log_prior_[0]),
    ('Prior probability for the positive class is', model.class_log_prior_[1]),
]
for message, log_prior in prior_reports:
    print('%s %s' % (message, exp(log_prior)))

Prior probability for the negative class is 0.856318418727
Prior probability for the positive class is 0.143681581273


In [35]:
# Share of rows in each target class, for comparison with the model's priors.
class_counts = df2['target'].value_counts()
class_counts / len(df2)

0.0    0.857664
1.0    0.142336
Name: target, dtype: float64

In [36]:
# Per-feature gap between the two classes' log-probabilities; the absolute
# value is used as the importance score.
log_prob_class_0 = model.feature_log_prob_[0]
log_prob_class_1 = model.feature_log_prob_[1]
feature_importances = abs(log_prob_class_1 - log_prob_class_0)
feature_importances

array([  3.97619538e-01,   9.20195184e-01,   3.28277289e-01,
         3.08383034e-01,   2.93077854e-01,   2.72844944e-02,
         1.79349089e-01,   6.01920400e-02,   1.64406598e-01,
         2.65410891e-01,   1.36655364e-01,   9.30608850e-01,
         3.74581924e-01,   2.07141005e-01,   1.71007561e-01,
         4.74289139e-02,   7.28887688e-04,   1.77364435e-01,
         9.50937811e-02,   3.03976458e-01,   1.10007717e-01,
         2.59759345e+00,   2.43167738e+00,   8.14972425e-01,
         7.19035482e-01,   2.40627513e+00,   2.18802156e+00,
         6.46203835e-01,   8.95253256e-01,   2.13395434e+00,
         8.17475555e-01,   7.54830974e-01,   3.96262090e-01,
         3.77936506e-02,   2.36848345e+00,   1.70008126e+00,
         9.20301787e-03,   5.29587331e-01,   9.32138292e-01,
         5.54876882e-01,   1.96326908e+00,   2.88116874e+00,
         5.29793483e-01,   1.08940927e+00,   2.69884718e+00,
         4.14668126e-01,   1.95440671e+00,   2.96885090e-01,
         2.70599228e-01,

In [37]:
# Label each importance score with its design-matrix column name and show
# the ten largest.
feature_importance_series = Series(feature_importances, index=X.columns.values)
ranked_importances = feature_importance_series.sort_values(ascending=False)
ranked_importances.head(10)

Q("Detailed Offense Description_TAKE WEAPON FRM POLICE OFFICER")    4.180452
Q("Detailed Offense Description_THEFT FROM BUILDING           ")    3.429658
Q("Detailed Offense Description_THEFT OF AUTO PARTS           ")    3.323389
Q("Detailed Offense Description_CAPITAL MURDER                ")    2.881169
Q("Detailed Offense Description_THEFT FROM AUTO               ")    2.799857
Q("Detailed Offense Description_MANSLAUGHTER                  ")    2.698847
Q("Detailed Offense Description_AGG ASLT ENHANC STRANGL/SUFFOC")    2.597593
Q("Detailed Offense Description_AGG ASLT STRANGLE/SUFFOCATE   ")    2.431677
Q("Detailed Offense Description_AGG ASSAULT FAM/DATE VIOLENCE ")    2.406275
Q("Detailed Offense Description_BREACH OF COMPUTER SECURITY   ")    2.368483
dtype: float64

In [38]:
# Names of the ten features with the largest absolute importance.
ranked_by_importance = feature_importance_series.sort_values(ascending=False)
top_10_feature_indices = ranked_by_importance[:10].index.values

# Signed difference (class 1 minus class 0), so the direction of each top
# feature's effect is visible.
inter_class_differences = model.feature_log_prob_[1] - model.feature_log_prob_[0]
new_feature_importance_series = Series(inter_class_differences, index=X.columns.values)

new_feature_importance_series[top_10_feature_indices]

Q("Detailed Offense Description_TAKE WEAPON FRM POLICE OFFICER")    4.180452
Q("Detailed Offense Description_THEFT FROM BUILDING           ")   -3.429658
Q("Detailed Offense Description_THEFT OF AUTO PARTS           ")   -3.323389
Q("Detailed Offense Description_CAPITAL MURDER                ")    2.881169
Q("Detailed Offense Description_THEFT FROM AUTO               ")   -2.799857
Q("Detailed Offense Description_MANSLAUGHTER                  ")    2.698847
Q("Detailed Offense Description_AGG ASLT ENHANC STRANGL/SUFFOC")    2.597593
Q("Detailed Offense Description_AGG ASLT STRANGLE/SUFFOCATE   ")    2.431677
Q("Detailed Offense Description_AGG ASSAULT FAM/DATE VIOLENCE ")    2.406275
Q("Detailed Offense Description_BREACH OF COMPUTER SECURITY   ")   -2.368483
dtype: float64