In [44]:
import pandas as pd
# Load the cleaned crime / extreme-weather CSV from the working directory.
dataset_path = 'crime_extreme_weather_CLEANED.csv'
extreme_weather = pd.read_csv(dataset_path)

# Preview the first five rows.
extreme_weather.head(5)

Unnamed: 0.1,Unnamed: 0,Complaint_No,Property_Crimes,Location_Type,Zipcode,Property_No,Property_type,Property_Desc,2010 Pop,Median Household Income,...,Victim_Gender,Victim_Ethnic,DATE,EVENT_TYPE,BEGIN_LAT,BEGIN_LON,END_LAT,END_LON,time_diff,weather_id
0,5810,20120701220002,Shoplifting,Indoors,28216,1.0,Alcoholic Beverage,Consumable Goods,47227,53462.0,...,,,2012-07-01,Hail,35.133333,-80.740767,35.133333,-80.740767,0 days 00:16:00,30.0
1,5811,20120701220002,Shoplifting,Indoors,28216,1.0,Alcoholic Beverage,Consumable Goods,47227,53462.0,...,,,2012-07-01,Lightning,35.34,-80.81,35.34,-80.81,0 days 00:15:00,59.0
2,5812,20120701220002,Shoplifting,Indoors,28216,1.0,Alcoholic Beverage,Consumable Goods,47227,53462.0,...,,,2012-07-01,Thunderstorm Wind,35.32,-80.8,35.18,-80.655,0 days 00:19:00,82.0
3,5816,20120701224501,Arson,Indoors,28205,1.0,Dwelling: Multi-Family,Structures,43931,61525.0,...,F,Black Other/Not Listed,2012-07-01,Hail,35.133333,-80.740767,35.133333,-80.740767,0 days 00:16:00,30.0
4,5817,20120701224501,Arson,Indoors,28205,1.0,Dwelling: Multi-Family,Structures,43931,61525.0,...,F,Black Other/Not Listed,2012-07-01,Lightning,35.34,-80.81,35.34,-80.81,0 days 00:15:00,59.0


In [45]:
# Keep only the target (Property_Crimes) and the candidate predictor columns.
model_columns = [
    'Property_Crimes', 'Property_type', 'EVENT_TYPE', 'Location_Type',
    'Property_Desc', 'Median Household Income', '2010 Pop',
    'Victim_Age', 'Victim_Gender', 'Victim_Ethnic',
]
extreme_weather = extreme_weather[model_columns]
print(extreme_weather.head())

# Drop rows with any missing value and renumber the index by building a new
# frame rather than mutating the column slice in place: in-place edits on a
# slice of another DataFrame can raise SettingWithCopyWarning.
extreme_weather = extreme_weather.dropna().reset_index(drop=True)
print(extreme_weather.shape)

  Property_Crimes           Property_type         EVENT_TYPE Location_Type  \
0     Shoplifting      Alcoholic Beverage               Hail       Indoors   
1     Shoplifting      Alcoholic Beverage          Lightning       Indoors   
2     Shoplifting      Alcoholic Beverage  Thunderstorm Wind       Indoors   
3           Arson  Dwelling: Multi-Family               Hail       Indoors   
4           Arson  Dwelling: Multi-Family          Lightning       Indoors   

      Property_Desc  Median Household Income  2010 Pop  Victim_Age  \
0  Consumable Goods                  53462.0     47227         NaN   
1  Consumable Goods                  53462.0     47227         NaN   
2  Consumable Goods                  53462.0     47227         NaN   
3        Structures                  61525.0     43931        42.0   
4        Structures                  61525.0     43931        42.0   

  Victim_Gender                                    Victim_Ethnic  
0           NaN                            

In [46]:
# One-hot encode the categorical predictors; columns not listed here
# (including the Property_Crimes target) are left as they are.
categorical_columns = ['Property_type', 'EVENT_TYPE', 'Location_Type', 'Property_Desc', 'Victim_Gender', 'Victim_Ethnic']
extreme_weather = pd.get_dummies(data=extreme_weather, columns=categorical_columns)


In [47]:
# create training and testing sets - 80/20
from sklearn.model_selection import train_test_split
# Seed the shuffle so the partition, and every score computed from it, is the
# same each time the notebook is re-run from a fresh kernel.
crime_train, crime_test = train_test_split(extreme_weather, test_size=0.20, random_state=42)
print(crime_test.shape)
print(crime_train.shape)

(497, 89)
(1986, 89)


In [48]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Separate the training frame into the label column and the predictor columns.
y = crime_train['Property_Crimes']
x = crime_train.drop('Property_Crimes', axis=1)

In [49]:
# Both estimators involve randomness while fitting, so give each a fixed seed;
# without it the fitted models and their reported accuracies differ between runs.
cart01 = DecisionTreeClassifier(max_leaf_nodes=10, random_state=42).fit(x, y)
rf01 = RandomForestClassifier(n_estimators=10, criterion="gini", random_state=42).fit(x, y)

In [50]:
# In-sample predictions: both models predict on the same features they were fit on.
prediction_cart01, prediction_rf01 = cart01.predict(x), rf01.predict(x)

In [51]:
# Number of non-null training labels, followed by the share of each crime class.
train_labels = crime_train['Property_Crimes']
print(train_labels.count())
train_labels.value_counts(normalize=True)

1986


Burglary/B&E                                 0.790030
Motor Vehicle Theft                          0.165156
Theft of Motor Vehicle Parts from Vehicle    0.037261
Arson                                        0.006546
Shoplifting                                  0.001007
Name: Property_Crimes, dtype: float64

In [52]:
# create a function that we can use to check how our predictions compare to the actual values
def eval_prediction(pred, actual):
    """Count the positions at which a prediction equals the true label.

    Parameters
    ----------
    pred : iterable
        Predicted labels, in the same order as ``actual``.
    actual : iterable
        True labels.

    Returns
    -------
    int
        Number of positions where ``pred`` and ``actual`` hold equal values
        (0 when both are empty).

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of elements.
    """
    # Materialise both inputs so the pairing is strictly positional. Indexing
    # with ``pred[i]`` would perform a *label* lookup on a pandas Series, which
    # fails or misaligns when the index is not 0..n-1 (e.g. after a shuffle).
    predicted_values = list(pred)
    actual_values = list(actual)

    # A length mismatch means the comparison is meaningless; fail loudly
    # instead of silently ignoring the surplus elements.
    if len(predicted_values) != len(actual_values):
        raise ValueError(
            "pred and actual must have the same length "
            "(got {0} and {1})".format(len(predicted_values), len(actual_values))
        )

    return sum(1 for guess, truth in zip(predicted_values, actual_values) if guess == truth)

In [53]:
# Number of training rows each model classified correctly.
for model_label, model_predictions in (("CART:", prediction_cart01), ("Random Forest:", prediction_rf01)):
    print(model_label, eval_prediction(model_predictions, y))

CART: 1893
Random Forest: 1985


In [54]:
# Training accuracy of each model as a percentage with two decimal places.
cart_correct = eval_prediction(prediction_cart01, y)
rf_correct = eval_prediction(prediction_rf01, y)
print("CART:", f"{cart_correct / len(x) * 100:.2f}", "%")
print("Random Forest:", f"{rf_correct / len(x) * 100:.2f}", "%")

CART: 95.32 %
Random Forest: 99.95 %


In [55]:
# Now we want to see how well our models perform on the held-out testing set.
x_test = crime_test.drop(columns='Property_Crimes')
y_test = crime_test['Property_Crimes']

# Label each accuracy so the two numbers cannot be confused with one another
# (Random Forest is reported first here).
print("Random Forest:", rf01.score(x_test, y_test))
print("CART:", cart01.score(x_test, y_test))

0.9818913480885312
0.9577464788732394


In [36]:
#After further evaluation, our Random Forest model achieves noticeably higher accuracy than our CART model on the testing set.
#That is to be expected, as the Random Forest model is much more complex: it combines several trees, whereas the CART model is a single small tree.
#The CART model, on the other hand, is simple enough to help us better understand whether or not our variables are improving the accuracy of our predictions.



In [56]:
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial Naive Bayes classifier on the training data, then report
# its accuracy on the training set followed by the testing set.
# NOTE(review): x appears to mix one-hot indicators with large-valued numeric
# columns (income, population, age) - confirm MultinomialNB suits these features.
nb_model = MultinomialNB().fit(x, y)

nb_train_accuracy = nb_model.score(x, y)
nb_test_accuracy = nb_model.score(x_test, y_test)
print(nb_train_accuracy)
print(nb_test_accuracy)

0.31017119838872104
0.3460764587525151


In [57]:
# Cross-tabulate actual against predicted classes for the Naive Bayes model.
y_predicted = nb_model.predict(x_test)
ypred = pd.crosstab(index=y_test, columns=y_predicted, rownames=['Actual'], colnames=['Predicted'])

# Append the margins: a per-row total column first, then a per-column total row
# (which therefore also sums the new 'Total' column).
ypred['Total'] = ypred.sum(axis='columns')
ypred.loc['Total'] = ypred.sum(axis='index')
ypred

Predicted,Arson,Burglary/B&E,Shoplifting,Theft of Motor Vehicle Parts from Vehicle,Total
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Arson,3,0,0,0,3
Burglary/B&E,115,169,93,6,383
Motor Vehicle Theft,61,0,29,2,92
Theft of Motor Vehicle Parts from Vehicle,10,0,9,0,19
Total,189,169,131,8,497
