In [None]:
# Import Dependencies
import numpy as np
import pandas as pd
import os
import joblib

## Import and Check Chicago Crime Datasets

In [None]:
# Import 2016 - 2018 crime data (the years combined into the training frame)
crime_2016 = os.path.join("..", "Resources", "crime_clean_2016.csv")
crime_2017 = os.path.join("..", "Resources", "crime_clean_2017.csv")
crime_2018 = os.path.join("..", "Resources", "crime_clean_2018.csv")

crime_2016_df_final = pd.read_csv(crime_2016)
crime_2017_df_final = pd.read_csv(crime_2017)
crime_2018_df_final = pd.read_csv(crime_2018)

# Stack the three yearly frames into one training frame.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported replacement and builds the result in a single pass instead
# of copying the 2016+2017 rows twice.
# ignore_index is left at its default, so each year's original row labels are
# kept, exactly as the chained append calls did.
training_df = pd.concat(
    [crime_2016_df_final, crime_2017_df_final, crime_2018_df_final]
)
training_df.head()

In [None]:
# Distribution of the target column in the combined training frame
training_df["arrest"].value_counts()

In [None]:
# Load the cleaned 2019 crime records into their own frame, kept separate
# from the 2016-2018 training frame.
crime_2019 = os.path.join("..", "Resources", "crime_clean_2019.csv")
crime_2019_final_df = pd.read_csv(crime_2019)

In [None]:
# Keep only the candidate feature columns plus the 'arrest' target.
model_columns = [
    'month', 'hour', 'day_of_week', 'district', 'block', 'ward', 'beat',
    'community_area', 'location_description', 'fbi_code', 'primary_type',
    'domestic', 'arrest',
]

# .copy() makes the selection an independent frame. Columns of training_df
# are overwritten when the categorical values are label-encoded, and
# assigning into a bare column selection raises SettingWithCopyWarning on
# pandas versions without copy-on-write.
training_df = training_df[model_columns].copy()
training_df.head()

In [None]:
# scikit-learn pieces: the preprocessing and tree modules, plus the
# train/test splitting helper.
from sklearn import preprocessing, tree
from sklearn.model_selection import train_test_split

# Label encoder instance (maps each distinct value to an integer code)
le = preprocessing.LabelEncoder()

In [None]:
# Label-encode the non-numeric columns as integer codes.
# A separate encoder is fitted and kept for every column. Refitting one shared
# encoder discards each column's value -> code mapping as soon as the next
# column is fitted, which makes it impossible to decode a feature later
# (inverse_transform) or to apply the same codes to another frame.
# NOTE: this cell overwrites the columns in place, so running it a second time
# refits the encoders on the already-encoded integers.
encoded_columns = [
    'domestic',
    'block',
    'location_description',
    'primary_type',
    'fbi_code',
    'arrest',
]

label_encoders = {}
for column in encoded_columns:
    column_encoder = preprocessing.LabelEncoder()
    training_df[column] = column_encoder.fit_transform(training_df[column])
    label_encoders[column] = column_encoder

# `le` stays bound to the encoder fitted on 'arrest', the state it had after
# the last fit_transform call in the original sequence.
le = label_encoders['arrest']

In [None]:
# Separate the target column (y) from the remaining columns (X)
y = training_df["arrest"]
X = training_df.drop(columns=["arrest"])

In [None]:
# Restrict the feature matrix to the five predictors used for modelling
feature_columns = [
    'month',
    'hour',
    'day_of_week',
    'location_description',
    'primary_type',
]
X = X[feature_columns]

In [None]:
# Preview the first five rows of the training frame after label encoding
training_df.head(n=5)

In [None]:
# Counts per value of the label-encoded target column
training_df["arrest"].value_counts()

In [None]:
# Report the dimensions of the feature matrix and the target vector
for shape_label, shape_source in (('X Shape:', X), ('y Shape:', y)):
    print(shape_label, shape_source.shape)

In [None]:
# Hold out 30% of the rows for testing (70/30 split); the fixed seed makes
# the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Report the dimensions of each piece of the train/test split
for shape_label, subset in (
    ('Training X Shape:', X_train),
    ('Training y Shape:', y_train),
    ('Testing X Shape:', X_test),
    ('Testing y Shape:', y_test),
):
    print(shape_label, subset.shape)


In [None]:
# Baseline model: a single decision tree.
# random_state is pinned to the same seed as the train/test split and the
# random forest. Without it, candidate splits with equal criterion
# improvement are chosen at random, so the fitted tree and its score can
# differ from run to run.
clf = tree.DecisionTreeClassifier(random_state=42)
clf = clf.fit(X_train, y_train)

# Mean accuracy on the held-out test rows
clf.score(X_test, y_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Ensemble model: 100 trees with a fixed seed, fitted on the training rows
rf = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
).fit(X_train, y_train)

# Mean accuracy on the held-out test rows
rf.score(X_test, y_test)

In [None]:
# Pair each importance score with its column name and list the pairs from
# highest to lowest score.
importance_by_column = zip(rf.feature_importances_, X.columns)
feature_importance = sorted(importance_by_column, reverse=True)
feature_importance

In [None]:
# Persist the fitted random forest to disk with compression level 3
filename = 'RF_model.sav'
joblib.dump(value=rf, filename=filename, compress=3)

In [None]:
# Classification report for the random forest on the held-out test rows
from sklearn.metrics import classification_report

predictions = rf.predict(X_test)
report = classification_report(y_true=y_test, y_pred=predictions)
print(report)