In [0]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pyproj as pp
import seaborn as sns
import scipy as sp
import datetime
from datetime import datetime, timedelta
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
%matplotlib inline

ModuleNotFoundError: ignored

## Cleaning the Dataset

In [0]:
clean = pd.read_csv('nycrime.csv')

# Cleaning year to set up for datetime format.
# The last four characters of each date string are treated as the year; the
# steps below overwrite individual year digits before the dates are parsed.
for date_col in ('CMPLNT_FR_DT', 'CMPLNT_TO_DT'):
    dates = clean[date_col]
    # Force the thousands digit to "2" and the hundreds digit to "0".
    dates = dates.str.slice_replace(start = -4, stop = -3, repl = "2")
    dates = dates.str.slice_replace(start = -3, stop = -2, repl = "0")
    # Zero the tens digit only on the rows where it is not already "1".
    # The earlier `.all() != "1"` test reduced the whole column to a single
    # value, so the replacement was applied to every row or to none of them.
    tens_not_one = dates.str.slice(-2, -1) != "1"
    clean[date_col] = dates.mask(tens_not_one, dates.str.slice_replace(start = -2, stop = -1, repl = "0"))

In [0]:
# Join each date column with its time column and parse the result;
# values that cannot be parsed become NaT instead of raising.
for date_col, time_col, stamp_col in (
    ('CMPLNT_FR_DT', 'CMPLNT_FR_TM', 'CMPLNT_FR_DTM'),
    ('CMPLNT_TO_DT', 'CMPLNT_TO_TM', 'CMPLNT_TO_DTM'),
):
    stamp_text = clean[date_col] + " " + clean[time_col]
    clean[stamp_col] = pd.to_datetime(stamp_text, infer_datetime_format = True, errors = 'coerce')

# Columns that none of the questions below use
bad_columns = [
    "RPT_DT", "CMPLNT_FR_DT", "CMPLNT_FR_TM", "CMPLNT_TO_DT", "CMPLNT_TO_TM",
    "PD_CD", "PD_DESC", "LAW_CAT_CD", "LOC_OF_OCCUR_DESC", "JURIS_DESC",
    "JURISDICTION_CODE", "PARKS_NM", "HADEVELOPT", "HOUSING_PSA", "X_COORD_CD",
    "Y_COORD_CD", "SUSP_RACE", "TRANSIT_DISTRICT", "SUSP_SEX", "Lat_Lon",
    "PATROL_BORO", "STATION_NAME", "VIC_AGE_GROUP", "VIC_RACE", "VIC_SEX",
]
clean = clean.drop(bad_columns, axis = 1)

# Persist the cleaned frame; every question section reloads this file
clean.to_csv('clean_nycrime.csv', index = False)

## Question A
### What is the age group that committed the most burglaries?

In [0]:
# Load the cleaned complaints file written by the cleaning section
dataA = pd.read_csv('clean_nycrime.csv')
# Preview the first rows
dataA.head()

In [0]:
# List the distinct suspect age group labels present before filtering
dataA['SUSP_AGE_GROUP'].unique()

In [0]:
# Labels in SUSP_AGE_GROUP that are not usable age groups
badAges = ['UNKNOWN', '937', '-979', '-975', '946', '972', '-948', '-978', '926', '-972',
             '-976', '-969', '-962', '-935', '-1', '-963', '-974', '945', '933', '931',
             '-973', '-970', '936', '-971', '2015', '-65', '999', '-49', '1014', '935',
             '966', '-2', '2016', '-965', '1932', '-54', '923', '-67', '327', '-83',
             '1016', '-955', '2017', '-981', '1017', '949', '-953', '944', '-42', '-968',
             '-960', '1933', '927', '942', '940', '-72', '934', '1053', '-941', '2018',
             '1018', '-80', '955', '948', '924', '-939', '922', '1012', '-63', '928',
             '952', '920', '938', '954', '1967', '810', '915', '711', '914', '324',
             '925', '808', '809', '309', '814', '-985', '-980', '1007', '708', '709']
# Turn the unwanted labels into nulls ...
dataA['SUSP_AGE_GROUP'] = dataA['SUSP_AGE_GROUP'].replace(to_replace = badAges, value = np.nan)
# ... and keep only the rows that still carry a suspect age group
dataA = dataA.dropna(subset = ['SUSP_AGE_GROUP'])
# Show the labels that survived
dataA['SUSP_AGE_GROUP'].unique()

In [0]:
# Inspect the (rows, columns) shape of the burglary subset
dataA[dataA['OFNS_DESC'] == "BURGLARY"].shape

In [0]:
# Every column except offense description and suspect age group
col = [
    "CMPLNT_NUM", "ADDR_PCT_CD", "KY_CD", "CRM_ATPT_CPTD_CD", "BORO_NM",
    "PREM_TYP_DESC", "Latitude", "Longitude", "CMPLNT_FR_DTM", "CMPLNT_TO_DTM",
]
# Remove them so only the two columns needed for this question remain
dataA = dataA.drop(col, axis = 1)

In [0]:
# Keep only the burglary rows; rows with any missing value are discarded
burglary = dataA['OFNS_DESC'] == "BURGLARY"
burgAge = dataA[burglary].dropna()
# Count complaints per suspect age group, most frequent first.
# Series.value_counts is used because the top-level pd.value_counts is deprecated.
burgAge['SUSP_AGE_GROUP'].value_counts().head()

## Question B
### Is there any correlation between the time of the offense and the type of offense that was committed?

In [0]:
dataB = pd.read_csv('clean_nycrime.csv')

#remove all rows where FROM time is missing
# .copy() makes the result an independent frame, so the column assignments
# below do not write into a slice of the loaded data
dataB = dataB[dataB['CMPLNT_FR_DTM'].notnull()].copy()

# Parse the FROM timestamps once for the whole column instead of per row
fr_dtm = pd.to_datetime(dataB['CMPLNT_FR_DTM'], format = "%Y-%m-%d %H:%M:%S")

#remove all rows where window of time of offense is greater than three hours
# The limit is stored as text in the same format as CMPLNT_TO_DTM so the two
# columns are compared as strings
dataB['three_hours_after_start'] = (fr_dtm + pd.Timedelta(hours = 3)).dt.strftime("%Y-%m-%d %H:%M:%S")
keep = (dataB['CMPLNT_TO_DTM'].isnull()) | (dataB['CMPLNT_TO_DTM'] < dataB['three_hours_after_start'])
dataB = dataB[keep].copy()
fr_dtm = fr_dtm[keep]

#create new time column from FROM time, represented as HH.MM so that matplotlib can interpret it
dataB['time'] = fr_dtm.dt.hour + .01 * fr_dtm.dt.minute

#combine similar offense descriptions
dataB['OFNS_DESC'] = dataB['OFNS_DESC'].replace({
    'ADMINISTRATIVE CODES': 'ADMINISTRATIVE CODE',
    'INTOXICATED & IMPAIRED DRIVING': 'INTOXICATED/IMPAIRED DRIVING',
    'KIDNAPPING & RELATED OFFENSES': 'KIDNAPPING AND RELATED OFFENSES',
    'KIDNAPPING': 'KIDNAPPING AND RELATED OFFENSES',
    'OTHER TRAFFIC INFRACTION': 'VEHICLE AND TRAFFIC LAWS',
})
#remove "misc" categories that aren't very meaningful
misc_categories = [
    'NYS LAWS-UNCLASSIFIED FELONY',
    'NYS LAWS-UNCLASSIFIED VIOLATION',
    'OTHER STATE LAWS',
    'OTHER STATE LAWS (NON PENAL LA',
    'OTHER STATE LAWS (NON PENAL LAW)',
]
dataB = dataB[~dataB['OFNS_DESC'].isin(misc_categories)]

#create and display a list of remaining categories of offenses
categories = sorted(dataB['OFNS_DESC'].dropna().unique())
for x in categories:
    print(x)

In [0]:
def plotB(category): #this plots a single histogram for a given category
    """Draw an hourly histogram of offense start times for one category.

    Reads the module-level ``dataB`` frame, selects the rows whose
    ``OFNS_DESC`` equals ``category`` and plots their ``time`` values in a
    new figure titled with the category name.
    """
    hourly_edges = range(0, 25)
    selected_times = dataB.loc[dataB['OFNS_DESC'] == category, ['time']]
    plt.figure()
    plt.hist(selected_times.values, bins = hourly_edges)
    #one tick per bin edge
    plt.xticks(hourly_edges)
    plt.title(category)

# Draw one histogram figure per entry of the `categories` list
for category in categories: #plot for all categories
    plotB(category)

## Question C
### Is there any noted correlation between the type of offense and the location of the offense?

In [0]:
# Load the cleaned complaints and keep only what the location question needs
dataC = pd.read_csv('clean_nycrime.csv')
irrCol = ["CMPLNT_NUM", "KY_CD", "CRM_ATPT_CPTD_CD", "PREM_TYP_DESC",
          "SUSP_AGE_GROUP", "CMPLNT_FR_DTM", "CMPLNT_TO_DTM"]
dataC = dataC.drop(irrCol, axis = 1)
# A row is usable only if it has both coordinates and an offense description
dataC = dataC.dropna(subset = ['Latitude', 'Longitude', 'OFNS_DESC'])

In [0]:
# Projection object for EPSG:2263
ny_st = pp.Proj(init = "EPSG:2263", preserve_units = True)
# Plain arrays of the geographic coordinates, in dataC row order
xx = np.asarray(dataC['Longitude'])
yy = np.asarray(dataC['Latitude'])

In [0]:
# Project the (longitude, latitude) arrays with ny_st
conv = ny_st(xx, yy)
lon = conv[0]
lat = conv[1]
# Assign the arrays directly so values are matched to rows by position.
# Wrapping them in pd.Series gave them a fresh 0..n-1 index, and column
# assignment aligns on index labels: because dataC's index has gaps after
# dropna, coordinates landed on the wrong rows and trailing rows became NaN.
dataC['SPAT_LON'] = lon
dataC['SPAT_LAT'] = lat

In [0]:
# Order by offense description, then discard rows lacking either projected coordinate
dataC = dataC.sort_values(by = 'OFNS_DESC').dropna(subset = ['SPAT_LON', 'SPAT_LAT'])

In [0]:
# Number of missing projected longitudes (computed, but not shown because
# it is not the last expression of the cell)
dataC['SPAT_LON'].isna().sum().sum()
# The geographic coordinates are no longer needed once the projected ones exist
lonLat = ["Latitude", "Longitude"]
dataC = dataC.drop(lonLat, axis = 1)

In [0]:
# RUN R CODE FIRST TO GET THIS FILE!!!
# boro_shape.csv is not written anywhere in this notebook; its 'long' and
# 'lat' columns are read in the next cell.
shape = pd.read_csv("boro_shape.csv")

In [0]:
# Extremes of the shape file's coordinates define a bounding box
lon_min, lon_max = shape['long'].min(), shape['long'].max()
lat_min, lat_max = shape['lat'].min(), shape['lat'].max()

# Strict comparisons: points exactly on the box edge are excluded
lessLon = dataC['SPAT_LON'].gt(lon_min)
greaterLon = dataC['SPAT_LON'].lt(lon_max)
lessLat = dataC['SPAT_LAT'].gt(lat_min)
greaterLat = dataC['SPAT_LAT'].lt(lat_max)
inside_box = lessLon & greaterLon & lessLat & greaterLat
# Rows outside the box are blanked by where() and then removed by dropna(),
# which also removes any in-box row that has a missing value
new_dataC = dataC.where(inside_box).dropna()

In [0]:
# Ten most frequent offense descriptions among the in-box complaints.
# The frame is new_dataC; the previous name `new_data` was never defined.
freq_crimes = new_dataC['OFNS_DESC'].value_counts().head(10).index.tolist()
print(freq_crimes)

In [0]:
# One boolean mask per entry of freq_crimes[0..9], in list order
offense = new_dataC['OFNS_DESC']
pl, h2, a3, cm, gl, dd, po, fa, rob, burg = (
    offense == freq_crimes[rank] for rank in range(10)
)
# Keep the rows matching any of the ten offenses; dropna() also removes
# matching rows that contain a missing value
final_dataC = new_dataC.where(offense.isin(freq_crimes[:10])).dropna()

In [0]:
# Distinct offenses left after filtering (not displayed: this is not the cell's last expression)
final_dataC['OFNS_DESC'].unique()
# Write the filtered complaints to nycrimemap.csv without the index column
final_dataC.to_csv("nycrimemap.csv", index = False)

### The code for plotting with exported csv is continued in crimemap.R

## Question D
### Were there any rises in specific crimes on certain holidays?

In [0]:
# Load the cleaned complaints for Question D.
# NOTE(review): the same file is read into `data` again, with low_memory=False, a few cells below.
data = pd.read_csv('clean_nycrime.csv')

In [0]:
from datetime import datetime, timedelta

In [0]:
# KY_CD values of the five offenses compared in the holiday cells below
# (each constant is matched against data['KY_CD'])
PETIT_LARCENY_KEY = 341
HARASSMENT_2_KEY = 578
ASSAULT_3_KEY = 344
CRIMINAL_MISCHIEF_KEY = 351
GRAND_LARCENY_KEY = 109

In [0]:
# Reload the cleaned complaints, passing low_memory=False to the CSV reader
data = pd.read_csv('clean_nycrime.csv', low_memory=False)
# Preview the first rows
data.head()

In [0]:
# Non-null counts per offense description, showing the five offenses with the most CMPLNT_NUM entries
data.groupby('OFNS_DESC').count().sort_values(by=['CMPLNT_NUM'], ascending=False).head(5)

In [0]:
def normalize_dates(x):
    """Parse a ``'%Y-%m-%d %H:%M:%S'`` string and move it to noon of the same day in 2016.

    The month and day of the result come from ``x``; the year is forced to
    2016 and the time of day to 12:00:00.

    Raises:
        ValueError: if ``x`` does not match the expected format.
    """
    parsed = datetime.strptime(x, '%Y-%m-%d %H:%M:%S')
    return datetime(2016, parsed.month, parsed.day, 12, 0, 0)

# Map every complaint start onto one shared calendar year via normalize_dates;
# rows without a start timestamp get no normalized date
has_start = data['CMPLNT_FR_DTM'].notna()
data['normalized_date'] = data.loc[has_start, 'CMPLNT_FR_DTM'].apply(normalize_dates)

In [0]:
# Half-width of the window kept around each holiday
ten_days = pd.Timedelta(days=10)
# normalize_dates puts every date in 2016, so the 2015 anchor is what
# matches the first days of January
christmas_16 = pd.Timestamp(2016, 12, 25)
christmas_15 = pd.Timestamp(2015, 12, 25)

# Independence Day is 4 July; the month was previously entered as 8 (August)
july_fourth = pd.Timestamp(2016, 7, 4)

# Flag rows whose normalized date lies within ten days of either anchor.
# Rows without a normalized date are left out and stay NaN after assignment.
valid_dates = data.loc[data['normalized_date'].notna(), 'normalized_date']
data['is_near_christmas'] = (
    ((valid_dates - christmas_16).abs() <= ten_days)
    | ((valid_dates - christmas_15).abs() <= ten_days)
)


In [0]:
# Two anchors, one at each end of the normalized year
new_years_16 = pd.Timestamp(2016, 1, 1)
new_years_17 = pd.Timestamp(2017, 1, 1)

# True when the normalized date is at most ten_days from either anchor;
# rows without a normalized date are skipped
dated = data.loc[data['normalized_date'].notna(), 'normalized_date']
near_start = (dated - new_years_16).abs() <= ten_days
near_end = (dated - new_years_17).abs() <= ten_days
data['is_near_newyears'] = near_start | near_end

In [0]:
july_fourth = pd.Timestamp(2016, 7, 4)

# True when the normalized date is at most ten_days from 4 July;
# rows without a normalized date are skipped
dated = data.loc[data['normalized_date'].notna(), 'normalized_date']
data['is_near_july_fourth'] = (dated - july_fourth).abs() <= ten_days

In [0]:
# NOTE(review): Thanksgiving moves between 22 and 28 November from year to
# year - confirm that 28 November is the intended anchor for the pooled data
thanksgiving = pd.Timestamp(2016, 11, 28)

# True when the normalized date is at most ten_days from the anchor;
# rows without a normalized date are skipped
dated = data.loc[data['normalized_date'].notna(), 'normalized_date']
data['is_near_thanksgiving'] = (dated - thanksgiving).abs() <= ten_days

In [0]:
# Petit Larceny, Harassment, Assault, Criminal Mischief, Grand Larceny around Christmas

near_christmas = data['is_near_christmas'] == True
# Peek at up to five rows per date (result is not displayed mid-cell)
data[near_christmas].groupby('normalized_date').head(5)

# One frame per offense: number of distinct values in every column, per normalized date
petit_larceny = data[(data['KY_CD'] == PETIT_LARCENY_KEY) & near_christmas].groupby('normalized_date').nunique()
harassment = data[(data['KY_CD'] == HARASSMENT_2_KEY) & near_christmas].groupby('normalized_date').nunique()
assault = data[(data['KY_CD'] == ASSAULT_3_KEY) & near_christmas].groupby('normalized_date').nunique()
criminal_mischief = data[(data['KY_CD'] == CRIMINAL_MISCHIEF_KEY) & near_christmas].groupby('normalized_date').nunique()
grand_larceny = data[(data['KY_CD'] == GRAND_LARCENY_KEY) & near_christmas].groupby('normalized_date').nunique()

# Dates of the petit larceny frame as integer epoch values
as_list = petit_larceny.index.values.tolist()
# Add a fixed offset to the first three dates.
# NOTE(review): presumably this moves the early-January dates past December so
# the window sorts in calendar order - confirm the offset and the count of 3.
year_shift = 31574926 * 1000000000 + 38926000000000
for position in range(3):
    as_list[position] += year_shift

def epoch_to_dt(dt):
    """Convert a nanosecond epoch value into a naive ``datetime``.

    The value is interpreted as nanoseconds since 1970-01-01 00:00:00 and is
    converted without any timezone adjustment. ``datetime.fromtimestamp`` was
    used before; it applies the local timezone of the machine, so the
    resulting date depended on where the notebook was run.

    Args:
        dt: number of nanoseconds since the epoch (int or float).

    Returns:
        A naive ``datetime`` for that instant.
    """
    seconds = dt / 1000000000
    return datetime(1970, 1, 1) + timedelta(seconds=seconds)

def dt_obj_to_date_string(dt_obj):
    """Return ``"<month>-<day>"`` for an object exposing ``month`` and ``day``, without zero padding."""
    month_day = (dt_obj.month, dt_obj.day)
    return "{}-{}".format(*month_day)

# Shifted epoch values converted to datetime objects
as_list2 = list(map(epoch_to_dt, as_list))

# Give all five frames the shifted dates as index.
# NOTE(review): this assumes every frame has the same dates, in the same
# order, as petit_larceny - a length mismatch raises here.
crime_frames = [petit_larceny, harassment, assault, criminal_mischief, grand_larceny]
for frame in crime_frames:
    frame.index = as_list2

# Sort each frame by its new index
petit_larceny, harassment, assault, criminal_mischief, grand_larceny = (
    frame.sort_index() for frame in crime_frames
)

# Replace the datetime index with "month-day" labels for plotting
for frame in (petit_larceny, harassment, assault, criminal_mischief, grand_larceny):
    frame.index = [dt_obj_to_date_string(x) for x in frame.index]

In [0]:
# One line per offense: distinct complaint numbers per date label
for frame, crime_label in (
    (petit_larceny, 'Petit Larceny'),
    (harassment, 'Harassment'),
    (assault, 'Assault'),
    (criminal_mischief, 'Criminal Mischief'),
    (grand_larceny, 'Grand Larceny'),
):
    plt.plot(frame.index.values, frame['CMPLNT_NUM'], label=crime_label)

plt.xticks(rotation=45)
plt.legend()
plt.xlabel('Dates of Crime Committed')
plt.ylabel('Number of Instances of Crime Reported')
plt.title('Various Crimes Committed Around Christmas and New Years Across All Years')

# Show the date labels used on the x axis
petit_larceny.index.values

In [0]:
# Petit Larceny, Harassment, Assault, Criminal Mischief, Grand Larceny around Independence Day
near_july_fourth = data['is_near_july_fourth'] == True
# Peek at up to five rows per date (result is not displayed mid-cell)
data[near_july_fourth].groupby('normalized_date').head(5)

# One frame per offense: number of distinct values in every column, per normalized date
petit_larceny = data[(data['KY_CD'] == PETIT_LARCENY_KEY) & near_july_fourth].groupby('normalized_date').nunique()
harassment = data[(data['KY_CD'] == HARASSMENT_2_KEY) & near_july_fourth].groupby('normalized_date').nunique()
assault = data[(data['KY_CD'] == ASSAULT_3_KEY) & near_july_fourth].groupby('normalized_date').nunique()
criminal_mischief = data[(data['KY_CD'] == CRIMINAL_MISCHIEF_KEY) & near_july_fourth].groupby('normalized_date').nunique()
grand_larceny = data[(data['KY_CD'] == GRAND_LARCENY_KEY) & near_july_fourth].groupby('normalized_date').nunique()

# Replace each date index with "month-day" labels for plotting
for frame in (petit_larceny, harassment, assault, criminal_mischief, grand_larceny):
    frame.index = [dt_obj_to_date_string(x) for x in frame.index]

In [0]:
# One line per offense: distinct complaint numbers per date label
for frame, crime_label in (
    (petit_larceny, 'Petit Larceny'),
    (harassment, 'Harassment'),
    (assault, 'Assault'),
    (criminal_mischief, 'Criminal Mischief'),
    (grand_larceny, 'Grand Larceny'),
):
    plt.plot(frame.index.values, frame['CMPLNT_NUM'], label=crime_label)

plt.xticks(rotation=45)
plt.legend()
plt.xlabel('Dates of Crime Committed')
plt.ylabel('Number of Instances of Crime Reported')
plt.title('Various Crimes Committed Around Independence Day Across All Years')

In [0]:
# Petit Larceny, Harassment, Assault, Criminal Mischief, Grand Larceny around Thanksgiving

near_thanksgiving = data['is_near_thanksgiving'] == True
# Peek at up to five rows per date (result is not displayed mid-cell)
data[near_thanksgiving].groupby('normalized_date').head(5)

# One frame per offense: number of distinct values in every column, per normalized date
petit_larceny = data[(data['KY_CD'] == PETIT_LARCENY_KEY) & near_thanksgiving].groupby('normalized_date').nunique()
harassment = data[(data['KY_CD'] == HARASSMENT_2_KEY) & near_thanksgiving].groupby('normalized_date').nunique()
assault = data[(data['KY_CD'] == ASSAULT_3_KEY) & near_thanksgiving].groupby('normalized_date').nunique()
criminal_mischief = data[(data['KY_CD'] == CRIMINAL_MISCHIEF_KEY) & near_thanksgiving].groupby('normalized_date').nunique()
grand_larceny = data[(data['KY_CD'] == GRAND_LARCENY_KEY) & near_thanksgiving].groupby('normalized_date').nunique()

# Replace each date index with "month-day" labels for plotting
for frame in (petit_larceny, harassment, assault, criminal_mischief, grand_larceny):
    frame.index = [dt_obj_to_date_string(x) for x in frame.index]

In [0]:
# One line per offense: distinct complaint numbers per date label
for frame, crime_label in (
    (petit_larceny, 'Petit Larceny'),
    (harassment, 'Harassment'),
    (assault, 'Assault'),
    (criminal_mischief, 'Criminal Mischief'),
    (grand_larceny, 'Grand Larceny'),
):
    plt.plot(frame.index.values, frame['CMPLNT_NUM'], label=crime_label)

plt.xticks(rotation=45)
plt.legend(loc='upper left')
plt.xlabel('Dates of Crime Committed')
plt.ylabel('Number of Instances of Crime Reported')
plt.title('Various Crimes Committed Around Thanksgiving Across All Years')

## Question E
### Are there any specific date ranges or time ranges where crime is more prevalent in one borough than in the others?

In [0]:
# Load the cleaned complaints for Question E and preview the first rows
dataE=pd.read_csv('clean_nycrime.csv')
dataE.head()

In [0]:
#filter columns down to the ones we need for this question
boroughs = dataE[['BORO_NM','OFNS_DESC','CMPLNT_FR_DTM','CMPLNT_TO_DTM']]
boroughs.head()

In [0]:
# Number of complaint rows per borough
boroughs_group = boroughs.groupby('BORO_NM')
boroughs_group.size()

In [0]:
# Complaints per borough per calendar month.
# The earlier version grouped by the exact timestamp and called unstack()
# with no argument, which pivots the timestamps into columns: the plot then
# had the boroughs on the x axis and one line per distinct timestamp.
# Grouping by month and pivoting the borough level gives one line per
# borough over time instead.
borough_start = pd.to_datetime(boroughs['CMPLNT_FR_DTM'], errors = 'coerce')
borough_month = borough_start.dt.to_period('M').rename('month')
bgroup2 = boroughs.groupby([borough_month, 'BORO_NM']).size()
# Months in which a borough has no complaints count as 0 rather than NaN
bgroup2 = bgroup2.unstack('BORO_NM', fill_value = 0)
# matplotlib needs real timestamps on the x axis, not periods
bgroup2.index = bgroup2.index.to_timestamp()
borough_lines = plt.plot(bgroup2.index, bgroup2.to_numpy())
plt.legend(borough_lines, bgroup2.columns)
plt.xlabel('Month of Complaint Start')
plt.ylabel('Number of Complaints')
plt.title('Complaints per Month by Borough')

## Question F
### What crimes are easiest to get away with?

In [0]:
# Load the cleaned complaints for Question F
dataF = pd.read_csv('clean_nycrime.csv')

In [0]:
# List the distinct suspect age group labels in the unfiltered data
dataF['SUSP_AGE_GROUP'].unique()

In [0]:
# Labels that count as usable suspect information
age_groups = ['<18', '18-24', '25-44', '45-64', '65+']
# Offenses included in the comparison
significant_crimes = ['PETIT LARCENY', 'HARRASSMENT 2', 'GRAND LARCENY', 'CRIMINAL MISCHIEF & RELATED OF', 'ASSAULT 3 & RELATED OFFENSES', 'ROBBERY', 'OFF. AGNST PUB ORD SENSBLTY &', 'FELONY ASSAULT', 'BURGLARY']
pd.set_option('display.max_row', 100)

in_significant = dataF['OFNS_DESC'].isin(significant_crimes)
known_age = dataF['SUSP_AGE_GROUP'].isin(age_groups)
has_suspect_info = dataF[known_age & in_significant]
has_no_suspect_info = dataF[~known_age & in_significant]
# size() counts rows. count() only counts non-null values, so it skipped the
# complaints whose SUSP_AGE_GROUP is missing - the very rows that have no
# suspect information.
x = has_no_suspect_info.groupby('OFNS_DESC').size().to_frame('SUSP_AGE_GROUP')
y = has_suspect_info.groupby('OFNS_DESC')[['SUSP_AGE_GROUP']].count()

# Percentage of complaints WITHOUT suspect information, highest first.
# combine() passes x's column first and y's second; the earlier lambda named
# them the other way round and therefore reported the share WITH information.
x.combine(y, lambda no_info, with_info: (no_info/(no_info+with_info))*100).sort_values(by='SUSP_AGE_GROUP', ascending=False)

In [0]:
# Show the per-offense counts held in x, largest first
x.sort_values(by='SUSP_AGE_GROUP', ascending=False)

In [0]:
# Split the significant offenses into completed and attempted complaints
in_significant = dataF['OFNS_DESC'].isin(significant_crimes)
completed = dataF[dataF['CRM_ATPT_CPTD_CD'].eq('COMPLETED') & in_significant]
attempted = dataF[dataF['CRM_ATPT_CPTD_CD'].eq('ATTEMPTED') & in_significant]

# Complaint counts per offense for each outcome
x = completed.groupby('OFNS_DESC')[['CRM_ATPT_CPTD_CD']].count()
y = attempted.groupby('OFNS_DESC')[['CRM_ATPT_CPTD_CD']].count()


def _completion_rate(done, tried):
    """Percentage of complaints that were completed rather than attempted."""
    return (done / (done + tried)) * 100


# Lowest completion rate first
x.combine(y, _completion_rate).sort_values(by='CRM_ATPT_CPTD_CD')

## Question G
### What crime will be most likely to be committed on a given day in the future?

In [0]:
# Load the cleaned complaints for Question G
ds = pd.read_csv('clean_nycrime.csv')
# True for rows that have at least one of the two complaint timestamps
good_entries = ds[['CMPLNT_TO_DTM', 'CMPLNT_FR_DTM']].notna().any(axis='columns')

In [0]:
# Reference instant from which complaint times are measured
start_dtm = datetime(year = 2006, month = 1, day = 1, hour = 0, minute = 0)
# Width of one time bucket: a single day
td_group = timedelta(days = 1)

In [0]:
# .copy() makes new_ds an independent frame, so the column assignments below
# do not write into a slice of ds
new_ds = ds.loc[good_entries].copy()
# Use the FROM timestamp, falling back to the TO timestamp when FROM is missing
new_ds['CMPLNT_DTM'] = new_ds['CMPLNT_FR_DTM'].fillna(new_ds['CMPLNT_TO_DTM'])
# Whole td_group buckets (days) elapsed since start_dtm.
# Floor division keeps a timestamp from the day before start_dtm at -1;
# int() truncated toward zero and put those complaints into day 0.
new_ds['CMPLNT_TD'] = new_ds['CMPLNT_DTM'].apply(lambda x: (datetime.fromisoformat(x) - start_dtm) // td_group)

In [0]:
# Keep day indexes from 0 up to and including 4150
cleanish_ds = new_ds.loc[new_ds['CMPLNT_TD'].ge(0)]
clean_ds = cleanish_ds.loc[cleanish_ds['CMPLNT_TD'].le(4150)]

In [0]:
# The ten most frequent KY_CD codes
desired_crime_codes = clean_ds['KY_CD'].value_counts().index[:10].values
# Keep those codes together with the day index
top_codes_mask = clean_ds['KY_CD'].isin(desired_crime_codes)
refined_ds = clean_ds.loc[top_codes_mask, ['KY_CD', 'CMPLNT_TD']]
# One indicator column per code, replacing the KY_CD column itself
code_dummies = pd.get_dummies(refined_ds['KY_CD'], prefix='KY_CD')
df = pd.concat([refined_ds, code_dummies], axis=1).drop(columns=['KY_CD'])
print(desired_crime_codes)

In [0]:
# Readable names for the indicator columns
crime_dict = {
    'KY_CD_351': 'CRIMINAL MISCHIEF & RELATED OF',
    'KY_CD_341': 'PETIT LARCENY',
    'KY_CD_578': 'HARRASSMENT 2',
    'KY_CD_344': 'ASSAULT 3 & RELATED OFFENSES',
    'KY_CD_109': 'GRAND LARCENY',
    'KY_CD_361': 'OFF. AGNST PUB ORD SENSBLTY &',
    'KY_CD_235': 'DANGEROUS DRUGS',
    'KY_CD_105': 'ROBBERY',
    'KY_CD_106': 'FELONY ASSAULT',
    'KY_CD_107': 'BURGLARY',
}
# Sum the indicators per day index to get complaints per offense per day
agg_df = df.groupby('CMPLNT_TD').sum().rename(columns=crime_dict)
agg_df.head()

In [0]:
# Day index is the single predictor; the per-offense daily counts are the targets
daily = agg_df.reset_index()
targets = daily.drop(columns=['CMPLNT_TD'])
predictors = daily['CMPLNT_TD']
# Hold out 10% of the days, shuffled with a fixed seed
train_preds, test_preds, train_targets, test_targets = train_test_split(
    predictors, targets, test_size=0.1, random_state=7, shuffle=True
)

In [0]:
# Report the shape of each part of the split
for caption, part in (
    ('Train_Preds Shape: ', train_preds),
    ('Train_Targets Shape: ', train_targets),
    ('Test_Preds Shape: ', test_preds),
    ('Test_Targets Shape: ', test_targets),
):
    print(caption, part.shape)

# Turn the 1-D predictor series into single-column 2-D arrays.
# Re-running this cell fails, because the arrays no longer have to_numpy().
train_preds = np.reshape(train_preds.to_numpy(), (-1, 1))
test_preds = np.reshape(test_preds.to_numpy(), (-1, 1))

In [0]:
# Candidate network layouts and batch sizes for the grid search
mlp_params = [{'hidden_layer_sizes': [[100], [200], [50, 100], [20, 100, 20]], 'batch_size': [1, 8, 32]}]
# Seed the network with the same value (7) used for the train/test split and
# the random forest; without it the weights are initialised differently on
# every run and the reported losses are not reproducible.
mlp_gs = GridSearchCV(MLPRegressor(random_state=7), mlp_params, n_jobs=-1, verbose=1)
mlp_gs.fit(train_preds, train_targets)

In [0]:
# Best hyper-parameter combination found by the grid search
print(mlp_gs.best_params_)
mlp_gs_predictions = mlp_gs.predict(test_preds)
# Arguments are given as (actual, predicted); both metrics are symmetric in
# their two inputs, so the values are unchanged by the order
mlp_gs_loss_mse = mean_squared_error(test_targets, mlp_gs_predictions)
mlp_gs_loss_mae = mean_absolute_error(test_targets, mlp_gs_predictions)
for loss in (mlp_gs_loss_mse, mlp_gs_loss_mae):
    print(loss)

In [0]:
# The test split is shuffled, so the points are put in day order before
# plotting; drawing lines through them in shuffled order produced zig-zags
# across the whole x range.
order = np.argsort(np.asarray(test_preds).ravel())
days_sorted = np.asarray(test_preds).ravel()[order]
actual_sorted = np.asarray(test_targets)[order]
predicted_sorted = np.asarray(mlp_gs_predictions)[order]

fig, axs = plt.subplots(2)
plt.subplots_adjust(hspace = 0.7, right = 0.7)
fig.set_size_inches(10, 5, forward=True)
fig.suptitle('Multilayer Perceptron Regressor Performance')
# Top panel: observed daily counts, one line per offense
axs[0].plot(days_sorted, actual_sorted)
axs[0].set_title('Actual Values')
axs[0].set(xlabel='Days Since 01-01-06T00:00:00', ylabel='Number of Crime Reports')
# Bottom panel: model output for the same days
axs[1].plot(days_sorted, predicted_sorted)
axs[1].set_title('Predicted Values')
axs[1].set(xlabel='Days Since 01-01-06T00:00:00', ylabel='Number of Crime Reports')
# Iterating the targets frame yields its column names, which label the lines
fig.legend(test_targets, loc = 'right')

In [0]:
# Random forest with 3000 trees and a fixed seed (n_jobs=-1, verbose=1),
# fitted on the same training split as the MLP
rf = RandomForestRegressor(n_estimators=3000, random_state=7, n_jobs=-1, verbose=1)
rf.fit(train_preds, train_targets)

In [0]:
rf_predictions = rf.predict(test_preds)
# Arguments are given as (actual, predicted); both metrics are symmetric in
# their two inputs, so the values are unchanged by the order
rf_loss_mse = mean_squared_error(test_targets, rf_predictions)
rf_loss_mae = mean_absolute_error(test_targets, rf_predictions)
for loss in (rf_loss_mse, rf_loss_mae):
    print(loss)

In [0]:
# The test split is shuffled, so the points are put in day order before
# plotting; drawing lines through them in shuffled order produced zig-zags
# across the whole x range.
order = np.argsort(np.asarray(test_preds).ravel())
days_sorted = np.asarray(test_preds).ravel()[order]
actual_sorted = np.asarray(test_targets)[order]
predicted_sorted = np.asarray(rf_predictions)[order]

fig, axs = plt.subplots(2)
plt.subplots_adjust(hspace = 0.7, right = 0.7)
fig.set_size_inches(10, 5, forward=True)
fig.suptitle('Random Forest Regressor Performance')
# Top panel: observed daily counts, one line per offense
axs[0].plot(days_sorted, actual_sorted)
axs[0].set_title('Actual Values')
axs[0].set(xlabel='Days Since 01-01-06T00:00:00', ylabel='Number of Crime Reports')
# Bottom panel: model output for the same days
axs[1].plot(days_sorted, predicted_sorted)
axs[1].set_title('Predicted Values')
axs[1].set(xlabel='Days Since 01-01-06T00:00:00', ylabel='Number of Crime Reports')
# Iterating the targets frame yields its column names, which label the lines
fig.legend(test_targets, loc = 'right')