Section 1: Importing required functions and libraries

In [2]:
import pandas as pd
import numpy as np
import datetime
import requests
import json
import random
import sqlite3
from sklearn.model_selection import train_test_split
from statsmodels.formula.api import ols
import itertools

Section 2: Connecting to the dataset via API

In [3]:
api = "https://data.cityofnewyork.us/resource/qgea-i56i.json"
response_api = requests.get(api)
data = response_api.text
raw = json.loads(data)
df = pd.DataFrame.from_records(raw)
df.head(5)


Unnamed: 0,cmplnt_num,cmplnt_fr_dt,cmplnt_fr_tm,addr_pct_cd,rpt_dt,ky_cd,ofns_desc,pd_cd,pd_desc,crm_atpt_cptd_cd,...,susp_age_group,susp_race,susp_sex,cmplnt_to_dt,cmplnt_to_tm,transit_district,station_name,parks_nm,housing_psa,hadevelopt
0,506547392,2018-03-29T00:00:00.000,20:30:00,32,2018-03-30T00:00:00.000,351,CRIMINAL MISCHIEF & RELATED OF,254,"MISCHIEF, CRIMINAL 4, OF MOTOR",COMPLETED,...,,,,,,,,,,
1,629632833,2018-02-06T00:00:00.000,23:15:00,52,2018-02-07T00:00:00.000,341,PETIT LARCENY,333,"LARCENY,PETIT FROM STORE-SHOPL",COMPLETED,...,45-64,BLACK,F,,,,,,,
2,787203902,2018-11-21T00:00:00.000,00:15:00,75,2018-11-21T00:00:00.000,341,PETIT LARCENY,321,"LARCENY,PETIT FROM AUTO",COMPLETED,...,25-44,WHITE HISPANIC,F,2018-11-21T00:00:00.000,00:20:00,,,,,
3,280364018,2018-06-09T00:00:00.000,21:42:00,10,2018-06-10T00:00:00.000,361,OFF. AGNST PUB ORD SENSBLTY &,639,AGGRAVATED HARASSMENT 2,COMPLETED,...,25-44,WHITE HISPANIC,M,2018-06-09T00:00:00.000,21:43:00,,,,,
4,985800320,2018-11-10T00:00:00.000,19:40:00,19,2018-11-10T00:00:00.000,341,PETIT LARCENY,333,"LARCENY,PETIT FROM STORE-SHOPL",COMPLETED,...,<18,BLACK HISPANIC,F,2018-11-10T00:00:00.000,19:45:00,,,,,


Section 3: Counting the number of occurrences of crimes per day and cleaning up dataframe

In [33]:
counting = df.groupby(['boro_nm','cmplnt_fr_dt']).size().reset_index(name='count')
df_count = pd.merge(df,counting,how='right',on=['boro_nm','cmplnt_fr_dt'])
#df_count['sex_coded'] = pd.factorize(df.susp_sex)[0]
#df_count['age_coded'] = pd.factorize(df.susp_age_group)[0]
df_count['weekday'] = df_count['cmplnt_fr_dt'].apply(lambda x: pd.to_datetime(x).weekday())

df_count['susp_sex'] = df_count['susp_sex'].fillna('U')
df_count['susp_age_group'] = df_count['susp_age_group'].fillna('UNKNOWN')
df_count['boro_nm'] = df_count['boro_nm'].fillna('no_name')
df_count['susp_race'] = df_count['susp_race'].fillna('UNKNOWN')

0             UNKNOWN
1             UNKNOWN
2             UNKNOWN
3      WHITE HISPANIC
4             UNKNOWN
            ...      
995             BLACK
996             BLACK
997             BLACK
998           UNKNOWN
999           UNKNOWN
Name: susp_race, Length: 1000, dtype: object

Section 4: Splitting into test, train, and validation sets

In [34]:
df_range = list(range(0,len(df)))
random.shuffle(df_range)
train = df_range[0:799]
test = df_range[800:899]
validate = df_range[900:1000]
train_df = df_count.iloc[train]
test_df = df_count.iloc[test]
validate_df = df_count.iloc[test]

Section 5: FItting and displaying model summary and accuracy

In [35]:
fit = ols('count~C(susp_sex)+C(susp_age_group)+C(weekday)+C(boro_nm)+C(susp_race)',data=train_df).fit()
print(fit.summary())
predictions = round(fit.predict(test_df))#.fillna(0)
count=0
for true,predicted in zip(test_df['count'],predictions):
    if true == predicted:
        count+=1
accuracy = count/len(predictions)
accuracy

                            OLS Regression Results                            
Dep. Variable:                  count   R-squared:                       0.054
Model:                            OLS   Adj. R-squared:                  0.026
Method:                 Least Squares   F-statistic:                     1.938
Date:                Wed, 30 Nov 2022   Prob (F-statistic):            0.00537
Time:                        19:44:08   Log-Likelihood:                -912.87
No. Observations:                 799   AIC:                             1874.
Df Residuals:                     775   BIC:                             1986.
Df Model:                          23                                         
Covariance Type:            nonrobust                                         
                                               coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------

0.29292929292929293