In [92]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
import pandas as pd
import math
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from statsmodels.tools.tools import add_constant


import matplotlib.pyplot as plt
import seaborn as sns
import seaborn as snsbb


from scipy.stats import t
from scipy.stats import norm




In [93]:
pre_ts_analysis = pd.read_csv('Research_Firearm/pre_ts_analysis.csv')



In [94]:
pre_ts_analysis.columns

Index(['Year', 'Precinct', 'Full Time Positions', 'Budget', 'Borough',
       'Citizen_2010', 'Poverty Rate_2010', 'Immigration Rates 2010',
       'Budget per Capita', 'MURDER & NON NEGL. MANSLAUGHTER', 'RAPE',
       'ROBBERY', 'FELONY ASSAULT', 'BURGLARY', 'GRAND LARCENY',
       'GRAND LARCENY OF MOTOR VEHICLE', 'TOTAL SEVEN MAJOR FELONY OFFENSES',
       'Shootings', 'burglary_by_year', 'felony assault_by_year',
       'grand larceny_by_year', 'grand larceny of motor vehicle_by_year',
       'murder & non negl. manslaughter_by_year', 'rape_by_year',
       'robbery_by_year', 'total seven major felony offenses_by_year',
       'shootings_by_year', 'budget_by_year', 'Burglary_pct',
       'Burglary_per_capita', 'FELONY ASSAULT_per_capita',
       'FELONY ASSAULT_pct', 'GRAND LARCENY_per_capita', 'GRAND LARCENY_pct',
       'GRAND LARCENY OF MOTOR VEHICLE_per_capita',
       'GRAND LARCENY OF MOTOR VEHICLE_pct',
       'MURDER & NON NEGL. MANSLAUGHTER_per_capita',
       'MURDER & NO

In [95]:
# Create 'Violent Crime' column
pre_ts_analysis['Violent Crime'] = pre_ts_analysis['FELONY ASSAULT'] + pre_ts_analysis['MURDER & NON NEGL. MANSLAUGHTER'] + pre_ts_analysis['RAPE'] + pre_ts_analysis['ROBBERY']
# Create 'Property Crime' column
pre_ts_analysis['Property Crime'] = pre_ts_analysis['GRAND LARCENY'] + pre_ts_analysis['GRAND LARCENY OF MOTOR VEHICLE'] + pre_ts_analysis['BURGLARY']

In [96]:
# Calculate yearly totals
yearly_totals = pre_ts_analysis.groupby('Year').sum().reset_index()
# List of metrics for which want to calculate yearly totals
metrics = ['Violent Crime', 'Property Crime']
# Rename columns to *_by_year
yearly_totals = yearly_totals.rename(columns={metric: f"{metric.lower()}_by_year" for metric in metrics})
# Merge yearly totals with merged_data
merged_data = pd.merge(pre_ts_analysis, yearly_totals[['Year'] + [f"{metric.lower()}_by_year" for metric in metrics]], 
                       on='Year', how='left')

# List of metrics for which to calculate yearly totals and per capita rates
metrics = ['Violent Crime', 'Property Crime']
# Loop through each metric
for metric in metrics:
    # Compute the per capita rate
    per_capita_col_name = f"{metric}_per_capita"
    merged_data[per_capita_col_name] = merged_data[metric] / merged_data['Population_Year']    
    # Compute the percentage-based rate
    pct_col_name = f"{metric}_pct"
    yearly_col_name = f"{metric.lower()}_by_year"
    merged_data[pct_col_name] = merged_data[metric] / merged_data[yearly_col_name]


In [97]:
pre_ts_analysis = merged_data

In [98]:
pre_ts_analysis.groupby('Year')['Violent Crime_pct'].sum()

Year
2006    1.0
2007    1.0
2008    1.0
2009    1.0
2010    1.0
2011    1.0
2012    1.0
2013    1.0
2014    1.0
2015    1.0
2016    1.0
2017    1.0
2018    1.0
2019    1.0
2020    1.0
2021    1.0
2022    1.0
2023    1.0
Name: Violent Crime_pct, dtype: float64

In [99]:
pre_ts_analysis.columns

Index(['Year', 'Precinct', 'Full Time Positions', 'Budget', 'Borough',
       'Citizen_2010', 'Poverty Rate_2010', 'Immigration Rates 2010',
       'Budget per Capita', 'MURDER & NON NEGL. MANSLAUGHTER', 'RAPE',
       'ROBBERY', 'FELONY ASSAULT', 'BURGLARY', 'GRAND LARCENY',
       'GRAND LARCENY OF MOTOR VEHICLE', 'TOTAL SEVEN MAJOR FELONY OFFENSES',
       'Shootings', 'burglary_by_year', 'felony assault_by_year',
       'grand larceny_by_year', 'grand larceny of motor vehicle_by_year',
       'murder & non negl. manslaughter_by_year', 'rape_by_year',
       'robbery_by_year', 'total seven major felony offenses_by_year',
       'shootings_by_year', 'budget_by_year', 'Burglary_pct',
       'Burglary_per_capita', 'FELONY ASSAULT_per_capita',
       'FELONY ASSAULT_pct', 'GRAND LARCENY_per_capita', 'GRAND LARCENY_pct',
       'GRAND LARCENY OF MOTOR VEHICLE_per_capita',
       'GRAND LARCENY OF MOTOR VEHICLE_pct',
       'MURDER & NON NEGL. MANSLAUGHTER_per_capita',
       'MURDER & NO

In [100]:
# Get all columns ending with '_pct' or '_per_capita'
cols_to_lag = [col for col in pre_ts_analysis.columns if col.endswith('_pct') or col.endswith('_per_capita')]

# Loop through the columns and create a lag-1 column for each
for col in cols_to_lag:
    lag_col_name = col + '_lag1'  # Name of the new lag column
    pre_ts_analysis[lag_col_name] = pre_ts_analysis.groupby('Precinct')[col].shift(1)

In [101]:
pre_ts_analysis.columns

Index(['Year', 'Precinct', 'Full Time Positions', 'Budget', 'Borough',
       'Citizen_2010', 'Poverty Rate_2010', 'Immigration Rates 2010',
       'Budget per Capita', 'MURDER & NON NEGL. MANSLAUGHTER', 'RAPE',
       'ROBBERY', 'FELONY ASSAULT', 'BURGLARY', 'GRAND LARCENY',
       'GRAND LARCENY OF MOTOR VEHICLE', 'TOTAL SEVEN MAJOR FELONY OFFENSES',
       'Shootings', 'burglary_by_year', 'felony assault_by_year',
       'grand larceny_by_year', 'grand larceny of motor vehicle_by_year',
       'murder & non negl. manslaughter_by_year', 'rape_by_year',
       'robbery_by_year', 'total seven major felony offenses_by_year',
       'shootings_by_year', 'budget_by_year', 'Burglary_pct',
       'Burglary_per_capita', 'FELONY ASSAULT_per_capita',
       'FELONY ASSAULT_pct', 'GRAND LARCENY_per_capita', 'GRAND LARCENY_pct',
       'GRAND LARCENY OF MOTOR VEHICLE_per_capita',
       'GRAND LARCENY OF MOTOR VEHICLE_pct',
       'MURDER & NON NEGL. MANSLAUGHTER_per_capita',
       'MURDER & NO

In [102]:
# update new population data
# List of metrics for which you want to calculate yearly totals and per capita rates
metrics = ['FELONY ASSAULT', 'GRAND LARCENY', 'GRAND LARCENY OF MOTOR VEHICLE', 
           'MURDER & NON NEGL. MANSLAUGHTER', 'RAPE', 'ROBBERY', 'TOTAL SEVEN MAJOR FELONY OFFENSES', 
           'Budget', 'Violent Crime', 'Property Crime', 'BURGLARY']

# Loop through each metric
for metric in metrics:
    # Compute the per capita rate using the updated Population_Year column
    per_capita_col_name = f"{metric}_per_capita"
    pre_ts_analysis[per_capita_col_name] = pre_ts_analysis[metric] / pre_ts_analysis['Population_Year']

# List of crimes to compute per_capita_lag1_pct for
crimes = ['Burglary', 'FELONY ASSAULT', 'GRAND LARCENY', 'GRAND LARCENY OF MOTOR VEHICLE', 
          'MURDER & NON NEGL. MANSLAUGHTER', 'RAPE', 'ROBBERY', 'Violent Crime', 'Property Crime']

for crime in crimes:
    column_name = f"{crime}_per_capita_lag1"
    total_column_name = f"NYC_{column_name}_total"
    
    # 1. Compute total crime_per_capita_lag1 for each year across all precincts
    nyc_totals = pre_ts_analysis.groupby('Year')[column_name].sum().reset_index()
    nyc_totals = nyc_totals.rename(columns={column_name: total_column_name})
    
    # 2. Merge the total back to the pre_ts_analysis DataFrame
    pre_ts_analysis = pd.merge(pre_ts_analysis, nyc_totals, on='Year', how='left')
    
    # 3. Compute the percentage
    pct_column_name = f"{crime}_per_capita_lag1_pct"
    pre_ts_analysis[pct_column_name] = (pre_ts_analysis[column_name] / pre_ts_analysis[total_column_name]) 
    
    # Drop the NYC total column
    pre_ts_analysis = pre_ts_analysis.drop(total_column_name, axis=1)

In [103]:
# Compute Budget_per_capita_pct

# 1. Calculate the total Budget_per_capita for each year across all precincts
nyc_totals_budget = pre_ts_analysis.groupby('Year')['Budget_per_capita'].sum().reset_index()
nyc_totals_budget = nyc_totals_budget.rename(columns={'Budget_per_capita': 'budget_by_year'})

# 2. Merge this total with the main dataframe
pre_ts_analysis = pd.merge(pre_ts_analysis, nyc_totals_budget, on='Year', how='left')

# 3. Calculate the percentage using the correct column (budget_by_year_x in this case)
pre_ts_analysis['Budget_per_capita_pct'] = pre_ts_analysis['Budget_per_capita'] / pre_ts_analysis['budget_by_year_x']

# Compute Budget_per_capita_lag1_pct

# 1. Calculate the total Budget_per_capita_lag1 for each year across all precincts
nyc_totals_budget_lag1 = pre_ts_analysis.groupby('Year')['Budget_per_capita_lag1'].sum().reset_index()
nyc_totals_budget_lag1 = nyc_totals_budget_lag1.rename(columns={'Budget_per_capita_lag1': 'budget_lag1_by_year'})

# 2. Merge this total with the main dataframe
pre_ts_analysis = pd.merge(pre_ts_analysis, nyc_totals_budget_lag1, on='Year', how='left')

# 3. Calculate the percentage using the correct column (presumably budget_lag1_by_year if no naming collision occurred)
pre_ts_analysis['Budget_per_capita_lag1_pct'] = pre_ts_analysis['Budget_per_capita_lag1'] / pre_ts_analysis['budget_lag1_by_year']

# Cleanup
pre_ts_analysis.drop(columns=['budget_by_year_x', 'budget_lag1_by_year'], inplace=True)

In [104]:
pre_ts_analysis

Unnamed: 0,Year,Precinct,Full Time Positions,Budget,Borough,Citizen_2010,Poverty Rate_2010,Immigration Rates 2010,Budget per Capita,MURDER & NON NEGL. MANSLAUGHTER,...,GRAND LARCENY_per_capita_lag1_pct,GRAND LARCENY OF MOTOR VEHICLE_per_capita_lag1_pct,MURDER & NON NEGL. MANSLAUGHTER_per_capita_lag1_pct,RAPE_per_capita_lag1_pct,ROBBERY_per_capita_lag1_pct,Violent Crime_per_capita_lag1_pct,Property Crime_per_capita_lag1_pct,budget_by_year_y,Budget_per_capita_pct,Budget_per_capita_lag1_pct
0,2006,1,219,11001943,MANHATTAN SOUTH,44138,6.12%,33.16%,166.61,1,...,,,,,,,,8607.967085,2.386904e-07,
1,2006,5,240,12354423,MANHATTAN SOUTH,37193,24.59%,29.40%,234.50,2,...,,,,,,,,8607.967085,2.985505e-07,
2,2006,6,237,10716126,MANHATTAN SOUTH,50266,6.20%,19.22%,172.21,3,...,,,,,,,,8607.967085,2.255519e-07,
3,2006,7,174,7786080,MANHATTAN SOUTH,43853,29.72%,22.03%,138.44,4,...,,,,,,,,8607.967085,1.802266e-07,
4,2006,9,234,10190005,MANHATTAN SOUTH,58772,21.06%,23.12%,133.30,1,...,,,,,,,,8607.967085,1.714364e-07,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1308,2023,114,252,20752787,QUEENS NORTH,129728,16.62%,36.02%,99.27,8,...,0.009165,0.014314,0.004640,0.008197,0.008014,0.009503,0.010580,12141.326044,7.863216e-08,0.007523
1309,2023,115,289,16525730,QUEENS NORTH,80034,19.76%,53.35%,96.57,4,...,0.011898,0.015036,0.004113,0.014158,0.011109,0.010721,0.011712,12141.326044,7.257499e-08,0.007319
1310,2023,120,399,29273345,STATEN ISLAND,73854,19.79%,34.39%,255.77,12,...,0.006319,0.009740,0.014626,0.007950,0.006349,0.010712,0.007066,12141.326044,1.863896e-07,0.019385
1311,2023,122,249,19873478,STATEN ISLAND,103552,9.18%,25.49%,139.38,2,...,0.005648,0.007889,0.001692,0.003680,0.002769,0.003485,0.006054,12141.326044,1.082738e-07,0.010563


In [105]:
# 1. Calculate the total Budget_per_capita for each year across all precincts
nyc_totals_budget = pre_ts_analysis.groupby('Year')['Budget_per_capita'].sum().reset_index()
nyc_totals_budget = nyc_totals_budget.rename(columns={'Budget_per_capita': 'total_budget_per_capita_by_year'})

# 2. Merge this total with the main dataframe
pre_ts_analysis = pd.merge(pre_ts_analysis, nyc_totals_budget, on='Year', how='left')

# 3. Calculate the percentage
pre_ts_analysis['Budget_per_capita_pct'] = pre_ts_analysis['Budget_per_capita'] / pre_ts_analysis['total_budget_per_capita_by_year']

# Now verify if the sum for Budget_per_capita_pct by Year is close to 1.
check_6 = pre_ts_analysis.groupby('Year')['Budget_per_capita_pct'].sum()
print(check_6)

Year
2006    1.0
2007    1.0
2008    1.0
2009    1.0
2010    1.0
2011    1.0
2012    1.0
2013    1.0
2014    1.0
2015    1.0
2016    1.0
2017    1.0
2018    1.0
2019    1.0
2020    1.0
2021    1.0
2022    1.0
2023    1.0
Name: Budget_per_capita_pct, dtype: float64


In [106]:
pre_ts_analysis.columns

Index(['Year', 'Precinct', 'Full Time Positions', 'Budget', 'Borough',
       'Citizen_2010', 'Poverty Rate_2010', 'Immigration Rates 2010',
       'Budget per Capita', 'MURDER & NON NEGL. MANSLAUGHTER', 'RAPE',
       'ROBBERY', 'FELONY ASSAULT', 'BURGLARY', 'GRAND LARCENY',
       'GRAND LARCENY OF MOTOR VEHICLE', 'TOTAL SEVEN MAJOR FELONY OFFENSES',
       'Shootings', 'burglary_by_year', 'felony assault_by_year',
       'grand larceny_by_year', 'grand larceny of motor vehicle_by_year',
       'murder & non negl. manslaughter_by_year', 'rape_by_year',
       'robbery_by_year', 'total seven major felony offenses_by_year',
       'shootings_by_year', 'Burglary_pct', 'Burglary_per_capita',
       'FELONY ASSAULT_per_capita', 'FELONY ASSAULT_pct',
       'GRAND LARCENY_per_capita', 'GRAND LARCENY_pct',
       'GRAND LARCENY OF MOTOR VEHICLE_per_capita',
       'GRAND LARCENY OF MOTOR VEHICLE_pct',
       'MURDER & NON NEGL. MANSLAUGHTER_per_capita',
       'MURDER & NON NEGL. MANSLAUGHT

In [107]:
pre_ts_analysis

Unnamed: 0,Year,Precinct,Full Time Positions,Budget,Borough,Citizen_2010,Poverty Rate_2010,Immigration Rates 2010,Budget per Capita,MURDER & NON NEGL. MANSLAUGHTER,...,GRAND LARCENY OF MOTOR VEHICLE_per_capita_lag1_pct,MURDER & NON NEGL. MANSLAUGHTER_per_capita_lag1_pct,RAPE_per_capita_lag1_pct,ROBBERY_per_capita_lag1_pct,Violent Crime_per_capita_lag1_pct,Property Crime_per_capita_lag1_pct,budget_by_year_y,Budget_per_capita_pct,Budget_per_capita_lag1_pct,total_budget_per_capita_by_year
0,2006,1,219,11001943,MANHATTAN SOUTH,44138,6.12%,33.16%,166.61,1,...,,,,,,,8607.967085,0.021506,,8607.967085
1,2006,5,240,12354423,MANHATTAN SOUTH,37193,24.59%,29.40%,234.50,2,...,,,,,,,8607.967085,0.026899,,8607.967085
2,2006,6,237,10716126,MANHATTAN SOUTH,50266,6.20%,19.22%,172.21,3,...,,,,,,,8607.967085,0.020322,,8607.967085
3,2006,7,174,7786080,MANHATTAN SOUTH,43853,29.72%,22.03%,138.44,4,...,,,,,,,8607.967085,0.016238,,8607.967085
4,2006,9,234,10190005,MANHATTAN SOUTH,58772,21.06%,23.12%,133.30,1,...,,,,,,,8607.967085,0.015446,,8607.967085
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1308,2023,114,252,20752787,QUEENS NORTH,129728,16.62%,36.02%,99.27,8,...,0.014314,0.004640,0.008197,0.008014,0.009503,0.010580,12141.326044,0.008130,0.007523,12141.326044
1309,2023,115,289,16525730,QUEENS NORTH,80034,19.76%,53.35%,96.57,4,...,0.015036,0.004113,0.014158,0.011109,0.010721,0.011712,12141.326044,0.007504,0.007319,12141.326044
1310,2023,120,399,29273345,STATEN ISLAND,73854,19.79%,34.39%,255.77,12,...,0.009740,0.014626,0.007950,0.006349,0.010712,0.007066,12141.326044,0.019272,0.019385,12141.326044
1311,2023,122,249,19873478,STATEN ISLAND,103552,9.18%,25.49%,139.38,2,...,0.007889,0.001692,0.003680,0.002769,0.003485,0.006054,12141.326044,0.011195,0.010563,12141.326044


In [108]:
violent_crime_pct_lag1 = pre_ts_analysis['MURDER & NON NEGL. MANSLAUGHTER_pct_lag1']


In [109]:
violent_crime_pct_lag1

0            NaN
1            NaN
2            NaN
3            NaN
4            NaN
          ...   
1308    0.009368
1309    0.007026
1310    0.016393
1311    0.002342
1312    0.004684
Name: MURDER & NON NEGL. MANSLAUGHTER_pct_lag1, Length: 1313, dtype: float64

In [110]:
nyc_totals_budget = pre_ts_analysis.groupby('Year')['GRAND LARCENY_pct_lag1'].sum()

In [111]:
nyc_totals_budget

Year
2006    0.000000
2007    1.000000
2008    1.000000
2009    1.000000
2010    1.000000
2011    1.000000
2012    1.000000
2013    1.000000
2014    1.021099
2015    0.978901
2016    1.000000
2017    1.000000
2018    1.000000
2019    1.000000
2020    0.986405
2021    1.013595
2022    1.000000
2023    1.000000
Name: GRAND LARCENY_pct_lag1, dtype: float64

In [112]:
# Group by Year and sum the percentage columns
check = pre_ts_analysis.groupby('Year')['Burglary_per_capita_lag1_pct'].sum()
check_2 = pre_ts_analysis.groupby('Year')['RAPE_per_capita_lag1_pct'].sum()

check_3 = pre_ts_analysis.groupby('Year')['Burglary_per_capita_lag1_pct'].sum()
check_4 = pre_ts_analysis.groupby('Year')['RAPE_per_capita_lag1_pct'].sum()

check_5 = pre_ts_analysis.groupby('Year')['Budget_per_capita_lag1_pct'].sum()
check_6 = pre_ts_analysis.groupby('Year')['Budget_per_capita_pct'].sum()


print(check)
print(check_2)

print(check_5)
print(check_6)



Year
2006    0.0
2007    1.0
2008    1.0
2009    1.0
2010    1.0
2011    1.0
2012    1.0
2013    1.0
2014    1.0
2015    1.0
2016    1.0
2017    1.0
2018    1.0
2019    1.0
2020    1.0
2021    1.0
2022    1.0
2023    1.0
Name: Burglary_per_capita_lag1_pct, dtype: float64
Year
2006    0.0
2007    1.0
2008    1.0
2009    1.0
2010    1.0
2011    1.0
2012    1.0
2013    1.0
2014    1.0
2015    1.0
2016    1.0
2017    1.0
2018    1.0
2019    1.0
2020    1.0
2021    1.0
2022    1.0
2023    1.0
Name: RAPE_per_capita_lag1_pct, dtype: float64
Year
2006    0.0
2007    1.0
2008    1.0
2009    1.0
2010    1.0
2011    1.0
2012    1.0
2013    1.0
2014    1.0
2015    1.0
2016    1.0
2017    1.0
2018    1.0
2019    1.0
2020    1.0
2021    1.0
2022    1.0
2023    1.0
Name: Budget_per_capita_lag1_pct, dtype: float64
Year
2006    1.0
2007    1.0
2008    1.0
2009    1.0
2010    1.0
2011    1.0
2012    1.0
2013    1.0
2014    1.0
2015    1.0
2016    1.0
2017    1.0
2018    1.0
2019    1.0
2020    1.0
2021 

In [113]:
pre_ts_analysis

Unnamed: 0,Year,Precinct,Full Time Positions,Budget,Borough,Citizen_2010,Poverty Rate_2010,Immigration Rates 2010,Budget per Capita,MURDER & NON NEGL. MANSLAUGHTER,...,GRAND LARCENY OF MOTOR VEHICLE_per_capita_lag1_pct,MURDER & NON NEGL. MANSLAUGHTER_per_capita_lag1_pct,RAPE_per_capita_lag1_pct,ROBBERY_per_capita_lag1_pct,Violent Crime_per_capita_lag1_pct,Property Crime_per_capita_lag1_pct,budget_by_year_y,Budget_per_capita_pct,Budget_per_capita_lag1_pct,total_budget_per_capita_by_year
0,2006,1,219,11001943,MANHATTAN SOUTH,44138,6.12%,33.16%,166.61,1,...,,,,,,,8607.967085,0.021506,,8607.967085
1,2006,5,240,12354423,MANHATTAN SOUTH,37193,24.59%,29.40%,234.50,2,...,,,,,,,8607.967085,0.026899,,8607.967085
2,2006,6,237,10716126,MANHATTAN SOUTH,50266,6.20%,19.22%,172.21,3,...,,,,,,,8607.967085,0.020322,,8607.967085
3,2006,7,174,7786080,MANHATTAN SOUTH,43853,29.72%,22.03%,138.44,4,...,,,,,,,8607.967085,0.016238,,8607.967085
4,2006,9,234,10190005,MANHATTAN SOUTH,58772,21.06%,23.12%,133.30,1,...,,,,,,,8607.967085,0.015446,,8607.967085
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1308,2023,114,252,20752787,QUEENS NORTH,129728,16.62%,36.02%,99.27,8,...,0.014314,0.004640,0.008197,0.008014,0.009503,0.010580,12141.326044,0.008130,0.007523,12141.326044
1309,2023,115,289,16525730,QUEENS NORTH,80034,19.76%,53.35%,96.57,4,...,0.015036,0.004113,0.014158,0.011109,0.010721,0.011712,12141.326044,0.007504,0.007319,12141.326044
1310,2023,120,399,29273345,STATEN ISLAND,73854,19.79%,34.39%,255.77,12,...,0.009740,0.014626,0.007950,0.006349,0.010712,0.007066,12141.326044,0.019272,0.019385,12141.326044
1311,2023,122,249,19873478,STATEN ISLAND,103552,9.18%,25.49%,139.38,2,...,0.007889,0.001692,0.003680,0.002769,0.003485,0.006054,12141.326044,0.011195,0.010563,12141.326044


In [114]:
# pre_ts_analysis_no_2022 = pre_ts_analysis[~pre_ts_analysis['Year'].isin([2006, 2022])]

In [115]:
import pandas as pd

# Setting pandas to display all columns
pd.set_option('display.max_columns', None)



In [116]:
pre_ts_analysis.columns

Index(['Year', 'Precinct', 'Full Time Positions', 'Budget', 'Borough',
       'Citizen_2010', 'Poverty Rate_2010', 'Immigration Rates 2010',
       'Budget per Capita', 'MURDER & NON NEGL. MANSLAUGHTER', 'RAPE',
       'ROBBERY', 'FELONY ASSAULT', 'BURGLARY', 'GRAND LARCENY',
       'GRAND LARCENY OF MOTOR VEHICLE', 'TOTAL SEVEN MAJOR FELONY OFFENSES',
       'Shootings', 'burglary_by_year', 'felony assault_by_year',
       'grand larceny_by_year', 'grand larceny of motor vehicle_by_year',
       'murder & non negl. manslaughter_by_year', 'rape_by_year',
       'robbery_by_year', 'total seven major felony offenses_by_year',
       'shootings_by_year', 'Burglary_pct', 'Burglary_per_capita',
       'FELONY ASSAULT_per_capita', 'FELONY ASSAULT_pct',
       'GRAND LARCENY_per_capita', 'GRAND LARCENY_pct',
       'GRAND LARCENY OF MOTOR VEHICLE_per_capita',
       'GRAND LARCENY OF MOTOR VEHICLE_pct',
       'MURDER & NON NEGL. MANSLAUGHTER_per_capita',
       'MURDER & NON NEGL. MANSLAUGHT

In [117]:

# Resetting the option to default (usually 20)
pd.reset_option('display.max_columns')

In [118]:
# Create lag of Budget_pct
#pre_ts_analysis_no_2022['Budget_pct_lag1'] = pre_ts_analysis_no_2022.groupby('Precinct')['Budget_pct'].shift(1)

# List of crime columns
crime_cols = ['FELONY ASSAULT_pct', 'GRAND LARCENY_pct', 'GRAND LARCENY OF MOTOR VEHICLE_pct', 
              'MURDER & NON NEGL. MANSLAUGHTER_pct', 'RAPE_pct', 'ROBBERY_pct','Shootings_pct']

# Constructing the regression formula
independent_vars = ' + '.join(['Q("{}")'.format(col) for col in crime_cols])
regression_formula = 'Budget_pct ~ ' + independent_vars + ' + Budget_pct_lag1'

# Run the regression
model = smf.ols(formula=regression_formula, data=pre_ts_analysis)
results = model.fit()
print(results.summary())







                            OLS Regression Results                            
Dep. Variable:             Budget_pct   R-squared:                       0.898
Model:                            OLS   Adj. R-squared:                  0.897
Method:                 Least Squares   F-statistic:                     1194.
Date:                Tue, 09 Apr 2024   Prob (F-statistic):               0.00
Time:                        11:52:10   Log-Likelihood:                 5941.7
No. Observations:                1094   AIC:                        -1.187e+04
Df Residuals:                    1085   BIC:                        -1.182e+04
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                                               coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------

In [119]:
# Create lag of Budget_pct
pre_ts_analysis['Budget_pct_lag1'] = pre_ts_analysis.groupby('Precinct')['Budget_pct'].shift(1)
pre_ts_analysis['Budget_per_capita_lag1'] = pre_ts_analysis.groupby('Precinct')['Budget_per_capita'].shift(1)


In [120]:
# Creating a lagged version of the Budget_per_capita
pre_ts_analysis['Budget_per_capita_lag1'] = pre_ts_analysis.groupby('Precinct')['Budget_per_capita'].shift(1)

# Defining the regression formula
regression_formula = "Budget_per_capita ~ Q('GRAND LARCENY_per_capita') + Q('GRAND LARCENY OF MOTOR VEHICLE_per_capita') + Q('Burglary_per_capita')+ Q('FELONY ASSAULT_per_capita')+ Q('MURDER & NON NEGL. MANSLAUGHTER_per_capita')+ Q('ROBBERY_per_capita')+ Q('RAPE_per_capita')+ Budget_per_capita_lag1"

# Running the regression
model = smf.ols(formula=regression_formula, data=pre_ts_analysis).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:      Budget_per_capita   R-squared:                       0.941
Model:                            OLS   Adj. R-squared:                  0.941
Method:                 Least Squares   F-statistic:                     2465.
Date:                Tue, 09 Apr 2024   Prob (F-statistic):               0.00
Time:                        11:52:17   Log-Likelihood:                -5123.4
No. Observations:                1240   AIC:                         1.026e+04
Df Residuals:                    1231   BIC:                         1.031e+04
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                                                      coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------

In [128]:
# Creating a lagged version of the Budget_per_capita
pre_ts_analysis['Budget_per_capita_lag1'] = pre_ts_analysis.groupby('Precinct')['Budget_per_capita'].shift(1)

# Defining the regression formula
regression_formula = "Budget_per_capita ~ Q('GRAND LARCENY_per_capita_lag1') + Q('GRAND LARCENY OF MOTOR VEHICLE_per_capita_lag1') + Q('Burglary_per_capita_lag1')+ Q('FELONY ASSAULT_per_capita_lag1')+ Q('MURDER & NON NEGL. MANSLAUGHTER_per_capita_lag1')+ Q('ROBBERY_per_capita_lag1')+ Q('RAPE_per_capita_lag1')+ Budget_per_capita_lag1"

# Running the regression
model = smf.ols(formula=regression_formula, data=pre_ts_analysis).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:      Budget_per_capita   R-squared:                       0.941
Model:                            OLS   Adj. R-squared:                  0.940
Method:                 Least Squares   F-statistic:                     2447.
Date:                Tue, 09 Apr 2024   Prob (F-statistic):               0.00
Time:                        11:56:46   Log-Likelihood:                -5127.7
No. Observations:                1240   AIC:                         1.027e+04
Df Residuals:                    1231   BIC:                         1.032e+04
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                                                           coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------

In [129]:
pre_ts_analysis.columns

Index(['Year', 'Precinct', 'Full Time Positions', 'Budget', 'Borough',
       'Citizen_2010', 'Poverty Rate_2010', 'Immigration Rates 2010',
       'Budget per Capita', 'MURDER & NON NEGL. MANSLAUGHTER', 'RAPE',
       'ROBBERY', 'FELONY ASSAULT', 'BURGLARY', 'GRAND LARCENY',
       'GRAND LARCENY OF MOTOR VEHICLE', 'TOTAL SEVEN MAJOR FELONY OFFENSES',
       'Shootings', 'burglary_by_year', 'felony assault_by_year',
       'grand larceny_by_year', 'grand larceny of motor vehicle_by_year',
       'murder & non negl. manslaughter_by_year', 'rape_by_year',
       'robbery_by_year', 'total seven major felony offenses_by_year',
       'shootings_by_year', 'Burglary_pct', 'Burglary_per_capita',
       'FELONY ASSAULT_per_capita', 'FELONY ASSAULT_pct',
       'GRAND LARCENY_per_capita', 'GRAND LARCENY_pct',
       'GRAND LARCENY OF MOTOR VEHICLE_per_capita',
       'GRAND LARCENY OF MOTOR VEHICLE_pct',
       'MURDER & NON NEGL. MANSLAUGHTER_per_capita',
       'MURDER & NON NEGL. MANSLAUGHT

In [130]:
filtered_pre_ts_analysis = pre_ts_analysis[(pre_ts_analysis['Year'] >= 2007) & (pre_ts_analysis['Year'] <= 2023)]
filtered_pre_ts_analysis_orign = pre_ts_analysis[(pre_ts_analysis['Year'] >= 2008) & (pre_ts_analysis['Year'] <= 2021)]

In [131]:
Test_compare = pre_ts_analysis.loc[pre_ts_analysis['Year'] == 2015, ['Precinct','Budget_pct_lag1', 'Budget_pct'] ]
Test_compare

Test_compare = pre_ts_analysis.loc[pre_ts_analysis['Year'] == 2010, ['Precinct','RAPE_pct_lag1', 'RAPE_pct'] ]
Test_compare

Unnamed: 0,Precinct,RAPE_pct_lag1,RAPE_pct
292,1,0.003356,0.004498
293,5,0.005034,0.007496
294,6,0.007550,0.009745
295,7,0.009228,0.008246
296,9,0.013423,0.010495
...,...,...,...
360,114,0.018456,0.011994
361,115,0.028523,0.024738
362,120,0.036913,0.032234
363,122,0.007550,0.013493


In [132]:
# Creating a lagged version of the Violent and Property Crime

# Defining the regression formula
regression_formula = "Budget_per_capita ~ Q('Violent Crime_per_capita_lag1') + Q('Property Crime_per_capita_lag1') + Budget_per_capita_lag1"

# Running the regression
model = smf.ols(formula=regression_formula, data=pre_ts_analysis).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:      Budget_per_capita   R-squared:                       0.941
Model:                            OLS   Adj. R-squared:                  0.941
Method:                 Least Squares   F-statistic:                     6542.
Date:                Tue, 09 Apr 2024   Prob (F-statistic):               0.00
Time:                        11:56:53   Log-Likelihood:                -5128.6
No. Observations:                1240   AIC:                         1.027e+04
Df Residuals:                    1236   BIC:                         1.029e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                                          coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------
In

In [133]:
# Creating a lagged version of the Violent and Property Crime pct

# Defining the regression formula
regression_formula = "Budget_pct ~ Q('Violent Crime_pct_lag1') + Q('Property Crime_pct_lag1') + Budget_pct_lag1"

# Running the regression
model = smf.ols(formula=regression_formula, data=pre_ts_analysis).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:             Budget_pct   R-squared:                       0.908
Model:                            OLS   Adj. R-squared:                  0.908
Method:                 Least Squares   F-statistic:                     4070.
Date:                Tue, 09 Apr 2024   Prob (F-statistic):               0.00
Time:                        11:56:55   Log-Likelihood:                 6795.0
No. Observations:                1240   AIC:                        -1.358e+04
Df Residuals:                    1236   BIC:                        -1.356e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                                   coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------
Intercept       

In [134]:
pre_ts_analysis.columns

Index(['Year', 'Precinct', 'Full Time Positions', 'Budget', 'Borough',
       'Citizen_2010', 'Poverty Rate_2010', 'Immigration Rates 2010',
       'Budget per Capita', 'MURDER & NON NEGL. MANSLAUGHTER', 'RAPE',
       'ROBBERY', 'FELONY ASSAULT', 'BURGLARY', 'GRAND LARCENY',
       'GRAND LARCENY OF MOTOR VEHICLE', 'TOTAL SEVEN MAJOR FELONY OFFENSES',
       'Shootings', 'burglary_by_year', 'felony assault_by_year',
       'grand larceny_by_year', 'grand larceny of motor vehicle_by_year',
       'murder & non negl. manslaughter_by_year', 'rape_by_year',
       'robbery_by_year', 'total seven major felony offenses_by_year',
       'shootings_by_year', 'Burglary_pct', 'Burglary_per_capita',
       'FELONY ASSAULT_per_capita', 'FELONY ASSAULT_pct',
       'GRAND LARCENY_per_capita', 'GRAND LARCENY_pct',
       'GRAND LARCENY OF MOTOR VEHICLE_per_capita',
       'GRAND LARCENY OF MOTOR VEHICLE_pct',
       'MURDER & NON NEGL. MANSLAUGHTER_per_capita',
       'MURDER & NON NEGL. MANSLAUGHT

In [135]:
# Creating a lagged version of the Violent and Property Crime pct

# Defining the regression formula
regression_formula = "Budget_pct ~ Q('RAPE_pct_lag1') + Q('FELONY ASSAULT_pct_lag1') + Q('MURDER & NON NEGL. MANSLAUGHTER_pct_lag1') + Q('ROBBERY_pct_lag1') + Budget_pct_lag1"

# Running the regression
model = smf.ols(formula=regression_formula, data=pre_ts_analysis).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:             Budget_pct   R-squared:                       0.909
Model:                            OLS   Adj. R-squared:                  0.909
Method:                 Least Squares   F-statistic:                     2462.
Date:                Tue, 09 Apr 2024   Prob (F-statistic):               0.00
Time:                        11:57:03   Log-Likelihood:                 6800.4
No. Observations:                1240   AIC:                        -1.359e+04
Df Residuals:                    1234   BIC:                        -1.356e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                                                    coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------

In [136]:
pre_ts_analysis.columns

Index(['Year', 'Precinct', 'Full Time Positions', 'Budget', 'Borough',
       'Citizen_2010', 'Poverty Rate_2010', 'Immigration Rates 2010',
       'Budget per Capita', 'MURDER & NON NEGL. MANSLAUGHTER', 'RAPE',
       'ROBBERY', 'FELONY ASSAULT', 'BURGLARY', 'GRAND LARCENY',
       'GRAND LARCENY OF MOTOR VEHICLE', 'TOTAL SEVEN MAJOR FELONY OFFENSES',
       'Shootings', 'burglary_by_year', 'felony assault_by_year',
       'grand larceny_by_year', 'grand larceny of motor vehicle_by_year',
       'murder & non negl. manslaughter_by_year', 'rape_by_year',
       'robbery_by_year', 'total seven major felony offenses_by_year',
       'shootings_by_year', 'Burglary_pct', 'Burglary_per_capita',
       'FELONY ASSAULT_per_capita', 'FELONY ASSAULT_pct',
       'GRAND LARCENY_per_capita', 'GRAND LARCENY_pct',
       'GRAND LARCENY OF MOTOR VEHICLE_per_capita',
       'GRAND LARCENY OF MOTOR VEHICLE_pct',
       'MURDER & NON NEGL. MANSLAUGHTER_per_capita',
       'MURDER & NON NEGL. MANSLAUGHT

In [137]:
# Assuming 'filtered_pre_ts_analysis' already contains 'Violent Crime_pct_lag1' and 'Property Crime_pct_lag1'
# Make sure there are no zero or negative values in these columns before log-transforming
pre_ts_analysis['log_Budget_pct'] = np.log(pre_ts_analysis['Budget_pct'])  # Adding 1 to avoid log(0)
pre_ts_analysis['log_Violent_Crime_pct_lag1'] = np.log1p(pre_ts_analysis['Violent Crime_pct_lag1'])  # Adjust if necessary
pre_ts_analysis['log_Budget_pct_lag1'] = np.log(pre_ts_analysis['Budget_pct_lag1'])  # Adding 1 to avoid log(0)
pre_ts_analysis['log_Property_Crime_pct_lag1'] = np.log1p(pre_ts_analysis['Property Crime_pct_lag1'])  # Adjust if necessary

pre_ts_analysis['FELONY_ASSAULT_pct_transform_log_lag1'] = np.log1p(pre_ts_analysis['FELONY ASSAULT_pct_lag1']) 
pre_ts_analysis['MURDER_NON_NEGL_MANSLAUGHTER_pct_transform_log_lag1'] = np.log1p(pre_ts_analysis['MURDER & NON NEGL. MANSLAUGHTER_pct_lag1']) 
pre_ts_analysis['RAPE_pct_transform_log_lag1'] = np.log1p(pre_ts_analysis['RAPE_pct_lag1']) 
pre_ts_analysis['ROBBERY_pct_transform_log_lag1'] = np.log1p(pre_ts_analysis['ROBBERY_pct_lag1']) 
# Adjusting the regression formula to use the log-transformed variables
regression_formula_log = "log_Budget_pct ~ log_Violent_Crime_pct_lag1 + log_Property_Crime_pct_lag1 + log_Budget_pct_lag1"

# Running the regression
model_log = smf.ols(formula=regression_formula_log, data=pre_ts_analysis).fit()

# Printing the summary of the model
print(model_log.summary())

                            OLS Regression Results                            
Dep. Variable:         log_Budget_pct   R-squared:                       0.925
Model:                            OLS   Adj. R-squared:                  0.925
Method:                 Least Squares   F-statistic:                     5080.
Date:                Tue, 09 Apr 2024   Prob (F-statistic):               0.00
Time:                        11:57:08   Log-Likelihood:                 1716.7
No. Observations:                1240   AIC:                            -3425.
Df Residuals:                    1236   BIC:                            -3405.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
Intercept         

In [138]:
X = filtered_pre_ts_analysis[['Violent Crime_pct_lag1', 'Property Crime_pct_lag1', 'Budget_pct_lag1']]
y = filtered_pre_ts_analysis['Budget_pct']
X = sm.add_constant(filtered_pre_ts_analysis[['Violent Crime_pct_lag1', 'Property Crime_pct_lag1', 'Budget_pct_lag1']])

# Fit the OLS regression model
model_4 = sm.OLS(y, X).fit()
print(model_4.summary())

                            OLS Regression Results                            
Dep. Variable:             Budget_pct   R-squared:                       0.908
Model:                            OLS   Adj. R-squared:                  0.908
Method:                 Least Squares   F-statistic:                     4070.
Date:                Tue, 09 Apr 2024   Prob (F-statistic):               0.00
Time:                        11:57:11   Log-Likelihood:                 6795.0
No. Observations:                1240   AIC:                        -1.358e+04
Df Residuals:                    1236   BIC:                        -1.356e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     

In [143]:
# Assuming 'filtered_pre_ts_analysis' is your DataFrame with proper columns.
X = filtered_pre_ts_analysis[['Violent Crime_pct_lag1', 'Property Crime_pct_lag1', 'Budget_pct_lag1']]
y = filtered_pre_ts_analysis['Budget_pct']

# Standardize the features
scaler = StandardScaler()
X_standardized = scaler.fit_transform(X)
X_standardized = pd.DataFrame(X_standardized, columns=X.columns, index=X.index)  # Match the indices

# Standardize the dependent variable
y_mean = y.mean()
y_std = y.std()
y_standardized = (y - y_mean) / y_std
y_standardized.index = X.index  # Ensure the indices are the same

# Add a constant to the model (intercept)
X_standardized = sm.add_constant(X_standardized)  # This will now retain the column names and index

# Fit the OLS model with standardized variables
model_standardized = sm.OLS(y_standardized, X_standardized).fit()

# Fit the OLS regression model
print(model_standardized.summary())

                            OLS Regression Results                            
Dep. Variable:             Budget_pct   R-squared:                       0.908
Model:                            OLS   Adj. R-squared:                  0.908
Method:                 Least Squares   F-statistic:                     4070.
Date:                Tue, 09 Apr 2024   Prob (F-statistic):               0.00
Time:                        12:17:12   Log-Likelihood:                -279.12
No. Observations:                1240   AIC:                             566.2
Df Residuals:                    1236   BIC:                             586.7
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                    1

In [144]:
# Assuming 'filtered_pre_ts_analysis' is your DataFrame with proper columns.
X = filtered_pre_ts_analysis[['Violent Crime_pct_lag1', 'Property Crime_pct_lag1', 'Budget_pct_lag1']]
y = filtered_pre_ts_analysis['Budget_pct']

# Standardize the features
scaler = StandardScaler()
X_standardized = scaler.fit_transform(X)
X_standardized = pd.DataFrame(X_standardized, columns=X.columns, index=X.index)  # Match the indices

# Standardize the dependent variable
y_mean = y.mean()
y_std = y.std()
y_standardized = (y - y_mean) / y_std
y_standardized.index = X.index  # Ensure the indices are the same

# Add a constant to the model (intercept)
X_standardized = sm.add_constant(X_standardized)  # This will now retain the column names and index

# Fit the OLS model with standardized variables
model_standardized = sm.OLS(y_standardized, X_standardized).fit()

# Fit the OLS regression model
print(model_standardized.summary())

                            OLS Regression Results                            
Dep. Variable:             Budget_pct   R-squared:                       0.908
Model:                            OLS   Adj. R-squared:                  0.908
Method:                 Least Squares   F-statistic:                     4070.
Date:                Tue, 09 Apr 2024   Prob (F-statistic):               0.00
Time:                        12:56:52   Log-Likelihood:                -279.12
No. Observations:                1240   AIC:                             566.2
Df Residuals:                    1236   BIC:                             586.7
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                    1

In [140]:
Test_compare = pre_ts_analysis.loc[pre_ts_analysis['Year'] == 2010, ['Precinct','MURDER_NON_NEGL_MANSLAUGHTER_pct_transform_log_lag1', 'RAPE_pct_transform_log_lag1'] ]
Test_compare

Unnamed: 0,Precinct,MURDER_NON_NEGL_MANSLAUGHTER_pct_transform_log_lag1,RAPE_pct_transform_log_lag1
292,1,0.004274,0.003350
293,5,0.002139,0.005021
294,6,0.002139,0.007522
295,7,0.004274,0.009186
296,9,0.008529,0.013334
...,...,...,...
360,114,0.008529,0.018288
361,115,0.016986,0.028124
362,120,0.025371,0.036248
363,122,0.006403,0.007522


In [None]:
pre_ts

In [77]:
predictors_df_2 = filtered_pre_ts_analysis[['GRAND LARCENY_pct_lag1','Burglary_pct_lag1','GRAND LARCENY OF MOTOR VEHICLE_pct_lag1','FELONY ASSAULT_pct_lag1','MURDER & NON NEGL. MANSLAUGHTER_pct_lag1','RAPE_pct_lag1', 'ROBBERY_pct_lag1', 'Budget_pct_lag1']]
# Add a constant to the model (intercept)

# Calculate VIF for each predictor variable
vif = pd.DataFrame()
vif["Variable"] = predictors_df_2.columns
vif["VIF"] = [variance_inflation_factor(predictors_df_2.values, i) for i in range(predictors_df_2.shape[1])]

print(vif)

                                   Variable        VIF
0                    GRAND LARCENY_pct_lag1   7.528083
1                         Burglary_pct_lag1  16.414193
2   GRAND LARCENY OF MOTOR VEHICLE_pct_lag1   8.663947
3                   FELONY ASSAULT_pct_lag1  32.980409
4  MURDER & NON NEGL. MANSLAUGHTER_pct_lag1   6.563781
5                             RAPE_pct_lag1  13.413026
6                          ROBBERY_pct_lag1  32.808150
7                           Budget_pct_lag1  15.690278


In [78]:
predictors_df_2 = filtered_pre_ts_analysis[['GRAND LARCENY_pct_lag1','Burglary_pct_lag1','GRAND LARCENY OF MOTOR VEHICLE_pct_lag1','MURDER & NON NEGL. MANSLAUGHTER_pct_lag1','RAPE_pct_lag1', 'ROBBERY_pct_lag1', 'Budget_pct_lag1']]
# Add a constant to the model (intercept)

# Calculate VIF for each predictor variable
vif = pd.DataFrame()
vif["Variable"] = predictors_df_2.columns
vif["VIF"] = [variance_inflation_factor(predictors_df_2.values, i) for i in range(predictors_df_2.shape[1])]

print(vif)

                                   Variable        VIF
0                    GRAND LARCENY_pct_lag1   7.198934
1                         Burglary_pct_lag1  16.279178
2   GRAND LARCENY OF MOTOR VEHICLE_pct_lag1   8.663947
3  MURDER & NON NEGL. MANSLAUGHTER_pct_lag1   5.720214
4                             RAPE_pct_lag1  12.592054
5                          ROBBERY_pct_lag1  17.812288
6                           Budget_pct_lag1  14.813462


In [79]:
predictors_df_2 = filtered_pre_ts_analysis[['GRAND LARCENY_pct_lag1','Burglary_pct_lag1','GRAND LARCENY OF MOTOR VEHICLE_pct_lag1','MURDER & NON NEGL. MANSLAUGHTER_pct_lag1','RAPE_pct_lag1', 'Budget_pct_lag1']]
# Add a constant to the model (intercept)

# Calculate VIF for each predictor variable
vif = pd.DataFrame()
vif["Variable"] = predictors_df_2.columns
vif["VIF"] = [variance_inflation_factor(predictors_df_2.values, i) for i in range(predictors_df_2.shape[1])]

print(vif)

                                   Variable        VIF
0                    GRAND LARCENY_pct_lag1   7.167830
1                         Burglary_pct_lag1  15.273549
2   GRAND LARCENY OF MOTOR VEHICLE_pct_lag1   8.619399
3  MURDER & NON NEGL. MANSLAUGHTER_pct_lag1   4.581264
4                             RAPE_pct_lag1   9.823856
5                           Budget_pct_lag1  14.768607


In [80]:
predictors_df_2 = filtered_pre_ts_analysis[['GRAND LARCENY_pct_lag1','GRAND LARCENY OF MOTOR VEHICLE_pct_lag1','MURDER & NON NEGL. MANSLAUGHTER_pct_lag1','RAPE_pct_lag1', 'Budget_pct_lag1']]
# Add a constant to the model (intercept)

# Calculate VIF for each predictor variable
vif = pd.DataFrame()
vif["Variable"] = predictors_df_2.columns
vif["VIF"] = [variance_inflation_factor(predictors_df_2.values, i) for i in range(predictors_df_2.shape[1])]

print(vif)

                                   Variable        VIF
0                    GRAND LARCENY_pct_lag1   6.026963
1   GRAND LARCENY OF MOTOR VEHICLE_pct_lag1   5.584618
2  MURDER & NON NEGL. MANSLAUGHTER_pct_lag1   4.570835
3                             RAPE_pct_lag1   9.753610
4                           Budget_pct_lag1  14.099469


In [81]:
predictors_df_2 = filtered_pre_ts_analysis[['GRAND LARCENY_pct_lag1','GRAND LARCENY OF MOTOR VEHICLE_pct_lag1','MURDER & NON NEGL. MANSLAUGHTER_pct_lag1','RAPE_pct_lag1']]
# Add a constant to the model (intercept)

# Calculate VIF for each predictor variable
vif = pd.DataFrame()
vif["Variable"] = predictors_df_2.columns
vif["VIF"] = [variance_inflation_factor(predictors_df_2.values, i) for i in range(predictors_df_2.shape[1])]

print(vif)

                                   Variable       VIF
0                    GRAND LARCENY_pct_lag1  3.188364
1   GRAND LARCENY OF MOTOR VEHICLE_pct_lag1  5.426184
2  MURDER & NON NEGL. MANSLAUGHTER_pct_lag1  4.180618
3                             RAPE_pct_lag1  8.294728


In [91]:
X = filtered_pre_ts_analysis[['GRAND LARCENY_pct_lag1','MURDER & NON NEGL. MANSLAUGHTER_pct_lag1','RAPE_pct_lag1']]
y = filtered_pre_ts_analysis['Budget_pct']
X = sm.add_constant(filtered_pre_ts_analysis[['GRAND LARCENY_pct_lag1','MURDER & NON NEGL. MANSLAUGHTER_pct_lag1','RAPE_pct_lag1']])

# Fit the OLS regression model
model_4 = sm.OLS(y, X).fit()
print(model_4.summary())

                            OLS Regression Results                            
Dep. Variable:             Budget_pct   R-squared:                       0.593
Model:                            OLS   Adj. R-squared:                  0.592
Method:                 Least Squares   F-statistic:                     600.4
Date:                Mon, 08 Apr 2024   Prob (F-statistic):          1.09e-240
Time:                        22:59:10   Log-Likelihood:                 5872.5
No. Observations:                1240   AIC:                        -1.174e+04
Df Residuals:                    1236   BIC:                        -1.172e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                                               coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------

In [142]:
# Assuming 'filtered_pre_ts_analysis' is your DataFrame with proper columns.
X = filtered_pre_ts_analysis[['GRAND LARCENY_pct_lag1','MURDER & NON NEGL. MANSLAUGHTER_pct_lag1','RAPE_pct_lag1']]
y = filtered_pre_ts_analysis['Budget_pct']

# Standardize the features
scaler = StandardScaler()
X_standardized = scaler.fit_transform(X)
X_standardized = pd.DataFrame(X_standardized, columns=X.columns, index=X.index)  # Match the indices

# Standardize the dependent variable
y_mean = y.mean()
y_std = y.std()
y_standardized = (y - y_mean) / y_std
y_standardized.index = X.index  # Ensure the indices are the same

# Add a constant to the model (intercept)
X_standardized = sm.add_constant(X_standardized)  # This will now retain the column names and index

# Fit the OLS model with standardized variables
model_standardized = sm.OLS(y_standardized, X_standardized).fit()

# Fit the OLS regression model
print(model_standardized.summary())

                            OLS Regression Results                            
Dep. Variable:             Budget_pct   R-squared:                       0.593
Model:                            OLS   Adj. R-squared:                  0.592
Method:                 Least Squares   F-statistic:                     600.4
Date:                Tue, 09 Apr 2024   Prob (F-statistic):          1.09e-240
Time:                        12:10:47   Log-Likelihood:                -1201.6
No. Observations:                1240   AIC:                             2411.
Df Residuals:                    1236   BIC:                             2432.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                                               coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------

In [82]:
predictors_df_2 = filtered_pre_ts_analysis[['GRAND LARCENY_pct_lag1','GRAND LARCENY OF MOTOR VEHICLE_pct_lag1','MURDER & NON NEGL. MANSLAUGHTER_pct_lag1']]
# Add a constant to the model (intercept)

# Calculate VIF for each predictor variable
vif = pd.DataFrame()
vif["Variable"] = predictors_df_2.columns
vif["VIF"] = [variance_inflation_factor(predictors_df_2.values, i) for i in range(predictors_df_2.shape[1])]

print(vif)

                                   Variable       VIF
0                    GRAND LARCENY_pct_lag1  2.813753
1   GRAND LARCENY OF MOTOR VEHICLE_pct_lag1  4.238676
2  MURDER & NON NEGL. MANSLAUGHTER_pct_lag1  2.640561


In [88]:
X = filtered_pre_ts_analysis[['GRAND LARCENY_pct_lag1','GRAND LARCENY OF MOTOR VEHICLE_pct_lag1','MURDER & NON NEGL. MANSLAUGHTER_pct_lag1']]
y = filtered_pre_ts_analysis['Budget_pct']

# Standardize the features
scaler = StandardScaler()
X_standardized = scaler.fit_transform(X)

# Standardize the dependent variable
y_standardized = (y - y.mean()) / y.std()

# Add a constant to the model (intercept)
X_standardized = sm.add_constant(X_standardized)

# Fit the OLS model with standardized variables
model_standardized = sm.OLS(y_standardized, X_standardized).fit()

# Fit the OLS regression model
print(model_standardized.summary())

                            OLS Regression Results                            
Dep. Variable:             Budget_pct   R-squared:                       0.521
Model:                            OLS   Adj. R-squared:                  0.520
Method:                 Least Squares   F-statistic:                     448.7
Date:                Mon, 08 Apr 2024   Prob (F-statistic):          3.85e-197
Time:                        22:55:31   Log-Likelihood:                -1302.2
No. Observations:                1240   AIC:                             2612.
Df Residuals:                    1236   BIC:                             2633.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       1.353e-16      0.020   6.88e-15      1.0

In [72]:
predictors_df_2 = filtered_pre_ts_analysis[['RAPE_pct_lag1', 'ROBBERY_pct_lag1', 'Budget_pct_lag1']]
# Add a constant to the model (intercept)

# Calculate VIF for each predictor variable
vif = pd.DataFrame()
vif["Variable"] = predictors_df_2.columns
vif["VIF"] = [variance_inflation_factor(predictors_df_2.values, i) for i in range(predictors_df_2.shape[1])]

print(vif)

           Variable        VIF
0     RAPE_pct_lag1  11.704213
1  ROBBERY_pct_lag1  12.455641
2   Budget_pct_lag1   7.657026


In [None]:
# Assuming you have a DataFrame 'pre_ts_analysis' with the log-transformed variables
variables = filtered_pre_ts_analysis[['log_Violent_Crime_pct_lag1', 'log_Property_Crime_pct_lag1', 'log_Budget_pct_lag1']]
# Adding a constant is necessary when calculating VIFs
variables_with_constant = sm.add_constant(variables)

# Calculate VIFs for each variable
vif_data = pd.DataFrame()
vif_data["Variable"] = variables_with_constant.columns
vif_data["VIF"] = [variance_inflation_factor(variables_with_constant.values, i) for i in range(variables_with_constant.shape[1])]

print(vif_data)

In [None]:
pre_ts_analysis

In [231]:
df = pre_ts_analysis
df['violent_crime_pct_transform_log_lag1'] = np.log(pre_ts_analysis['Violent Crime_pct_lag1'])
df['property_crime_pct_transform_log_lag1'] = np.log(pre_ts_analysis['Property Crime_pct_lag1'])
df['Budget_pct_transform_log'] = np.log(pre_ts_analysis['Budget_pct'])
df['Budget_pct_transform_log_lag1'] = np.log(pre_ts_analysis['Budget_pct_lag1'])


df['FELONY_ASSAULT_pct_transform_log_lag1'] = np.log(pre_ts_analysis['FELONY ASSAULT_pct_lag1']) 
df['MURDER_NON_NEGL_MANSLAUGHTER_pct_transform_log_lag1'] = np.log(pre_ts_analysis['MURDER & NON NEGL. MANSLAUGHTER_pct_lag1']) 
df['RAPE_pct_transform_log_lag1'] = np.log(pre_ts_analysis['RAPE_pct_lag1']) 
df['ROBBERY_pct_transform_log_lag1'] = np.log(pre_ts_analysis['ROBBERY_pct_lag1']) 




  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [229]:
pre_ts_analysis.columns

Index(['Year', 'Precinct', 'Full Time Positions', 'Budget', 'Borough',
       'Citizen_2010', 'Poverty Rate_2010', 'Immigration Rates 2010',
       'Budget per Capita', 'MURDER & NON NEGL. MANSLAUGHTER', 'RAPE',
       'ROBBERY', 'FELONY ASSAULT', 'BURGLARY', 'GRAND LARCENY',
       'GRAND LARCENY OF MOTOR VEHICLE', 'TOTAL SEVEN MAJOR FELONY OFFENSES',
       'Shootings', 'burglary_by_year', 'felony assault_by_year',
       'grand larceny_by_year', 'grand larceny of motor vehicle_by_year',
       'murder & non negl. manslaughter_by_year', 'rape_by_year',
       'robbery_by_year', 'total seven major felony offenses_by_year',
       'shootings_by_year', 'Burglary_pct', 'Burglary_per_capita',
       'FELONY ASSAULT_per_capita', 'FELONY ASSAULT_pct',
       'GRAND LARCENY_per_capita', 'GRAND LARCENY_pct',
       'GRAND LARCENY OF MOTOR VEHICLE_per_capita',
       'GRAND LARCENY OF MOTOR VEHICLE_pct',
       'MURDER & NON NEGL. MANSLAUGHTER_per_capita',
       'MURDER & NON NEGL. MANSLAUGHT

In [223]:
filtered_pre_ts_analysis = pre_ts_analysis[(pre_ts_analysis['Year'] >= 2006) & (pre_ts_analysis['Year'] <= 2021)]


In [224]:
# Creating a lagged version of the Violent and Property Crime pct

# Defining the regression formula
regression_formula = "Budget_pct ~ Q('RAPE_pct_lag1') + Q('FELONY ASSAULT_pct_lag1') + Q('MURDER & NON NEGL. MANSLAUGHTER_pct_lag1') + Q('ROBBERY_pct_lag1') + Budget_pct_lag1"

# Running the regression
model = smf.ols(formula=regression_formula, data=filtered_pre_ts_analysis).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:             Budget_pct   R-squared:                       0.898
Model:                            OLS   Adj. R-squared:                  0.897
Method:                 Least Squares   F-statistic:                     1912.
Date:                Sun, 07 Apr 2024   Prob (F-statistic):               0.00
Time:                        22:19:18   Log-Likelihood:                 5940.6
No. Observations:                1094   AIC:                        -1.187e+04
Df Residuals:                    1088   BIC:                        -1.184e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                                                    coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------

In [254]:
# Creating a lagged version of the Violent and Property Crime pct

# Defining the regression formula
regression_formula = "Budget_pct ~ Q('RAPE_pct_lag1') +  Q('MURDER & NON NEGL. MANSLAUGHTER_pct_lag1') + Q('ROBBERY_pct_lag1') + Budget_pct_lag1"

# Running the regression
model = smf.ols(formula=regression_formula, data= pre_ts_analysis).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:             Budget_pct   R-squared:                       0.908
Model:                            OLS   Adj. R-squared:                  0.908
Method:                 Least Squares   F-statistic:                     3052.
Date:                Sun, 07 Apr 2024   Prob (F-statistic):               0.00
Time:                        23:35:04   Log-Likelihood:                 6795.3
No. Observations:                1240   AIC:                        -1.358e+04
Df Residuals:                    1235   BIC:                        -1.355e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                                                    coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------

In [255]:
# Creating a lagged version of the Violent and Property Crime pct

# Defining the regression formula
regression_formula = "Budget_pct ~ Q('RAPE_pct_lag1') +  Q('MURDER & NON NEGL. MANSLAUGHTER_pct_lag1') + Budget_pct_lag1"

# Running the regression
model = smf.ols(formula=regression_formula, data= pre_ts_analysis).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:             Budget_pct   R-squared:                       0.908
Model:                            OLS   Adj. R-squared:                  0.908
Method:                 Least Squares   F-statistic:                     4066.
Date:                Sun, 07 Apr 2024   Prob (F-statistic):               0.00
Time:                        23:36:20   Log-Likelihood:                 6794.4
No. Observations:                1240   AIC:                        -1.358e+04
Df Residuals:                    1236   BIC:                        -1.356e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                                                    coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------

In [225]:
# Creating a lagged version of the Violent and Property Crime pct

# Defining the regression formula
regression_formula = "Budget_pct ~ Q('Violent Crime_pct_lag1') + Q('Property Crime_pct_lag1') + Budget_pct_lag1"

# Running the regression
model = smf.ols(formula=regression_formula, data=filtered_pre_ts_analysis).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:             Budget_pct   R-squared:                       0.897
Model:                            OLS   Adj. R-squared:                  0.897
Method:                 Least Squares   F-statistic:                     3169.
Date:                Sun, 07 Apr 2024   Prob (F-statistic):               0.00
Time:                        22:19:21   Log-Likelihood:                 5937.0
No. Observations:                1094   AIC:                        -1.187e+04
Df Residuals:                    1090   BIC:                        -1.185e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                                   coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------
Intercept       

In [226]:
# Assuming 'filtered_pre_ts_analysis' already contains 'Violent Crime_pct_lag1' and 'Property Crime_pct_lag1'
# Make sure there are no zero or negative values in these columns before log-transforming
filtered_pre_ts_analysis['log_Budget_pct'] = np.log(filtered_pre_ts_analysis['Budget_pct'] + 1)  # Adding 1 to avoid log(0)
filtered_pre_ts_analysis['log_Violent_Crime_pct_lag1'] = np.log(filtered_pre_ts_analysis['Violent Crime_pct_lag1'] + 1)  # Adjust if necessary
filtered_pre_ts_analysis['log_Budget_pct_lag1'] = np.log(filtered_pre_ts_analysis['Budget_pct_lag1'] + 1)  # Adding 1 to avoid log(0)

filtered_pre_ts_analysis['log_Property_Crime_pct_lag1'] = np.log(filtered_pre_ts_analysis['Property Crime_pct_lag1'] + 1)  # Adjust if necessary
# Adjusting the regression formula to use the log-transformed variables
regression_formula_log = "log_Budget_pct ~ log_Violent_Crime_pct_lag1 + log_Property_Crime_pct_lag1 + log_Budget_pct_lag1"

# Running the regression
model_log = smf.ols(formula=regression_formula_log, data=filtered_pre_ts_analysis).fit()

# Printing the summary of the model
print(model_log.summary())


                            OLS Regression Results                            
Dep. Variable:         log_Budget_pct   R-squared:                       0.898
Model:                            OLS   Adj. R-squared:                  0.897
Method:                 Least Squares   F-statistic:                     3185.
Date:                Sun, 07 Apr 2024   Prob (F-statistic):               0.00
Time:                        22:19:34   Log-Likelihood:                 5956.9
No. Observations:                1094   AIC:                        -1.191e+04
Df Residuals:                    1090   BIC:                        -1.189e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
Intercept         

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_pre_ts_analysis['log_Budget_pct'] = np.log(filtered_pre_ts_analysis['Budget_pct'] + 1)  # Adding 1 to avoid log(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_pre_ts_analysis['log_Violent_Crime_pct_lag1'] = np.log(filtered_pre_ts_analysis['Violent Crime_pct_lag1'] + 1)  # Adjust if necessary
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/sta

In [227]:
# Assuming 'filtered_pre_ts_analysis' already contains 'Violent Crime_pct_lag1' and 'Property Crime_pct_lag1'
# Make sure there are no zero or negative values in these columns before log-transforming
filtered_pre_ts_analysis['log_Budget_pct'] = np.log(filtered_pre_ts_analysis['Budget_pct'])  # Adding 1 to avoid log(0)
filtered_pre_ts_analysis['log_Violent_Crime_pct_lag1'] = np.log(filtered_pre_ts_analysis['Violent Crime_pct_lag1'])  # Adjust if necessary
filtered_pre_ts_analysis['log_Budget_pct_lag1'] = np.log(filtered_pre_ts_analysis['Budget_pct_lag1'])  # Adding 1 to avoid log(0)

filtered_pre_ts_analysis['log_Property_Crime_pct_lag1'] = np.log(filtered_pre_ts_analysis['Property Crime_pct_lag1'])  # Adjust if necessary
# Adjusting the regression formula to use the log-transformed variables
regression_formula_log = "log_Budget_pct ~ log_Violent_Crime_pct_lag1 + log_Property_Crime_pct_lag1 + log_Budget_pct_lag1"

# Running the regression
model_log = smf.ols(formula=regression_formula_log, data=filtered_pre_ts_analysis).fit()

# Printing the summary of the model
print(model_log.summary())


                            OLS Regression Results                            
Dep. Variable:         log_Budget_pct   R-squared:                       0.915
Model:                            OLS   Adj. R-squared:                  0.915
Method:                 Least Squares   F-statistic:                     3925.
Date:                Sun, 07 Apr 2024   Prob (F-statistic):               0.00
Time:                        22:22:37   Log-Likelihood:                 1452.2
No. Observations:                1094   AIC:                            -2896.
Df Residuals:                    1090   BIC:                            -2876.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
Intercept         

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_pre_ts_analysis['log_Budget_pct'] = np.log(filtered_pre_ts_analysis['Budget_pct'])  # Adding 1 to avoid log(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_pre_ts_analysis['log_Violent_Crime_pct_lag1'] = np.log(filtered_pre_ts_analysis['Violent Crime_pct_lag1'])  # Adjust if necessary
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user