In [2]:
import pandas as pd
from prophet import Prophet
import datetime as dt
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Import Austin crime data and moon files
chicago_data_file = 'ChicagoCrime.csv'
moon_data_file = 'MoonPhase.csv'

# Read the data into a pandas dataframe
chicago_df = pd.read_csv(chicago_data_file)
moon_df = pd.read_csv(moon_data_file)
chicago_df

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,...,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location
0,12045583,JD226426,05/07/2020 10:24:00 AM,035XX S INDIANA AVE,0820,THEFT,$500 AND UNDER,APARTMENT,False,False,...,3,35,06,1178180,1881621,2020,05/14/2020 08:47:15 AM,41.830482,-87.621752,"(41.830481843, -87.621751752)"
1,12031001,JD209965,04/16/2020 05:00:00 AM,005XX W 32ND ST,0460,BATTERY,SIMPLE,APARTMENT,True,False,...,11,60,08B,1173292,1883705,2020,04/23/2020 03:45:11 PM,41.836310,-87.639624,"(41.836310224, -87.639624112)"
2,12093529,JD282112,07/01/2020 10:16:00 AM,081XX S COLES AVE,051A,ASSAULT,AGGRAVATED - HANDGUN,STREET,True,False,...,7,46,04A,1198234,1851595,2020,07/08/2020 03:41:45 PM,41.747610,-87.549179,"(41.747609555, -87.549179329)"
3,12178140,JD381597,09/27/2020 11:29:00 PM,065XX S WOLCOTT AVE,0460,BATTERY,SIMPLE,RESIDENCE - PORCH / HALLWAY,False,False,...,15,67,08B,1164812,1861251,2020,10/04/2020 03:43:55 PM,41.774878,-87.671375,"(41.77487752, -87.671374872)"
4,4144897,HL474854,07/10/2005 03:00:00 PM,062XX S ABERDEEN ST,0430,BATTERY,AGGRAVATED: OTHER DANG WEAPON,STREET,False,False,...,16,68,04B,1170050,1863524,2005,02/28/2018 03:56:25 PM,41.781003,-87.652107,"(41.781002663, -87.652107119)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
680374,12767402,JF324507,07/15/2022 06:12:00 AM,013XX W 112TH ST,0810,THEFT,OVER $500,RESIDENCE - PORCH / HALLWAY,False,False,...,34,75,06,1169339,1830494,2022,01/03/2023 03:40:27 PM,41.690379,-87.655667,"(41.690379065, -87.655667273)"
680375,12749147,JF302263,07/01/2022 04:20:00 PM,011XX S CLARK ST,0860,THEFT,RETAIL THEFT,DEPARTMENT STORE,False,False,...,4,32,06,1175696,1895352,2022,01/03/2023 03:40:27 PM,41.868217,-87.630453,"(41.86821682, -87.6304532)"
680376,12664897,JF200943,04/07/2022 04:00:00 PM,025XX W LEXINGTON ST,0910,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,False,...,28,28,07,1159679,1896619,2022,01/03/2023 03:40:27 PM,41.872038,-87.689220,"(41.872038417, -87.689219804)"
680377,12795947,JF358991,08/16/2022 02:45:00 PM,032XX W CONGRESS PKWY,2017,NARCOTICS,MANUFACTURE / DELIVER - CRACK,PARK PROPERTY,True,False,...,28,27,18,1154781,1897566,2022,01/03/2023 03:40:27 PM,41.874737,-87.707177,"(41.874736534, -87.707177121)"


In [4]:
# Drop Time (Universal Time) column from moon data
moon_df = moon_df.drop(columns=["Time (Universal Time)"])

# Change Date column to datetime format
moon_df['Date'] = pd.to_datetime(moon_df['Date'])

# Display the first 5 rows of the moon data
moon_df

Unnamed: 0,Moon Phase,Date
0,Last Quarter,2023-01-15
1,New Moon,2023-01-21
2,First Quarter,2023-01-28
3,Full Moon,2023-02-05
4,Last Quarter,2023-02-13
...,...,...
495,Last Quarter,2014-12-14
496,New Moon,2014-12-22
497,First Quarter,2014-12-28
498,Full Moon,2015-01-05


In [6]:
# Clean up Austin crime data to only include relevant columns: Date as datetime, and Offense
# Fill in missing values and display the first 5 rows of the cleaned data

chicago_crime_df = chicago_df[["ID", 'Date', 'Primary Type']]

# Rename columns
chicago_crime_df.columns = ['ID', 'Date', 'Offense']

# Convert 'Occurred Date' to datetime format and ensure consistency (set day as smallest increment)
chicago_crime_df['Date'] = pd.to_datetime(chicago_crime_df['Date']).dt.floor('D')

# Check for cleaned Austin data
chicago_crime_df.info()
chicago_crime_df


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 680379 entries, 0 to 680378
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype         
---  ------   --------------   -----         
 0   ID       680379 non-null  int64         
 1   Date     680379 non-null  datetime64[ns]
 2   Offense  680379 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 15.6+ MB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chicago_crime_df['Date'] = pd.to_datetime(chicago_crime_df['Date']).dt.floor('D')


Unnamed: 0,ID,Date,Offense
0,12045583,2020-05-07,THEFT
1,12031001,2020-04-16,BATTERY
2,12093529,2020-07-01,ASSAULT
3,12178140,2020-09-27,BATTERY
4,4144897,2005-07-10,BATTERY
...,...,...,...
680374,12767402,2022-07-15,THEFT
680375,12749147,2022-07-01,THEFT
680376,12664897,2022-04-07,MOTOR VEHICLE THEFT
680377,12795947,2022-08-16,NARCOTICS


In [12]:
# Possibly redundant step to ensure crime data is only as long as moon data set
max_moon_date = moon_df["Date"].max()
min_moon_date = moon_df["Date"].min()
chicago_crime_df = chicago_crime_df[
    (chicago_crime_df["Date"] <= max_moon_date) &
    (chicago_crime_df["Date"] >= min_moon_date)]

chicago_crime_df.shape

(453796, 3)

In [13]:
moon_limited = moon_df.drop_duplicates(subset=["Date"])
moon_limited.shape

(495, 2)

In [15]:
# Merge dataframes and fill absent moon phase values with "Other"
merged_chicago_df = chicago_crime_df.merge(
    moon_limited,
    how="left",
    on="Date"
).fillna("Other")

#Display the first 5 rows of the merged data
merged_chicago_df

Unnamed: 0,ID,Date,Offense,Moon Phase
0,12045583,2020-05-07,THEFT,Full Moon
1,12031001,2020-04-16,BATTERY,Other
2,12093529,2020-07-01,ASSAULT,Other
3,12178140,2020-09-27,BATTERY,Other
4,12126129,2020-08-04,WEAPONS VIOLATION,Other
...,...,...,...,...
453791,12767402,2022-07-15,THEFT,Other
453792,12749147,2022-07-01,THEFT,Other
453793,12664897,2022-04-07,MOTOR VEHICLE THEFT,Other
453794,12795947,2022-08-16,NARCOTICS,Other


In [73]:
merged_chicago_df["Moon Phase"].value_counts()

Moon Phase
Other            864728
New Moon          34103
Last Quarter      33802
First Quarter     33765
Full Moon         33754
Name: count, dtype: int64

In [16]:
merged_chicago_df["Moon Phase"] = merged_chicago_df["Moon Phase"].replace(["New Moon", "Last Quarter", "First Quarter"], "Other")
merged_chicago_df["Moon Phase"].value_counts()

Moon Phase
Other        438229
Full Moon     15567
Name: count, dtype: int64

In [17]:
# Create a dictionary of full moon and other counts for total rows (i.e. individual crimes)
moon_phase_crimes = merged_chicago_df["Moon Phase"].value_counts().to_dict()

In [18]:
# Get a total count for non-full-moon dates (i.e. days with "Other")
crimes_other_moon = merged_chicago_df[merged_chicago_df["Moon Phase"] == "Other"]
days_other_moon = crimes_other_moon["Date"].nunique()
days_other_moon

2121

In [19]:

crimes_full_moon = merged_chicago_df[merged_chicago_df["Moon Phase"] == "Full Moon"]
days_full_moon = crimes_full_moon["Date"].nunique()
days_full_moon

76

In [20]:
# Get a total count for individual crimes in each condition
full_moon_crimes = moon_phase_crimes.get("Full Moon")
other_moon_crimes = moon_phase_crimes.get("Other")
full_moon_crimes, other_moon_crimes 


(15567, 438229)

In [21]:
# Do the math for crime rate and percentage increase
full_crime_rate = full_moon_crimes / days_full_moon
other_crime_rate = other_moon_crimes / days_other_moon
percent_increase = (full_crime_rate / other_crime_rate - 1) * 100

print(f"The crime rate during a full moon is {full_crime_rate:.2f} against a control rate of {other_crime_rate:.2f}.")
print(f"These initial findings indicate a increased crime rate of {percent_increase:.2f}% above the base line.")

The crime rate during a full moon is 204.83 against a control rate of 206.61.
These initial findings indicate a increased crime rate of -0.86% above the base line.


In [23]:
merged_chicago_df["Offense"].value_counts()

Offense
THEFT                                93944
BATTERY                              85063
CRIMINAL DAMAGE                      53321
ASSAULT                              42161
DECEPTIVE PRACTICE                   34312
MOTOR VEHICLE THEFT                  30278
OTHER OFFENSE                        29165
ROBBERY                              18657
WEAPONS VIOLATION                    18254
BURGLARY                             15053
NARCOTICS                             9910
CRIMINAL TRESPASS                     7863
OFFENSE INVOLVING CHILDREN            4054
CRIMINAL SEXUAL ASSAULT               3123
SEX OFFENSE                           2343
PUBLIC PEACE VIOLATION                1474
ARSON                                 1085
INTERFERENCE WITH PUBLIC OFFICER       857
STALKING                               784
CONCEALED CARRY LICENSE VIOLATION      365
LIQUOR LAW VIOLATION                   356
HOMICIDE                               349
PROSTITUTION                           334
INT

In [24]:
# Same thing for assault, which for this dataset is simply just called "ASSAULT"
assault_crimes_df = merged_chicago_df[merged_chicago_df["Offense"] == "ASSAULT"]
assault_crimes_df

Unnamed: 0,ID,Date,Offense,Moon Phase
2,12093529,2020-07-01,ASSAULT,Other
13,12107263,2020-07-15,ASSAULT,Other
15,12164381,2020-09-09,ASSAULT,Other
54,12171016,2020-09-20,ASSAULT,Other
59,12095243,2020-07-03,ASSAULT,Other
...,...,...,...,...
453751,12732291,2022-06-16,ASSAULT,Other
453758,12682740,2022-04-27,ASSAULT,Other
453765,12767917,2022-07-20,ASSAULT,Other
453783,12788772,2022-08-09,ASSAULT,Other


In [25]:
moon_phase_assault = assault_crimes_df["Moon Phase"].value_counts().to_dict()
moon_phase_assault

{'Other': 40674, 'Full Moon': 1487}

In [26]:
full_moon_assault = moon_phase_assault.get("Full Moon")
other_moon_assault = moon_phase_assault.get("Other")
full_moon_assault, other_moon_assault

(1487, 40674)

In [27]:
# Assault category is worth further inquiry in other datasets
full_assault_rate = full_moon_assault / days_full_moon
other_assault_rate = other_moon_assault / days_other_moon
percent_increase_a = (full_assault_rate / other_assault_rate - 1) * 100

print(f"The assault rate during a full moon is {full_assault_rate:.2f} against a control rate of {other_assault_rate:.2f}.")
print(f"These initial findings indicate an increased assault rate of {percent_increase_a:.2f}% above the baseline.")

The assault rate during a full moon is 19.57 against a control rate of 19.18.
These initial findings indicate an increased assault rate of 2.03% above the baseline.
