In [1]:
import pandas as pd
from prophet import Prophet
import datetime as dt
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
# Import Houston crime data and moon files
houston_data_file = 'Houston_Crime_Cleaned.csv'
moon_data_file = 'moon_data_export.csv'

# Read the data into a pandas dataframe
houston_df= pd.read_csv(houston_data_file)
moon_df = pd.read_csv(moon_data_file)

In [12]:
# Drop Time (Universal Time) column from moon data
moon_df = moon_df.drop(columns=["Time (Universal Time)"])

# Display the first 5 rows of the moon data
moon_df.head()

Unnamed: 0,Moon Phase,Date
0,Last Quarter,2023-01-15
1,New Moon,2023-01-21
2,First Quarter,2023-01-28
3,Full Moon,2023-02-05
4,Last Quarter,2023-02-13


In [13]:
# Check the first few rows of the Houston data
houston_df.head()

Unnamed: 0,Offense Date,Offense
0,2016-02-13,Aggravated Assault
1,2016-02-16,Theft
2,2016-02-19,Aggravated Assault
3,2016-02-19,Theft
4,2016-02-20,Burglary


In [14]:
# Clean up Houston crime data to only include relevant columns, Date as datetime, and Offense
# and fill in missing values

houston_crime_df = houston_df[['Offense Date', 'Offense']]

# Rename columns
houston_crime_df.columns = ['Date', 'Offense']

# Convert 'Occurred Date' to datetime format and ensure consistency (set day as smallest increment)
houston_crime_df['Date'] = pd.to_datetime(houston_crime_df['Date'], errors='coerce').dt.normalize()

# Check for cleaned Austin data
houston_crime_df.head()

Unnamed: 0,Date,Offense
0,2016-02-13,Aggravated Assault
1,2016-02-16,Theft
2,2016-02-19,Aggravated Assault
3,2016-02-19,Theft
4,2016-02-20,Burglary


In [18]:
houston_crime_df.info()
moon_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1185474 entries, 0 to 1185473
Data columns (total 2 columns):
 #   Column   Non-Null Count    Dtype         
---  ------   --------------    -----         
 0   Date     1185474 non-null  datetime64[ns]
 1   Offense  1185474 non-null  object        
dtypes: datetime64[ns](1), object(1)
memory usage: 18.1+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Moon Phase  500 non-null    object
 1   Date        500 non-null    object
dtypes: object(2)
memory usage: 7.9+ KB


In [19]:
moon_limited = moon_df.drop_duplicates(subset=["Date"])
moon_limited.shape

(495, 2)

In [24]:
# Ensure 'Date' column in moon_limited is in datetime format
moon_limited['Date'] = pd.to_datetime(moon_limited['Date'], errors='coerce').dt.normalize()

# Merge dataframes and fill absent moon phase values with "Other"
merged_Houston_df = houston_crime_df.merge(
    moon_limited,
    how="left",
    on="Date"
).fillna("Other")

# Display the first 5 rows of the merged data
merged_Houston_df.info()
merged_Houston_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  moon_limited['Date'] = pd.to_datetime(moon_limited['Date'], errors='coerce').dt.normalize()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1185474 entries, 0 to 1185473
Data columns (total 3 columns):
 #   Column      Non-Null Count    Dtype         
---  ------      --------------    -----         
 0   Date        1185474 non-null  datetime64[ns]
 1   Offense     1185474 non-null  object        
 2   Moon Phase  1185474 non-null  object        
dtypes: datetime64[ns](1), object(2)
memory usage: 27.1+ MB


Unnamed: 0,Date,Offense,Moon Phase
0,2016-02-13,Aggravated Assault,Other
1,2016-02-16,Theft,Other
2,2016-02-19,Aggravated Assault,Other
3,2016-02-19,Theft,Other
4,2016-02-20,Burglary,Other


In [26]:
# Create a dictionary of full moon and other counts for total rows (i.e. individual crimes)
moon_phase_crimes = merged_Houston_df["Moon Phase"].value_counts().to_dict()

# Replace specific moon phases with "Other"
moon_phase_crimes["Other"] = moon_phase_crimes.pop("New Moon", 0) + moon_phase_crimes.pop("First Quarter", 0) + moon_phase_crimes.pop("Last Quarter", 0)
moon_phase_crimes 

{'Other': 67716, 'Full Moon': 22846}

In [27]:
# Get a total count for non-full-moon dates (i.e. days with "Other")
other_dates = merged_Houston_df[merged_Houston_df["Moon Phase"] == "Other"]
days_other_moon = other_dates["Date"].nunique()
days_other_moon

3041

In [28]:
# Get a total count for individual crimes in each condition
days_full_moon = 124
full_moon_crimes = moon_phase_crimes.get("Full Moon")
other_moon_crimes = moon_phase_crimes.get("Other")
full_moon_crimes, other_moon_crimes 

(22846, 67716)

In [29]:
# Do the math for crime rate and percentage increase
full_crime_rate = full_moon_crimes / days_full_moon
other_crime_rate = other_moon_crimes / days_other_moon
percent_increase = (full_crime_rate / other_crime_rate - 1) * 100

print(f"The crime rate during a full moon is {full_crime_rate:.2f} against a control rate of {other_crime_rate:.2f}.")
print(f"These initial findings indicate a increased crime rate of {percent_increase:.2f}% above the base line.")

The crime rate during a full moon is 184.24 against a control rate of 22.27.
These initial findings indicate a increased crime rate of 727.40% above the base line.
