In [None]:
# File created for initial data cleaning and exploration

In [1]:
import pandas as pd
from prophet import Prophet
import datetime as dt
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Import Austin crime data and moon files
austin_data_file = 'Austin_Crime_Data_Source_File.csv'
moon_data_file = 'moon_data_export.csv'

# Read the data into a pandas dataframe
austin_df = pd.read_csv(austin_data_file)
moon_df = pd.read_csv(moon_data_file)

In [3]:
# Drop Time (Universal Time) column from moon data
moon_df = moon_df.drop(columns=["Time (Universal Time)"])

# Change Date column to datetime format
moon_df['Date'] = pd.to_datetime(moon_df['Date'])

# Display the first 5 rows of the moon data
moon_df.info()
moon_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Moon Phase  500 non-null    object        
 1   Date        500 non-null    datetime64[ns]
dtypes: datetime64[ns](1), object(1)
memory usage: 7.9+ KB


Unnamed: 0,Moon Phase,Date
0,Last Quarter,2023-01-15
1,New Moon,2023-01-21
2,First Quarter,2023-01-28
3,Full Moon,2023-02-05
4,Last Quarter,2023-02-13


In [4]:
# Check the first 5 rows of the Austin data
austin_df.head()

Unnamed: 0,Incident Number,Highest Offense Description,Highest Offense Code,Family Violence,Occurred Date Time,Occurred Date,Occurred Time,Report Date Time,Report Date,Report Time,Location Type,Council District,APD Sector,APD District,Clearance Status,Clearance Date,UCR Category,Category Description,Census Block Group
0,2002923330284,FAMILY DISTURBANCE,3400,N,01/29/2003 05:30,01/29/2003,530.0,11/29/2002 05:30,11/29/2002,530.0,RESIDENCE / HOME,6.0,AD,3,N,,,,4530341000.0
1,2003920010029,DEADLY CONDUCT,408,N,01/01/2003 00:01,01/01/2003,1.0,01/01/2003 00:01,01/01/2003,1.0,RESIDENCE / HOME,4.0,ID,4,N,,13A,Aggravated Assault,4530021000.0
2,2003920010046,BURGLARY NON RESIDENCE,502,N,01/01/2003 00:02,01/01/2003,2.0,01/01/2003 00:02,01/01/2003,2.0,COMMERCIAL / OFFICE BUILDING,2.0,DA,3,,10/18/2003,220,Burglary,4530024000.0
3,2003920010048,DEADLY CONDUCT,408,N,01/01/2003 00:03,01/01/2003,3.0,01/01/2003 00:03,01/01/2003,3.0,RESIDENCE / HOME,4.0,ID,2,C,01/02/2003,13A,Aggravated Assault,4530402000.0
4,2003920010079,RESISTING ARREST OR SEARCH,905,N,01/01/2003 00:06,01/01/2003,6.0,01/01/2003 00:06,01/01/2003,6.0,RESIDENCE / HOME,3.0,CH,2,C,01/01/2003,,,4530009000.0


In [None]:
# Clean up Austin crime data to only include relevant columns: Date as datetime, and Offense
# Fill in missing values and display the first 5 rows of the cleaned data

austin_crime_df = austin_df[['Occurred Date', 'Highest Offense Description']]

# Rename columns
austin_crime_df.columns = ['Date', 'Offense']

# Convert 'Occurred Date' to datetime format and ensure consistency (set day as smallest increment)
austin_crime_df['Date'] = pd.to_datetime(austin_crime_df['Date']).dt.floor('D') # Removes time from date

# Check for cleaned Austin data
austin_crime_df.info()
austin_crime_df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2522585 entries, 0 to 2522584
Data columns (total 2 columns):
 #   Column   Dtype         
---  ------   -----         
 0   Date     datetime64[ns]
 1   Offense  object        
dtypes: datetime64[ns](1), object(1)
memory usage: 38.5+ MB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  austin_crime_df['Date'] = pd.to_datetime(austin_crime_df['Date']).dt.floor('D')


Unnamed: 0,Date,Offense
0,2003-01-29,FAMILY DISTURBANCE
1,2003-01-01,DEADLY CONDUCT
2,2003-01-01,BURGLARY NON RESIDENCE
3,2003-01-01,DEADLY CONDUCT
4,2003-01-01,RESISTING ARREST OR SEARCH


In [6]:
moon_limited = moon_df.drop_duplicates(subset=["Date"])
moon_limited.shape

(495, 2)

In [7]:
# Merge dataframes and fill absent moon phase values with "Other"
merged_Austin_df = austin_crime_df.merge(
    moon_limited,
    how="left",
    on="Date"
).fillna("Other")

#Display the first 5 rows of the merged data
merged_Austin_df.info()
merged_Austin_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2522585 entries, 0 to 2522584
Data columns (total 3 columns):
 #   Column      Dtype         
---  ------      -----         
 0   Date        datetime64[ns]
 1   Offense     object        
 2   Moon Phase  object        
dtypes: datetime64[ns](1), object(2)
memory usage: 57.7+ MB


Unnamed: 0,Date,Offense,Moon Phase
0,2003-01-29,FAMILY DISTURBANCE,Other
1,2003-01-01,DEADLY CONDUCT,Other
2,2003-01-01,BURGLARY NON RESIDENCE,Other
3,2003-01-01,DEADLY CONDUCT,Other
4,2003-01-01,RESISTING ARREST OR SEARCH,Other


In [8]:
# Create a dictionary of full moon and other counts for total rows (i.e. individual crimes)
moon_phase_crimes = merged_Austin_df["Moon Phase"].value_counts().to_dict()

# Replace specific moon phases with "Other"
moon_phase_crimes["Other"] = moon_phase_crimes.pop("New Moon", 0) + moon_phase_crimes.pop("First Quarter", 0) + moon_phase_crimes.pop("Last Quarter", 0)

In [9]:
# Get a total count for non-full-moon dates (i.e. days with "Other")
other_dates = merged_Austin_df[merged_Austin_df["Moon Phase"] == "Other"]
days_other_moon = other_dates["Date"].nunique()
days_other_moon

7552

In [10]:
# Get a total count for individual crimes in each condition
days_full_moon = 124
full_moon_crimes = moon_phase_crimes.get("Full Moon")
other_moon_crimes = moon_phase_crimes.get("Other")
full_moon_crimes, other_moon_crimes 


(33754, 101670)

In [11]:
# Do the math for crime rate and percentage increase
full_crime_rate = full_moon_crimes / days_full_moon
other_crime_rate = other_moon_crimes / days_other_moon
percent_increase = (full_crime_rate / other_crime_rate - 1) * 100

print(f"The crime rate during a full moon is {full_crime_rate:.2f} against a control rate of {other_crime_rate:.2f}.")
print(f"These initial findings indicate a increased crime rate of {percent_increase:.2f}% above the base line.")

The crime rate during a full moon is 272.21 against a control rate of 13.46.
These initial findings indicate a increased crime rate of 1921.96% above the base line.
