In [17]:
pip install prophet


In [11]:
import pandas as pd
from prophet import Prophet
from prophet.diagnostics import performance_metrics
from sklearn.metrics import mean_squared_error
import numpy as np


In [2]:
# Load the 311-calls dataset from the pickle file that ships with the project.
# NOTE(review): unpickling can execute arbitrary code - only load this file
# when it comes from a trusted source.
calls_pickle_path = "Project-3_NYC_311_Calls.pkl"
df = pd.read_pickle(calls_pickle_path)


In [20]:
# Q1. What is the average number of daily complaints received in 2022?

# Make sure 'Created Date' is a datetime column (a no-op if it already is)
df['Created Date'] = pd.to_datetime(df['Created Date'])

# Keep only the complaints created during 2022
df_2022 = df[df['Created Date'].dt.year == 2022]

# Count complaints per calendar day. 'Unique Key' is selected *before* the
# aggregation so that only this one column is counted instead of every column
# of the frame; the result is a Series indexed by day.
daily_complaints = df_2022.resample('D', on='Created Date')['Unique Key'].count()

# Average of the per-day counts
average_daily_complaints = daily_complaints.mean()

print(f"The average number of daily complaints in 2022 was: {average_daily_complaints}")


The average number of daily complaints in 2022 was: 8684.320547945206


In [3]:
# Q2. On which single date were the maximum number of calls received?

# Make sure 'Created Date' is a datetime column (a no-op if it already is)
df['Created Date'] = pd.to_datetime(df['Created Date'])

# Count calls per calendar day over the whole dataset. Only 'Unique Key' is
# counted, so the remaining columns are not aggregated needlessly.
daily_complaints = df.resample('D', on='Created Date')['Unique Key'].count()

# Busiest day (index label of the largest count) and the count on that day
max_complaints_date = daily_complaints.idxmax()
max_complaints_value = daily_complaints.max()

print(f"The maximum number of calls received was on {max_complaints_date.date()} with {max_complaints_value} calls.")

The maximum number of calls received was on 2020-08-04 with 24415 calls.


In [4]:
# Q3. On the date the maximum number of calls were received, what was the most important complaint type?

# Make sure 'Created Date' is a datetime column (a no-op if it already is)
df['Created Date'] = pd.to_datetime(df['Created Date'])

# Calls per calendar day (single-column count) and the day with the most calls
daily_complaints = df.resample('D', on='Created Date')['Unique Key'].count()
max_complaints_date = daily_complaints.idxmax()

# Rows created on the busiest day. normalize() floors every timestamp to
# midnight, so the comparison stays vectorised instead of building one Python
# date object per row as `.dt.date` does.
df_max_date = df[df['Created Date'].dt.normalize() == max_complaints_date]

# Tally complaint types once and read both the top label and its count from it
complaint_type_counts = df_max_date['Complaint Type'].value_counts()
most_common_complaint = complaint_type_counts.idxmax()
complaint_count = complaint_type_counts.max()

print(f"On {max_complaints_date.date()}, the most common complaint type was '{most_common_complaint}' with {complaint_count} occurrences.")


On 2020-08-04, the most common complaint type was 'Damaged Tree' with 14863 occurrences.


In [5]:
# Q4. Quietest month: Group the data by months, and identify the month that historically has the fewest number of calls.

import calendar

# Make sure 'Created Date' is a datetime column (a no-op if it already is)
df['Created Date'] = pd.to_datetime(df['Created Date'])

# Total calls per calendar month number, pooled across all years. Only
# 'Unique Key' is counted rather than every column of the frame.
# NOTE(review): pooled totals are only comparable between months if every
# month occurs in the same number of years - confirm the dataset spans whole
# calendar years.
monthly_complaints = df.groupby(df['Created Date'].dt.month)['Unique Key'].count()

# Month number with the smallest total, and that total
quietest_month = monthly_complaints.idxmin()
quietest_month_count = monthly_complaints.min()

# Translate the month number into its English name for the report line
month_name = calendar.month_name[int(quietest_month)]

print(f"The quietest month historically is {month_name} with {quietest_month_count} calls.")


The quietest month historically is December with 2596986 calls.


In [6]:
# Q5. Resample your time series to a daily frequency.
# Perform ETS decomposition based on an additive model.
# What is the value of the seasonal component on 2020-12-25 (rounded to the nearest integer)?

import statsmodels.api as sm
from statsmodels.tsa.seasonal import seasonal_decompose

# `df` is reused from the load cell at the top of the notebook; the pickle is
# not read a second time, which avoids a redundant load of the full dataset.

# Make sure 'Created Date' is a datetime column (a no-op if it already is)
df['Created Date'] = pd.to_datetime(df['Created Date'])

# Daily series: number of complaints per calendar day
daily_data = df.resample('D', on='Created Date')['Unique Key'].count()

# Additive decomposition of the daily series
decomposition = seasonal_decompose(daily_data, model='additive', period=365)  # Assuming yearly seasonality

# Seasonal component of the decomposition
seasonal = decomposition.seasonal

# Explicit label-based lookup of the value for December 25, 2020
seasonal_value = seasonal.loc['2020-12-25']

print(f"The seasonal component on 2020-12-25 is approximately {round(seasonal_value)}")



The seasonal component on 2020-12-25 is approximately -1106


In [8]:
# Q6. Calculate the autocorrelation of the number of daily calls with the number of calls the day prior, ie lag of 1.  (Use the daily series).

# 'Created Date' must be datetime-typed before it can drive the resampling
df['Created Date'] = pd.to_datetime(df['Created Date'])

# Build the daily series: one complaint count per calendar day
calls_by_day = df.resample('D', on='Created Date')
daily_data = calls_by_day['Unique Key'].count()

# Correlate each day's count with the count of the previous day
lag_in_days = 1
autocorrelation_lag_1 = daily_data.autocorr(lag=lag_in_days)

print(f"The autocorrelation of the number of daily calls with the number of calls the day prior (lag of 1) is: {autocorrelation_lag_1}")


The autocorrelation of the number of daily calls with the number of calls the day prior (lag of 1) is: 0.7517059728398577


In [12]:
# Q7. Forecast the daily series with a test set of 90 days using the Prophet library.
# What is your RMSE on your test set?

# Make sure 'Created Date' is a datetime column (a no-op if it already is), so
# this cell does not depend on another cell having converted it beforehand.
df['Created Date'] = pd.to_datetime(df['Created Date'])

# Daily complaint counts (only 'Unique Key' is counted), turned back into a
# two-column frame: the day and the number of complaints on that day
df_daily = df.resample('D', on='Created Date')['Unique Key'].count().reset_index()

# Rename the two columns to 'ds' (date) and 'y' (value) for the Prophet model
df_daily.columns = ['ds', 'y']


In [13]:
# Cut-off for the hold-out window: 90 days before the most recent day in the series
last_day = df_daily['ds'].max()
split_date = last_day - pd.Timedelta(days=90)

# Everything up to and including the cut-off trains the model;
# the days after it form the test set
train = df_daily.loc[df_daily['ds'] <= split_date]
test = df_daily.loc[df_daily['ds'] > split_date]

In [14]:
# Create a Prophet model with its default settings and fit it on the training
# window; the value returned by fit() is shown as this cell's output.
model = Prophet()
model.fit(df=train)

20:36:33 - cmdstanpy - INFO - Chain [1] start processing
20:36:35 - cmdstanpy - INFO - Chain [1] done processing


<prophet.forecaster.Prophet at 0x7f98f5ab8a90>

In [15]:
# Prediction input: the test rows without the observed values
future = test.drop(columns='y')

# Forecast for each row of the prediction input
forecast = model.predict(future)

In [16]:
# Root-mean-squared error between the observed test values and the forecast
observed = test['y']
predicted = forecast['yhat']
mse = mean_squared_error(observed, predicted)
rmse = np.sqrt(mse)
print(f"RMSE on the test set is: {rmse}")


RMSE on the test set is: 1233.7312313093698
