In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the footfall CSV from the parent directory and preview the first rows.
csv_path = '../footfall_735.csv'
data = pd.read_csv(csv_path)
data.head()

In [None]:
# Drop the two 'Unnamed' columns ('Unnamed: 0.1' and 'Unnamed: 0').
# Rebinding `data` with errors='ignore' (instead of two inplace drops) keeps
# this cell safe to re-run: a second execution no longer raises KeyError
# because the columns are already gone.
data = data.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'], errors='ignore')
data.head()

In [None]:
# Convert the DATE column to datetimes and store the result in a new `date` column.
parsed_dates = pd.to_datetime(data['DATE'])
data['date'] = parsed_dates
data.head()

In [None]:
# The original DATE column is no longer needed once `date` exists; remove it in place.
data.drop(columns=['DATE'], inplace=True)

In [None]:
# Copy of `data` without the `month` column.
# NOTE(review): the following cell rebuilds data2 from `data`, so this value is overwritten there.
data2 = data.drop(columns='month')

In [None]:
# One-hot encode the day of week and the meal type in a single call.
# The dummy columns are appended after the untouched columns, `day_*` first
# and `meal_type_*` second.
data2 = pd.get_dummies(data, columns=['day', 'meal_type'])

In [None]:
# Preview the one-hot encoded frame.
data2.head()

In [None]:
# Start the modelling frame from data2 with the `date` column removed.
data3 = data2.drop(columns='date')

In [None]:
# Modelling frame: data2 without the `date` and `month` columns.
# Deriving it from data2 (rather than from data3 itself) makes the cell
# idempotent: re-running it no longer raises KeyError on an already-dropped
# `month` column.
data3 = data2.drop(columns=['date', 'month'])

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [None]:
# Separate the target (`footfall`) from the predictors, then hold out 30% of
# the rows for testing with a fixed seed.
y = data3['footfall']
X = data3.drop(columns='footfall')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=101
)

In [None]:
# Linear regression baseline, fitted on the training split.
lm = LinearRegression()
lm.fit(X=X_train, y=y_train)

In [None]:
# Score the linear model on the held-out rows.
predictions = lm.predict(X_test)

# squared-error and absolute-error metrics against the true test targets
mse = mean_squared_error(y_true=y_test, y_pred=predictions)
mae = mean_absolute_error(y_true=y_test, y_pred=predictions)

print('Mean Squared Error: ', mse)
print('Mean Absolute Error: ', mae)

In [None]:
# calculating the root mean squared error
# (square root of the `mse` computed for the linear model in the previous cell)
rmse = np.sqrt(mse)
print('Root Mean Squared Error: ', rmse)

In [None]:
# Display the coefficient array of the fitted linear model.
lm.coef_

In [None]:
# Label each linear-model coefficient with the feature column it belongs to.
coefficients = pd.DataFrame(
    lm.coef_,
    index=X.columns,
    columns=['Coefficients'],
)
coefficients

In [None]:
# Predicted vs. actual footfall on the test split.
# The dashed red diagonal spans the range of y_test and marks where a
# prediction would equal the true value.
actual_range = [y_test.min(), y_test.max()]
plt.scatter(y_test, predictions)
plt.plot(actual_range, actual_range, 'r--')
plt.xlabel('Y Test')
plt.ylabel('Predicted Y')
plt.show()

In [None]:
# Distribution of the test residuals (actual minus predicted).
# sns.distplot is deprecated; histplot with a KDE overlay and density
# scaling is the documented replacement for the same histogram-plus-curve view.
sns.histplot(y_test - predictions, bins=50, kde=True, stat='density')
plt.show()

In [None]:
# using a poisson regression model
import statsmodels.api as sm
from statsmodels.formula.api import glm

In [None]:
# Poisson regression (GLM with the Poisson family) of footfall on every feature.
# The model is fitted on the training split only, so the test-set metrics
# computed afterwards are not inflated by rows the model has already seen;
# this also matches how the linear model `lm` was trained.
train_df = X_train.assign(footfall=y_train)

model = glm(
    formula='footfall ~ day_Monday + day_Tuesday + day_Wednesday + bogo + paneer + day_Thursday + day_Friday + guest + test + day_Saturday + max_possible_footfall + day_Sunday + meal_type_Breakfast + meal_type_Dinner + meal_type_Lunch',
    data=train_df,
    family=sm.families.Poisson(),
).fit()

print(model.summary())

In [None]:
# Evaluate the first Poisson model on the held-out rows (predict once, reuse for both metrics).
poisson_test_pred = model.predict(X_test)

mse = mean_squared_error(y_test, poisson_test_pred)
print('Mean Squared Error: ', mse)

mae = mean_absolute_error(y_test, poisson_test_pred)
print('Mean Absolute Error: ', mae)

In [None]:
# Root mean squared error of the first Poisson model (`mse` was reassigned in the previous cell).
print('Root Mean Squared Error: ', np.sqrt(mse))

In [None]:
# Reduced Poisson model: keeps only the Saturday/Sunday day indicators
# alongside bogo, paneer, guest, test, max_possible_footfall and the meal types.
# Fitted on the training split only so that the test-set metrics that follow
# measure out-of-sample error instead of scoring rows used for fitting.
train_df = X_train.assign(footfall=y_train)

model2 = glm(
    formula='footfall ~ bogo + paneer + day_Sunday + day_Saturday + guest + test + max_possible_footfall + meal_type_Breakfast + meal_type_Dinner + meal_type_Lunch',
    data=train_df,
    family=sm.families.Poisson(),
).fit()

print(model2.summary())

In [None]:
# Evaluate the reduced Poisson model on the held-out rows (predict once, reuse for both metrics).
model2_test_pred = model2.predict(X_test)

mse2 = mean_squared_error(y_test, model2_test_pred)
print('Mean Squared Error: ', mse2)

mae2 = mean_absolute_error(y_test, model2_test_pred)
print('Mean Absolute Error: ', mae2)

In [None]:
# Root mean squared error of the reduced Poisson model.
# Uses mse2 (model2's error); `mse` belongs to the first Poisson model.
print('Root Mean Squared Error: ', np.sqrt(mse2))

In [None]:
# Predicted vs. actual footfall for model2 on the test split.
# `predictions` is rebound to model2's output here; the dashed red diagonal
# spans the range of y_test and marks where prediction equals the true value.
predictions = model2.predict(X_test)
actual_range = [y_test.min(), y_test.max()]
plt.scatter(y_test, predictions)
plt.plot(actual_range, actual_range, 'r--')
plt.xlabel('Y Test')
plt.ylabel('Predicted Y')
plt.show()

In [None]:
# Distribution of the test residuals (actual minus predicted).
# sns.distplot is deprecated; histplot with a KDE overlay and density
# scaling is the documented replacement for the same histogram-plus-curve view.
sns.histplot(y_test - predictions, bins=50, kde=True, stat='density')
plt.show()

In [None]:
# Attach model2's prediction and its absolute error to every row of data3,
# then show the rows whose absolute error is below 38.
# NOTE(review): the filter keeps errors *under* 38 (the same condition the next
# cell uses for its percentage), not the rows that are off by more than 38.
data3['predicted_footfall'] = model2.predict(X)
data3['difference'] = (data3['predicted_footfall'] - data3['footfall']).abs()
data3[data3['difference'] < 38]

In [None]:
# Percentage of rows whose absolute prediction error is below 38.
within_tolerance = data3.loc[data3['difference'] < 38]
(len(within_tolerance) / len(data3)) * 100

In [None]:
# Coefficient table for model2, one row per fitted term.
# model2.params is already a Series indexed by term name. Passing X.columns as
# the index (as before) reindexes that Series: the Intercept is dropped and
# every column not in model2's formula becomes NaN. to_frame keeps the
# model's own term labels.
coefficients = model2.params.to_frame(name='Coefficients')
coefficients

In [None]:
# saving the model to a pickle file
# (writes the fitted model2 results object to 'poissonreg.pkl' in the current working directory)
import joblib
joblib.dump(model2, 'poissonreg.pkl')

In [None]:
# Display the dtype of every data3 column (now including predicted_footfall and difference).
data3.dtypes

In [None]:
# Mean absolute error of model2 over all rows of data3.
data3['difference'].abs().mean()

In [None]:
import seaborn as sns

In [None]:
# Keep only the input features for the correlation heatmap: remove the target,
# the two derived prediction columns and max_possible_footfall.
columns_to_drop = [
    'footfall',
    'predicted_footfall',
    'difference',
    'max_possible_footfall',
]
for_corr = data3.drop(columns=columns_to_drop)

In [None]:
# Heatmap of the pairwise correlations between the remaining columns.
plt.figure(figsize=(10, 5))
corr_matrix = for_corr.corr()
sns.heatmap(corr_matrix)

In [None]:
# Random oversampling of the training rows, then a Poisson fit on the result.
from imblearn.over_sampling import RandomOverSampler

# Split BEFORE resampling. Oversampling duplicates rows, so resampling first
# would put copies of the same row in both the train and the test partition
# and make the test metrics meaningless.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

# NOTE(review): RandomOverSampler is a classification tool. With its default
# strategy it treats each distinct footfall value as a class and duplicates
# rows until every value is as frequent as the most common one, so the size
# increase depends on the data (it is not a fixed 1.5x) - confirm this is the
# intended resampling scheme for a count target.
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

print('Original dataset shape', len(X_train))
print('Resampled dataset shape', len(X_resampled))

# Combine the resampled features and target into one frame for the formula API.
X_resampled = pd.DataFrame(X_resampled, columns=X.columns)
y_resampled = pd.DataFrame(y_resampled, columns=['footfall'])
X_resampled['footfall'] = y_resampled['footfall']

formula = 'footfall ~ bogo + paneer + day_Sunday + day_Saturday + guest + test + max_possible_footfall + meal_type_Breakfast + meal_type_Dinner + meal_type_Lunch'

# Fit on the resampled TRAINING rows only; X_test / y_test stay untouched
# originals for the evaluation cells that follow.
poisson2 = glm(formula=formula, data=X_resampled, family=sm.families.Poisson()).fit()

# printing the summary of the model
print(poisson2.summary())

In [None]:
# Mean absolute error of the oversampled Poisson model on the test split.
poisson2_test_pred = poisson2.predict(X_test)
mae = mean_absolute_error(y_test, poisson2_test_pred)
print('Mean Absolute Error: ', mae)

In [None]:
# Exploratory plot: footfall by day of the week, drawn from the
# un-encoded `data` frame (which still has the categorical `day` column).
# The former mid-cell `data3.head()` was removed: a bare expression that is not
# the last line of a cell is never displayed, so it had no effect.
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x='day', y='footfall', data=data, ax=ax)
ax.set_title('Footfall for each day of the week')
plt.show()

In [None]:
# Bar plot of footfall against the `guest` column (only `guest` is plotted here).
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x='guest', y='footfall', data=data, ax=ax)
ax.set_title('Guest vs Footfall')
plt.show()

In [None]:
# R-squared of the oversampled Poisson model's predictions on the test split.
from sklearn.metrics import r2_score
r2_score(y_true=y_test, y_pred=poisson2.predict(X_test))