In [None]:
import pandas as pd
# Load the footfall dataset into `df`; the neural-network cell further down
# builds its features and target from this frame.
df = pd.read_csv('footfall_735.csv')

In [None]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load the dataset
data = pd.read_csv('footfall_735.csv')

# Preprocessing
# One-hot encode the categorical columns; every other column is passed through
# unchanged. handle_unknown='ignore' keeps predict() from raising when the test
# split contains a category that never occurred in the training split.
categorical_features = ['day', 'month', 'meal_type']
one_hot = OneHotEncoder(handle_unknown='ignore')
transformer = ColumnTransformer([("one_hot", one_hot, categorical_features)], remainder="passthrough")

# Define target and features ('DATE' is dropped rather than used as a feature)
X = data.drop(['footfall', 'DATE'], axis=1)
y = data['footfall']

# Splitting data into training and test sets (80/20, seeded)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define models. The tree-based estimators are stochastic, so they get the same
# seed as the split; without it the MAE ranking changes from run to run.
models = {
    "Linear Regression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "ElasticNet": ElasticNet(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42)
}

# Create a pipeline; the 'model' step is a placeholder that is swapped out
# for each candidate in the loop below.
pipeline = Pipeline([
    ('preprocessor', transformer),
    ('model', LinearRegression())
])

# Dictionary to store MAE scores, keyed by model display name
mae_scores = {}

# Training and evaluating models: fit on the training split, score on the test split
for name, model in models.items():
    pipeline.set_params(model=model)
    pipeline.fit(X_train, y_train)
    predictions = pipeline.predict(X_test)
    mae_scores[name] = mean_absolute_error(y_test, predictions)


# MAE values for two models that are not trained in this cell, entered by hand.
# NOTE(review): presumably transcribed from earlier runs of the Poisson and
# neural-network cells further down; they will not update when those cells are
# re-run — confirm they are still current.
mae_scores["Poisson Regression"] = 35.2
mae_scores["ANN"] = 53.8

# Models ordered from lowest to highest MAE
sorted_mae_scores = sorted(mae_scores.items(), key=lambda x: x[1])

# Highlighting the two lowest bars
best_two = {name for name, _ in sorted_mae_scores[:2]}
highlight_colors = ['green' if name in best_two else 'skyblue' for name in mae_scores]

plt.figure(figsize=(10, 6))
bars = plt.barh(range(len(mae_scores)), list(mae_scores.values()), align='center', color=highlight_colors)
plt.yticks(range(len(mae_scores)), list(mae_scores.keys()))
plt.ylabel('Regression Models')
plt.xlabel('Mean Absolute Error (MAE)')
plt.title('MAE Scores for Different Regression Models')

# Displaying the values on the right side of the bars. The label is anchored by
# its left edge; centring it on the anchor would draw half of it over the bar.
for bar, val in zip(bars, mae_scores.values()):
    plt.text(bar.get_width() + 0.5, bar.get_y() + bar.get_height()/2, f'{val: .2f}', ha='left', va='center')

plt.show()


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import numpy as np

# Load the dataset
data = pd.read_csv('footfall_735.csv')

# Preprocessing: one-hot encode the categorical columns, pass the rest through.
# handle_unknown='ignore' avoids a failure if a category is missing from the
# training split.
categorical_features = ['day', 'meal_type']
one_hot = OneHotEncoder(handle_unknown='ignore')
transformer = ColumnTransformer([("one_hot", one_hot, categorical_features)], remainder="passthrough")

# Define target and features ('month' is excluded from this model)
X = data.drop(['footfall', 'DATE', 'month'], axis=1)
y = data['footfall']

# Splitting data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline for Gradient Boosting (seeded so the importances are reproducible)
gradient_boosting_pipeline = Pipeline([
    ('preprocessor', transformer),
    ('model', GradientBoostingRegressor(random_state=42))
])

# Train the Gradient Boosting model
gradient_boosting_pipeline.fit(X_train, y_train)

# Extract feature importances (one value per column of the transformed matrix)
feature_importances = gradient_boosting_pipeline.named_steps['model'].feature_importances_

# Get feature names after one-hot encoding from the pipeline.
# The ColumnTransformer emits the one-hot columns first, followed by the
# passthrough columns in their original order.
one_hot_transformer = gradient_boosting_pipeline.named_steps['preprocessor'].named_transformers_['one_hot']
feature_names = one_hot_transformer.get_feature_names_out(input_features=categorical_features)
other_feature_names = X.drop(categorical_features, axis=1).columns
all_feature_names = np.concatenate([feature_names, other_feature_names])

# Map every transformed column back to the input column it came from.
# This uses the encoder's fitted categories instead of a substring test on the
# name: a check such as `'day' in name` would also swallow unrelated passthrough
# columns whose name merely contains "day".
importance_groups = []
for column, categories in zip(categorical_features, one_hot_transformer.categories_):
    importance_groups.extend([column] * len(categories))
importance_groups.extend(other_feature_names)
assert len(importance_groups) == len(feature_importances), "feature/importance length mismatch"

# Aggregate feature importances per original input column
aggregated_importances = {}
for key, importance in zip(importance_groups, feature_importances):
    aggregated_importances[key] = aggregated_importances.get(key, 0) + importance

# Ensure the aggregated importances sum to 1 (skipped if every importance is zero)
total_importance = sum(aggregated_importances.values())
if total_importance > 0:
    for key in aggregated_importances:
        aggregated_importances[key] /= total_importance

# Horizontal bar chart of the normalised, per-column importances
importance_labels = list(aggregated_importances.keys())
importance_values = list(aggregated_importances.values())

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(importance_labels, importance_values)
ax.set_xlabel('Adjusted Aggregated Feature Importance')
ax.set_ylabel('Feature Categories')
ax.set_title('Adjusted Aggregated Feature Importances in Gradient Boosting Model')

# Flip the y axis so the first entry of the dict is drawn at the top
ax.invert_yaxis()
plt.show()


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import numpy as np

# Load the dataset
data = pd.read_csv('footfall_735.csv')

# Preprocessing: one-hot encode the categorical columns, pass the rest through.
# handle_unknown='ignore' matters during cross-validation: a fold whose training
# part lacks a category would otherwise fail when scoring the held-out part.
categorical_features = ['day', 'meal_type']
one_hot = OneHotEncoder(handle_unknown='ignore')
transformer = ColumnTransformer([("one_hot", one_hot, categorical_features)], remainder="passthrough")

# Define target and features
X = data.drop(['footfall', 'DATE', 'month'], axis=1)
y = data['footfall']

# Splitting data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline for Gradient Boosting (seeded so the search is reproducible)
gradient_boosting_pipeline = Pipeline([
    ('preprocessor', transformer),
    ('model', GradientBoostingRegressor(random_state=42))
])

# Hyperparameter grid for GridSearchCV; the 'model__' prefix routes each
# parameter to the pipeline step named 'model' (3 x 3 x 3 = 27 candidates)
param_grid = {
    'model__learning_rate': [0.1, 0.05, 0.01],
    'model__n_estimators': [100, 200, 300],
    'model__max_depth': [3, 4, 5]
}

# Perform GridSearchCV for hyperparameter tuning (5-fold, all cores).
# No `scoring` is given, so candidates are ranked by the pipeline's own
# score() method.
grid_search = GridSearchCV(gradient_boosting_pipeline, param_grid, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

# Get the best parameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Extract feature importances from the best model
feature_importances = best_model.named_steps['model'].feature_importances_



# Aggregate the tuned model's importances per original input column.
# This is recomputed from `best_model` here; reusing an `aggregated_importances`
# dict from an earlier cell would plot a different model's importances.
fitted_one_hot = best_model.named_steps['preprocessor'].named_transformers_['one_hot']

# The ColumnTransformer emits the one-hot columns first (one per fitted
# category), followed by the passthrough columns in their original order.
importance_groups = []
for column, categories in zip(categorical_features, fitted_one_hot.categories_):
    importance_groups.extend([column] * len(categories))
importance_groups.extend(X.drop(categorical_features, axis=1).columns)
assert len(importance_groups) == len(feature_importances), "feature/importance length mismatch"

aggregated_importances = {}
for key, importance in zip(importance_groups, feature_importances):
    aggregated_importances[key] = aggregated_importances.get(key, 0) + importance

# Normalise so the aggregated importances sum to 1 (skipped if all are zero)
total_importance = sum(aggregated_importances.values())
if total_importance > 0:
    for key in aggregated_importances:
        aggregated_importances[key] /= total_importance

# Visualization (Plotting adjusted aggregated feature importances)
plt.figure(figsize=(10, 6))
plt.barh(list(aggregated_importances.keys()), list(aggregated_importances.values()))
plt.xlabel('Adjusted Aggregated Feature Importance')
plt.ylabel('Feature Categories')
plt.title('Adjusted Aggregated Feature Importances in Gradient Boosting Model')

plt.gca().invert_yaxis()
plt.show()

# Display the best parameters found by GridSearchCV
print("Best Parameters:", best_params)

# `best_model` is grid_search.best_estimator_. With GridSearchCV's default
# refit=True it has already been refit on the whole training split using the
# best parameters, so it is used as-is instead of being fitted a second time.
y_pred = best_model.predict(X_test)

# Create a DataFrame for comparison (indexed like y_test)
comparison_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})

# Plotting predicted vs actual values
plt.figure(figsize=(10, 6))
plt.scatter(comparison_df['Actual'], comparison_df['Predicted'], alpha=0.6)

# Reference line y = x, drawn once across the observed range of actual values
actual_range = [comparison_df['Actual'].min(), comparison_df['Actual'].max()]
plt.plot(actual_range, actual_range, color='red', label='Actual = Predicted')

plt.xlabel('Actual Footfall')
plt.ylabel('Predicted Footfall')
plt.title('Actual vs Predicted Footfall')
plt.legend()
plt.show()




In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import PoissonRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error
import numpy as np
import matplotlib.pyplot as plt

# Load the dataset
data = pd.read_csv('footfall_735.csv')

# Preprocessing
categorical_features = ['day', 'meal_type']


def build_preprocessor():
    """Return a new, unfitted one-hot/passthrough ColumnTransformer.

    Each pipeline gets its own instance. A Pipeline fits its steps in place,
    so sharing one transformer object between two pipelines would let the
    second fit overwrite the preprocessing state of the first.
    """
    encoder = OneHotEncoder(handle_unknown='ignore')
    return ColumnTransformer([("one_hot", encoder, categorical_features)], remainder="passthrough")


# Define target and features
X = data.drop(['footfall', 'DATE', 'month'], axis=1)
y = data['footfall']

# Splitting data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline for Gradient Boosting (seeded for reproducible MAE)
transformer = build_preprocessor()
one_hot = transformer.transformers[0][1]
gradient_boosting_pipeline = Pipeline([
    ('preprocessor', transformer),
    ('model', GradientBoostingRegressor(random_state=42))
])

# Train the Gradient Boosting model
gradient_boosting_pipeline.fit(X_train, y_train)

# Predict footfall using Gradient Boosting
gb_predictions = gradient_boosting_pipeline.predict(X_test)

# Train Poisson Regression on the same data, with its own preprocessor instance
poisson_pipeline = Pipeline([
    ('preprocessor', build_preprocessor()),
    ('model', PoissonRegressor(max_iter=1000))
])

poisson_pipeline.fit(X_train, y_train)

# Predict footfall using Poisson Regression
poisson_predictions = poisson_pipeline.predict(X_test)

# Combine predictions: unweighted mean of the two models
combined_predictions = (gb_predictions + poisson_predictions) / 2

# Calculate MAE for both models and combined predictions
gb_mae = mean_absolute_error(y_test, gb_predictions)
poisson_mae = mean_absolute_error(y_test, poisson_predictions)
combined_mae = mean_absolute_error(y_test, combined_predictions)

print(f"Gradient Boosting MAE: {gb_mae}")
print(f"Poisson Regression MAE: {poisson_mae}")
print(f"Combined Model MAE: {combined_mae}")

# Overlay the actual test-set footfall with each model's predictions,
# plotted against the position of the sample within the test split.
prediction_series = [
    (y_test.values, 'Actual Footfall'),
    (gb_predictions, 'Gradient Boosting Predictions'),
    (poisson_predictions, 'Poisson Predictions'),
    (combined_predictions, 'Combined Predictions'),
]

fig, ax = plt.subplots(figsize=(12, 6))
for series_values, series_label in prediction_series:
    ax.plot(series_values, label=series_label, alpha=0.7)
ax.legend()
ax.set_title('Comparison of Footfall Predictions')
ax.set_xlabel('Test Samples')
ax.set_ylabel('Footfall')
plt.show()


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import matplotlib.pyplot as plt



# Prepare the data for the model.
# `df` is the frame read from 'footfall_735.csv' in the notebook's first cell.
X = df.drop(['footfall', 'DATE'], axis=1)
y = df['footfall']

# Columns to one-hot encode. Declared in this cell so the result does not depend
# on which earlier cell happened to assign `categorical_features` last. 'month'
# is still present in X here, so it is encoded as well, matching the
# model-comparison cell that uses the same feature set.
categorical_features = ['day', 'month', 'meal_type']

# One-hot encode the categorical variables into a dense array
one_hot = OneHotEncoder(handle_unknown='ignore')
X_encoded = one_hot.fit_transform(X[categorical_features]).toarray()
# Wrap the encoded array in a DataFrame with the encoder's generated column names
X_encoded = pd.DataFrame(X_encoded, columns=one_hot.get_feature_names_out(categorical_features))
# Reset the index of the remaining columns so they align row-for-row with the
# freshly created (0..n-1 indexed) encoded frame when concatenated
X = X.drop(categorical_features, axis=1).reset_index(drop=True)
X_encoded = pd.concat([X_encoded, X], axis=1)

# Split the data into training and testing sets.
# NOTE(review): this cell holds out 30% while the other cells in this notebook
# hold out 20%, so the resulting MAE is not measured on the same test rows.
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.3, random_state=42)

# Feed-forward regressor: two ReLU hidden layers (64 and 32 units) followed by
# a single linear output unit. The input width is the number of training columns.
n_input_features = X_train.shape[1]
network_layers = [
    Dense(64, activation='relu', input_shape=(n_input_features,)),
    Dense(32, activation='relu'),
    Dense(1),
]
model = Sequential(network_layers)

# Optimise mean absolute error directly with Adam
model.compile(loss='mean_absolute_error', optimizer='adam')

# Fit for 30 epochs in batches of 32; 20% of the training rows are held back
# by Keras for the per-epoch validation loss
fit_options = {'epochs': 30, 'batch_size': 32, 'validation_split': 0.2}
history = model.fit(X_train, y_train, **fit_options)

# Draw the per-epoch loss recorded by Keras for the training and validation data
loss_curves = (
    ('loss', 'Train Loss'),
    ('val_loss', 'Validation Loss'),
)
for history_key, curve_label in loss_curves:
    plt.plot(history.history[history_key], label=curve_label)

plt.title('Model Loss During Training')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# Score the trained network on the held-out rows. flatten() collapses the
# prediction array to one dimension so it can sit beside y_test as a column.
raw_predictions = model.predict(X_test)
y_pred = raw_predictions.flatten()

# Side-by-side table of actual and predicted footfall, indexed like y_test
comparison_df = pd.DataFrame({'Actual': y_test}).assign(Predicted=y_pred)

# Mean absolute error computed directly from the comparison table
absolute_errors = (comparison_df['Actual'] - comparison_df['Predicted']).abs()
mae = absolute_errors.mean()
print("Mean Absolute Error (MAE):", mae)

# Scatter of predicted against actual footfall; the red line marks where a
# perfect prediction would fall (predicted == actual)
actual_values = comparison_df['Actual']
predicted_values = comparison_df['Predicted']

fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(actual_values, predicted_values, alpha=0.6)
ax.plot(actual_values, actual_values, color='red')
ax.set_xlabel('Actual Footfall')
ax.set_ylabel('Predicted Footfall')
ax.set_title('Actual vs Predicted Footfall')
plt.show()

from sklearn.metrics import r2_score

# Show the first rows of the comparison table for a quick sanity check
print(comparison_df.head())

# Coefficient of determination of the predictions against the actual values
r_squared = r2_score(
    y_true=comparison_df['Actual'],
    y_pred=comparison_df['Predicted'],
)
print("R-squared:", r_squared)


