In [1]:
import pandas as pd

# Import and read the walmart.csv file
application_df = pd.read_csv("walmart.csv")

# Drop irrelevant columns (Store and Date)
application_df.drop(columns=['Store', 'Date'], inplace=True)

# Convert 'Holiday_Flag' to a binary target variable (1 if it's a holiday, 0 if it's not)
application_df['Is_Holiday'] = application_df['Holiday_Flag']

# Drop the 'Holiday_Flag' column since we have extracted the target variable
application_df.drop(columns=['Holiday_Flag'], inplace=True)

In [2]:
from sklearn.model_selection import train_test_split

# Separate the features and the target variable
X = application_df.drop(columns=['Is_Holiday'])
y = application_df['Is_Holiday']

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [3]:
from sklearn.linear_model import LogisticRegression

# Create and train the logistic regression model
logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)

In [4]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Make predictions on the test set
y_pred = logistic_model.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(classification_rep)

Accuracy: 0.9191919191919192
Confusion Matrix:
[[1183    0]
 [ 104    0]]
Classification Report:
              precision    recall  f1-score   support

           0       0.92      1.00      0.96      1183
           1       0.00      0.00      0.00       104

    accuracy                           0.92      1287
   macro avg       0.46      0.50      0.48      1287
weighted avg       0.84      0.92      0.88      1287



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
#The logistic regression model was trained and evaluated on the Walmart sales data, with the target variable being whether a day is a holiday or not (binary classification: 1 if it's a holiday, 0 if it's not). The results of the evaluation are as follows:
##Accuracy: 0.9192
#The accuracy of the model is approximately 91.92%, which means that it correctly predicts whether a day is a holiday or not for about 91.92% of the test samples.
#Confusion Matrix: [[1183    0] [ 104    0]]
#The confusion matrix shows the counts of true positives (TP), true negatives (TN), false positives (FP), and false negatives (FN). In this case, the model correctly predicted 1183 non-holiday days (class 0) and 0 holiday days (class 1) as true negatives. However, it incorrectly predicted 104 holiday days as non-holiday (false negatives).
#The model performs well in predicting non-holiday days (class 0) but fails to predict any holiday days (class 1). The low recall and F1-score for class 1 indicate that the model has difficulties identifying holiday days correctly, possibly due to the class imbalance and lack of sufficient information to distinguish holiday days from non-holiday days.

In [None]:
# More robust and useful predictive model.
#To build a more useful predictive model for imbalanced datasets, we used techniques to address the class imbalance issue and optimize the model's hyperparameters.
#We used the SMOTE technique to oversample the minority class (successful instances) to balance the dataset.
#Hyperparameter tuning and feature scaling can significantly impact model performance, so experimenting with different combinations can lead to better results

In [None]:
# We used GridSearchCV which is a technique that performs an exhaustive search over a specified parameter grid to find the best hyperparameter values for a given model. It systematically trains and evaluates the model with all possible combinations of hyperparameters provided in the grid and selects the combination that yields the best performance according to a specified evaluation metric (e.g., accuracy, F1-score, etc.).

In [13]:

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE

In [6]:
# Load the Walmart sales data
application_df = pd.read_csv("walmart.csv")

In [7]:
# Convert 'Date' column to datetime type if it's not already
application_df['Date'] = pd.to_datetime(application_df['Date'], dayfirst=True)  # Specify dayfirst=True for DD/MM/YYYY format


In [10]:
# Feature Engineering - Create additional features if applicable
# For example, you can calculate lag features for 'Weekly_Sales' to capture the previous week's sales.
application_df['Previous_Week_Sales'] = application_df['Weekly_Sales'].shift(1)


In [14]:
# Prepare the data
X = application_df[['Holiday_Flag', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'Previous_Week_Sales']]
threshold_value = 1500000  # Set the threshold value to separate the classes
y = (application_df['Weekly_Sales'] > threshold_value).astype(int)

In [15]:
# Handle missing values (NaN) using SimpleImputer
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

In [16]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [17]:
# Handle class imbalance using SMOTE (Synthetic Minority Over-sampling Technique)
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)


In [18]:
# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

In [20]:
# Hyperparameter Tuning - Experiment with different hyperparameters
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt']
}
model = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train_scaled, y_train_resampled)
best_model = grid_search.best_estimator_

Fitting 5 folds for each of 216 candidates, totalling 1080 fits


  warn(


In [21]:
# Make predictions on the test set using the best model
y_pred = best_model.predict(X_test_scaled)

In [22]:
# Calculate the accuracy to evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.9393939393939394


In [23]:
# Generate the confusion matrix and classification report
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)


Confusion Matrix:
[[959  37]
 [ 41 250]]


In [24]:
class_report = classification_report(y_test, y_pred)
print("Classification Report:")
print(class_report)

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.96      0.96       996
           1       0.87      0.86      0.87       291

    accuracy                           0.94      1287
   macro avg       0.92      0.91      0.91      1287
weighted avg       0.94      0.94      0.94      1287



In [None]:
#The classification report suggests that the model performs well in predicting both classes, with good precision and recall values.
#The feature importance analysis shows the relative importance of each feature in making predictions.
#'Unemployment' is the most important feature, with a high importance value of 0.747. It plays a crucial role in determining whether the weekly sales will exceed the threshold or not.
# Features like 'Fuel_Price', 'CPI', and 'Temperature' also contribute to the model's decision-making, though to a lesser extent.
#'Weekly_Sales' itself has a very low importance value, indicating that it may not be a strong predictor for classifying the sales into the two categories.

In [25]:
# Visualize the feature importances
feature_importance = best_model.feature_importances_
print("Feature Importance:")
print(list(zip(application_df.columns[2:], feature_importance)))

Feature Importance:
[('Weekly_Sales', 0.020817000474654193), ('Holiday_Flag', 0.042408509054831046), ('Temperature', 0.045910081775601845), ('Fuel_Price', 0.07322830381295282), ('CPI', 0.07105031352314232), ('Unemployment', 0.7465857913588178)]
