<a href="https://colab.research.google.com/github/vijeta-redhu/ML-Projects/blob/main/Sales_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

In [None]:
# Read the raw sales CSV (path is relative to the notebook's working directory)
DATA_FILE = 'bigmart_sales_data.csv'
sales_data = pd.read_csv(DATA_FILE)

In [None]:
# Preview the data: first rows, then column dtypes / non-null counts
print(sales_data.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
sales_data.info()

In [None]:
# Data Cleaning
# Forward-fill missing values: each NaN takes the most recent non-missing value
# above it in the same column. `fillna(method='ffill')` is deprecated in
# pandas >= 2.1, so the dedicated `ffill` method is used instead.
# NOTE: NaNs in leading rows (no earlier value to copy) remain unfilled.
sales_data.ffill(inplace=True)

In [None]:
# Feature Engineering
# Parse the Date column once, then derive calendar parts from the parsed series
parsed_dates = pd.to_datetime(sales_data['Date'])
sales_data['Date'] = parsed_dates
sales_data['Year'] = parsed_dates.dt.year
sales_data['Month'] = parsed_dates.dt.month

In [None]:
# Remove the raw Date column from the frame (in place)
sales_data.drop(columns=['Date'], inplace=True)

# Separate the regression target ('Sales') from the predictor columns
y = sales_data['Sales']
X = sales_data.drop(columns=['Sales'])

In [None]:
# Preprocessing Pipeline
# Columns are routed by type: numeric ones are standardized,
# categorical ones are one-hot encoded.
numeric_features = ['Price', 'Year', 'Month']
categorical_features = [
    'Product_ID',
    'Outlet_ID',
    'Product_Category',
    'Outlet_Location',
    'Store_Size',
    'Store_Type',
]

numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# handle_unknown='ignore': categories not seen during fit do not raise at transform time
categorical_transformer = Pipeline(
    steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]
)

# (name, transformer, columns) triples consumed by the ColumnTransformer
column_routes = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [None]:
# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# XGBoost regressor with the library's default hyperparameters
model = XGBRegressor()

# Chain preprocessing and the estimator into a single fit/predict object
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model),
])

# Fit the preprocessing steps and the model on the training split
pipeline.fit(X_train, y_train)

In [None]:
# Predict the target for the held-out rows
y_pred = pipeline.predict(X_test)

# Score the predictions against the true values: MAE, MSE and R^2, in that order
mae, mse, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

In [None]:
# Report the three evaluation metrics, one per line
print(
    f"Mean Absolute Error: {mae}",
    f"Mean Squared Error: {mse}",
    f"R^2 Score: {r2}",
    sep="\n",
)

In [None]:
# Feature Importance (for XGBoost)
# One importance value per column of the matrix the model was trained on.
importances = pipeline.named_steps['model'].feature_importances_
# Take the names from the whole fitted ColumnTransformer, not only the one-hot
# step: the model is fed the numeric columns followed by the encoded
# categoricals, so the encoder's names alone would be shorter than
# `importances` and could not be paired with it in a DataFrame.
# Names are prefixed with the transformer name (e.g. 'num__Price').
features = pipeline.named_steps['preprocessor'].get_feature_names_out()

In [None]:
# Pair each feature name with its importance, most important first
importance_df = (
    pd.DataFrame({'Feature': features, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)

In [None]:
# Horizontal bar chart of importances, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(data=importance_df, x='Importance', y='Feature', ax=ax)
ax.set_title('Feature Importances')
plt.show()