In [6]:

import pandas as pd

# Load the dataset and drop the attributes that are not used downstream.
# The drop is chained onto the load instead of mutating with inplace=True,
# so `df` is produced by a single expression.
df = pd.read_csv('sales.csv').drop(columns=['Row ID', 'Product Name'])

# Check for missing values. A notebook cell only renders its final
# expression, so the per-column null counts have to be printed explicitly;
# as a bare expression in the middle of the cell they were silently discarded.
print(df.isnull().sum())

# Summary statistics of the numeric columns, used to eyeball outliers
# (rendered as the cell output because it is the last expression).
df.describe()


Unnamed: 0,Sales
count,9800.0
mean,230.769059
std,626.651875
min,0.444
25%,17.248
50%,54.49
75%,210.605
max,22638.48


In [7]:
# Parse the order dates; the source strings are day/month/year.
df['Order Date'] = pd.to_datetime(df['Order Date'], format='%d/%m/%Y')

# Derive calendar features from the parsed date, added in the order
# Month, Quarter, Year.
df['Month'], df['Quarter'], df['Year'] = (
    df['Order Date'].dt.month,
    df['Order Date'].dt.quarter,
    df['Order Date'].dt.year,
)

# Integer-encode the categorical attributes. pd.factorize returns
# (codes, uniques); only the codes are kept, so the code -> label mapping
# is not retained anywhere.
df['State Code'], df['Category Code'], df['Sub-Category Code'] = (
    pd.factorize(df[column])[0]
    for column in ('State', 'Category', 'Sub-Category')
)

# Remove the raw columns now that their encoded/derived versions exist.
df.drop(columns=['Order Date', 'State', 'Category', 'Sub-Category'], inplace=True)


In [8]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; every column except 'Sales' is a
# feature and 'Sales' is the target. The fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['Sales']),
    df['Sales'],
    random_state=42,
    test_size=0.3,
)


In [9]:
! pip install -U scikit-learn

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Instantiate the regression models. The estimators that use randomness
# (forest, boosting, neural network) are seeded with the same value as the
# train/test split so that the metrics printed below, and every later
# prediction made with `gb`, are reproducible from a fresh kernel.
lr = LinearRegression()
rf = RandomForestRegressor(random_state=42)
gb = GradientBoostingRegressor(random_state=42)
mlp = MLPRegressor(random_state=42)

# Train each model on the training split and report its error on the
# held-out test split.
for model in [lr, rf, gb, mlp]:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)  # RMSE is in the same units as Sales
    r2 = r2_score(y_test, y_pred)
    print(f'{model.__class__.__name__}: RMSE={rmse:.2f}, R2={r2:.2f}')


LinearRegression: RMSE=782.43, R2=-0.00
RandomForestRegressor: RMSE=757.33, R2=0.06
GradientBoostingRegressor: RMSE=685.80, R2=0.23
MLPRegressor: RMSE=784.17, R2=-0.01


In [10]:
# Build a one-row feature frame for the prediction. Column order matches the
# order in which the features were added to `df`.
# NOTE(review): the three *Code values are integer codes produced by
# pd.factorize earlier in the notebook; which state / category / sub-category
# they stand for is not recoverable from this cell -- confirm before relying
# on the result.
new_input = pd.DataFrame(
    [[12, 4, 2022, 3, 1, 2]],
    columns=['Month', 'Quarter', 'Year', 'State Code', 'Category Code', 'Sub-Category Code'],
)

# Score the row with the gradient boosting model; predict returns one value
# per input row, so the single prediction is element 0.
future_sales = gb.predict(new_input)

print(f"Predicted sales for the future date: ${future_sales[0]:.2f}")


Predicted sales for the future date: $53.52


In [11]:
# Collect the distinct encoded values of each categorical feature as plain
# Python lists, in order of first appearance in `df`.
states, categories, sub_categories = (
    df[column].unique().tolist()
    for column in ('State Code', 'Category Code', 'Sub-Category Code')
)

# Show the available codes for each feature.
print("List of unique values:")
print("State Code:", states)
print("Category Code:", categories)
print("Sub-Category Code:", sub_categories)


List of unique values:
State Code: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48]
Category Code: [0, 1, 2]
Sub-Category Code: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]


In [12]:
# Months to score: January (1) through December (12).
months = range(1, 13)

# Accumulates (month, predicted sales) pairs.
sales_predictions = []

for month in months:
    # One-row feature frame for this month. The quarter is derived from the
    # month with integer arithmetic (months 1-3 -> 1, 4-6 -> 2, ...); year and
    # the three encoded categorical features are held fixed.
    new_input = pd.DataFrame(
        [[month, (month - 1) // 3 + 1, 2022, 3, 1, 5]],
        columns=['Month', 'Quarter', 'Year', 'State Code', 'Category Code', 'Sub-Category Code'],
    )

    # Score the row with the gradient boosting model.
    future_sales = gb.predict(new_input)

    # Keep the single predicted value alongside its month.
    sales_predictions.append((month, future_sales[0]))

# Report the prediction for every month of 2022.
for prediction in sales_predictions:
    print(f"Predicted sales for {prediction[0]}/2022: ${prediction[1]:.2f}")


Predicted sales for 1/2022: $86.55
Predicted sales for 2/2022: $61.99
Predicted sales for 3/2022: $61.99
Predicted sales for 4/2022: $59.96
Predicted sales for 5/2022: $59.96
Predicted sales for 6/2022: $59.96
Predicted sales for 7/2022: $59.96
Predicted sales for 8/2022: $74.51
Predicted sales for 9/2022: $74.51
Predicted sales for 10/2022: $74.51
Predicted sales for 11/2022: $74.51
Predicted sales for 12/2022: $74.51


In [13]:
import pickle

# Serialize the trained gradient boosting model to disk with pickle.
# The file is opened in binary write mode and closed automatically by `with`.
with open('sales_prediction_model.pkl', mode='wb') as model_file:
    pickle.dump(gb, model_file)


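# Disabled alternative: save the estimator bound to `model` under a generic
# file name. Note that `model` is the loop variable of the training loop, so
# after that loop it refers to the last estimator trained (the MLPRegressor),
# not to `gb`.
# with open('model.pkl', 'wb') as file:
#     pickle.dump(model, file)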
