In [1]:
# The sport giant has been ahead of the pack for years.

# But beyond its impressive marketing strategies and its strong digital presence, 

# how were sales in the USA over a period between 2020 & 2021?

# we are going to find out!

In [4]:
# Importing the libraries

import pandas as pd
import pandas_profiling
from pandas_profiling import ProfileReport
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import plotly
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [5]:
# Importing the dataset

df = pd.read_csv('Adidas.csv')

# Visualising the dataset first Five Rows

df.head()

Unnamed: 0,retailer,retailer_id,invoice_date,region,state,city,product,price_per_unit,units_sold,total_sales,operating_profit,operating_margin,sales_method
0,Foot Locker,1185732,2020-01-01,Northeast,New York,New York,Men's Street Footwear,50.0,1200,600000.0,300000.0,0.5,In-store
1,Foot Locker,1185732,2020-01-02,Northeast,New York,New York,Men's Athletic Footwear,50.0,1000,500000.0,150000.0,0.3,In-store
2,Foot Locker,1185732,2020-01-03,Northeast,New York,New York,Women's Street Footwear,40.0,1000,400000.0,140000.0,0.35,In-store
3,Foot Locker,1185732,2020-01-04,Northeast,New York,New York,Women's Athletic Footwear,45.0,850,382500.0,133875.0,0.35,In-store
4,Foot Locker,1185732,2020-01-05,Northeast,New York,New York,Men's Apparel,60.0,900,540000.0,162000.0,0.3,In-store


In [6]:
# Visualising the dataset last Five Rows

df.tail()

Unnamed: 0,retailer,retailer_id,invoice_date,region,state,city,product,price_per_unit,units_sold,total_sales,operating_profit,operating_margin,sales_method
9643,Foot Locker,1185732,2021-01-24,Northeast,New Hampshire,Manchester,Men's Apparel,50.0,64,3200.0,896.0,0.28,Outlet
9644,Foot Locker,1185732,2021-01-24,Northeast,New Hampshire,Manchester,Women's Apparel,41.0,105,4305.0,1377.6,0.32,Outlet
9645,Foot Locker,1185732,2021-02-22,Northeast,New Hampshire,Manchester,Men's Street Footwear,41.0,184,7544.0,2791.28,0.37,Outlet
9646,Foot Locker,1185732,2021-02-22,Northeast,New Hampshire,Manchester,Men's Athletic Footwear,42.0,70,2940.0,1234.8,0.42,Outlet
9647,Foot Locker,1185732,2021-02-22,Northeast,New Hampshire,Manchester,Women's Street Footwear,29.0,83,2407.0,649.89,0.27,Outlet


In [5]:
# Visualising the dataset shape

df.shape

(9648, 13)

In [6]:
# Visualising the dataset columns

df.columns

Index(['retailer', 'retailer_id', 'invoice_date', 'region', 'state', 'city',
       'product', 'price_per_unit', 'units_sold', 'total_sales',
       'operating_profit', 'operating_margin', 'sales_method'],
      dtype='object')

In [7]:
# Visualising the dataset info

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9648 entries, 0 to 9647
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   retailer          9648 non-null   object 
 1   retailer_id       9648 non-null   int64  
 2   invoice_date      9648 non-null   object 
 3   region            9648 non-null   object 
 4   state             9648 non-null   object 
 5   city              9648 non-null   object 
 6   product           9648 non-null   object 
 7   price_per_unit    9648 non-null   float64
 8   units_sold        9648 non-null   int64  
 9   total_sales       9648 non-null   float64
 10  operating_profit  9648 non-null   float64
 11  operating_margin  9648 non-null   float64
 12  sales_method      9648 non-null   object 
dtypes: float64(4), int64(2), object(7)
memory usage: 980.0+ KB


In [7]:
# converting the date column to datetime

df['invoice_date'] = pd.to_datetime(df['invoice_date'])

In [9]:
# Visualising the dataset info

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9648 entries, 0 to 9647
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   retailer          9648 non-null   object        
 1   retailer_id       9648 non-null   int64         
 2   invoice_date      9648 non-null   datetime64[ns]
 3   region            9648 non-null   object        
 4   state             9648 non-null   object        
 5   city              9648 non-null   object        
 6   product           9648 non-null   object        
 7   price_per_unit    9648 non-null   float64       
 8   units_sold        9648 non-null   int64         
 9   total_sales       9648 non-null   float64       
 10  operating_profit  9648 non-null   float64       
 11  operating_margin  9648 non-null   float64       
 12  sales_method      9648 non-null   object        
dtypes: datetime64[ns](1), float64(4), int64(2), object(6)
memory usage: 980.0+ KB


In [8]:
# I do not need to represent fractions of a unit, and the price per unit is always a whole number.

# In this case Ican use an int64 data type.

# This will save you me storage space, as int64 values take up less space than float64 values.

# converting the price_per_unit column to int

df['price_per_unit'] = df['price_per_unit'].astype('int64')

In [9]:
# converting the total_sales column to int

df['total_sales'] = df['total_sales'].astype('int64')

In [11]:
# converting the operating_profit column to int

df['operating_profit'] = df['operating_profit'].astype('int64')

In [10]:
# converting the operating_margin column to int

df['operating_margin'] = df['operating_margin'].astype('int64')

In [12]:
# All this changes will be significant when making our Machine Learning Model later.

# Visualising the dataset info

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9648 entries, 0 to 9647
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   retailer          9648 non-null   object        
 1   retailer_id       9648 non-null   int64         
 2   invoice_date      9648 non-null   datetime64[ns]
 3   region            9648 non-null   object        
 4   state             9648 non-null   object        
 5   city              9648 non-null   object        
 6   product           9648 non-null   object        
 7   price_per_unit    9648 non-null   int64         
 8   units_sold        9648 non-null   int64         
 9   total_sales       9648 non-null   int64         
 10  operating_profit  9648 non-null   int64         
 11  operating_margin  9648 non-null   int64         
 12  sales_method      9648 non-null   object        
dtypes: datetime64[ns](1), int64(6), object(6)
memory usage: 980.0+ KB


In [13]:
# checking for missing values

df.isnull().sum()

retailer            0
retailer_id         0
invoice_date        0
region              0
state               0
city                0
product             0
price_per_unit      0
units_sold          0
total_sales         0
operating_profit    0
operating_margin    0
sales_method        0
dtype: int64

In [16]:
# There are no missing values in the dataset.

# EXPLORATORY ANALYSIS

# This is to further understand the Dataset

In [27]:
# run a pandas profiling review. 

# This will give us a quick overview of the data and help us to identify any issues.

# profile = ProfileReport(df, title='Pandas Profiling Report', explorative=True)

# profile.to_widgets()

# profile.to_file("Adidas.html")

# check the html file for better understanding

In [17]:
# From the report we can see we have a high cardinality

# in the product_name column, and a high correlation between the total_sales and the operating_profit columns.

# We can also see that the dataset is imbalanced.

# We have more data for the year 2020 than for the year 2021.

# "cardinality" refers to the number of unique values in a categorical feature. 

# A categorical feature with a high cardinality means that it has a large number of unique values. 

# This can be a problem in some cases because it can make it difficult to analyze the data and can increase the size of the dataset.

In [18]:
# "correlation" refers to the relationship between two variables.

# A correlation of 1 means that the two variables are perfectly correlated.

# A correlation of 0 means that there is no correlation between the two variables.

# A correlation of -1 means that the two variables are perfectly negatively correlated.

# "imbalance" refers to the distribution of the target variable in the dataset.

# If the target variable is imbalanced, it means that the dataset contains more data for one class than for the other.

# In this case, we have more data for the year 2020 than for the year 2021.

# Visualising the dataset describe

df.describe()

Unnamed: 0,retailer_id,price_per_unit,units_sold,total_sales,operating_profit,operating_margin
count,9648.0,9648.0,9648.0,9648.0,9648.0,9648.0
mean,1173850.0,45.184287,256.930037,93273.415216,34424.934183,0.0
std,26360.38,14.694704,214.25203,141916.006699,54193.232985,0.0
min,1128299.0,7.0,0.0,0.0,0.0,0.0
25%,1185732.0,35.0,106.0,4254.5,1921.0,0.0
50%,1185732.0,45.0,176.0,9576.0,4371.0,0.0
75%,1185732.0,55.0,350.0,150000.0,52062.0,0.0
max,1197831.0,110.0,1275.0,825000.0,390000.0,0.0


In [19]:
# Visualising the dataset describe

df.describe(include='object')

Unnamed: 0,retailer,region,state,city,product,sales_method
count,9648,9648,9648,9648,9648,9648
unique,6,5,50,52,6,3
top,Foot Locker,West,California,Portland,Men's Street Footwear,Online
freq,2637,2448,432,360,1610,4889


# VISUALISATIONS

Visualization is an important step in data analysis because it allows us to explore and understand data in a more intuitive and interactive way. 

By exploring this data visually, we can quickly identify areas that need more attention and focus our efforts accordingly.

In [20]:
# Most sales by Region

# creating a plotly figure

fig = go.Figure()

# adding a bar chart to the figure

fig.add_trace(go.Bar(x=df['region'], y=df['total_sales'], name='Total Sales'))

# updating the layout

fig.update_layout(title='Total Sales by Region', xaxis_title='Region', yaxis_title='Total Sales')

# displaying the figure


fig.update_traces(marker_color='rgb(158,202,225)', marker_line_color='rgb(8,48,107)', marker_line_width=1.5, opacity=0.6)

fig.show()

In [21]:
# most sales by state

# creating a plotly figure

fig = go.Figure()

# adding a bar chart to the figure

fig.add_trace(go.Bar(x=df['state'], y=df['total_sales'], name='Total Sales'))

# updating the layout

fig.update_layout(title='Total Sales by State', xaxis_title='State', yaxis_title='Total Sales')

# displaying the figure

fig.update_traces(marker_color='rgb(158,202,225)', marker_line_color='rgb(8,48,107)', marker_line_width=1.5, opacity=0.6)

fig.update_layout(plot_bgcolor='rgb(255,255,255)')

fig.show()

Most Sales were made in the state of New York followed closely with the state of Carlifornia.

The state which had the least amount of sales was Nebraska.

In [27]:
# most sales by product

# creating a plotly figure

fig = go.Figure()

# adding a bar chart to the figure

fig.add_trace(go.Bar(x=df['product'], y=df['total_sales'], name='Total Sales'))

# updating the layout

fig.update_layout(title='Total Sales by Product', xaxis_title='product', yaxis_title='Total Sales')

# displaying the figure

fig.update_traces(marker_color='rgb(158,202,225)', marker_line_color='rgb(8,48,107)', marker_line_width=1.5, opacity=0.6)

fig.show()

Men's street Footwear was the product that made most sales while the least sales can be seen in Women's Athletic Footwear.

In [28]:
# most sales by retailer

# creating a plotly figure

fig = go.Figure()

# adding a bar chart to the figure

fig.add_trace(go.Bar(x=df['retailer'], y=df['total_sales'], name='Total Sales'))

# updating the layout

fig.update_layout(title='Most Sales By Retailer', xaxis_title='retailer', yaxis_title='Total Sales')

# displaying the figure 

fig.update_traces(marker_color='rgb(158,202,225)', marker_line_color='rgb(8,48,107)', marker_line_width=1.5, opacity=0.6)

fig.show()

The retailer that made most sales was West Gear while least sales were made at Walmart.

In [29]:
# plotting a pie chart showing the customer preffered form of purchase

import plotly.graph_objects as go

fig = go.Figure(data=[go.Pie(labels=df['sales_method'], values=df['total_sales'])])

fig.update_traces(textposition='inside', textinfo='percent+label')

fig.update_layout(title='Top Selling Behaviour', xaxis_title='sales_method', yaxis_title='Total Sales')

fig.show()

Top selling behaviour by Retailers was In-store.

Most customers can be seen to preffer carrying out their purchases In-Stores.

In [30]:
# Total Sales Per Retailer Pie Chart

# plotting a pie chart showing the total sales per retailer

fig = go.Figure(data=[go.Pie(labels=df['retailer'], values=df['total_sales'])])

fig.update_traces(textposition='inside', textinfo='percent+label')

fig.update_layout(title='Total Sales Per Retailer', xaxis_title='retailer', yaxis_title='Total Sales')

fig.show()

West Gear was the biggest retailer with a share percentage of 27%

In [31]:
# change date time to year and months

df['year'] = pd.DatetimeIndex(df['invoice_date']).year

df['month'] = pd.DatetimeIndex(df['invoice_date']).month

In [32]:
# What was the best month of sales in 2020

# plotting a bar graph showing the best month of sales in 2020

fig = go.Figure()

fig.add_trace(go.Bar(x=df['month'], y=df['total_sales'], name='Total Sales'))

fig.update_layout(title='Best Month of Sales in 2020', xaxis_title='Month', yaxis_title='Total Sales')

fig.update_traces(marker_color='rgb(158,202,225)', marker_line_color='rgb(8,48,107)', marker_line_width=1.5, opacity=0.6)

fig.show()

In the year 2020, July experienced the most number of sales but there was a steady decline in August.

In [33]:
# What was the best month of sales in 2021

# plotting a bar graph showing the best month of sales in 2021

fig = go.Figure()

fig.add_trace(go.Bar(x=df['month'], y=df['total_sales'], name='Total Sales'))

fig.update_layout(title='Best Month of Sales in 2021', xaxis_title='Month', yaxis_title='Total Sales')

fig.update_traces(marker_color='rgb(158,202,225)', marker_line_color='rgb(8,48,107)', marker_line_width=1.5, opacity=0.6)

fig.show()

Over the span of the 2 years we can say that most sales were made in the month of July(The 7th month of the year).

# MACHINE LEARNING

In [34]:
# Overall, machine learning is a valuable tool for data science that can help you extract useful insights and patterns from your data, 

# automate tasks, and make more accurate predictions and decisions.

# Machine learning is a subset of artificial intelligence (AI) that uses statistical techniques to give computer systems the ability to "learn" 

# (e.g., progressively improve performance on a specific task) with data, without being explicitly programmed.

# Machine learning is a method of data analysis that automates analytical model building.

# It is a branch of artificial intelligence based on the idea that systems can learn from data, 
 
# identify patterns and make decisions with minimal human intervention.

# We’ll focus on converting categorical columns to numeric columns and removing any other extraneous columns.

# We’ll also use the LabelEncoder class from the scikit-learn library to convert the categorical columns to numeric columns.


In [36]:
# We need to handle categorical features before feeding the data into a machine learning algorithm, 

# because the mathematics underlying most machine learning models assumes that the data is numerical and contains no missing values. 

# To reinforce this requirement, scikit-learn will return an error if you try to train a model using data 

# that contains non-numeric values when working with models like linear regression and logistic regression.

In [35]:
# Here’s an outline of what we’ll be doing in this stage:

# -Investigate Categorical Columns
 
# -Convert Categorical Columns To Numeric Features

# -Map Ordinal Values To Integers

# -Encode Nominal Values As Dummy Variables

# -Remove Unnecessary Columns

# -Split The Data Into Training And Test Sets

# -Scale The Data


In [37]:
# We’ll start by investigating the categorical columns in our dataset.

# We’ll use the select_dtypes method to select the categorical columns.

# We’ll then use the value_counts method to display the unique values in each column.

# We’ll also use the nunique method to display the number of unique values in each column.

In [14]:
# Checking Data types Value Counts

print("Data types and their frequency\n{}".format(df.dtypes.value_counts()))

Data types and their frequency
object            6
int64             6
datetime64[ns]    1
dtype: int64


In [67]:
# We have 8 object columns that contain text which need to be converted into numeric features. 

# Let’s select just the object columns using the DataFrame method select_dtype, 

# then display a sample row to get a better sense of how the values in each column are formatted.

# Selecting Object Columns

print("Columns that contain object data type\n{}".format(df.select_dtypes(include=['object']).columns))

Columns that contain object data type
Index(['invoice_date'], dtype='object')


In [15]:
object_columns_df = df.select_dtypes(include=['object'])
print(object_columns_df.iloc[0])

retailer                  Foot Locker
region                      Northeast
state                        New York
city                         New York
product         Men's Street Footwear
sales_method                 In-store
Name: 0, dtype: object


In [16]:
# First, let’s explore the unique value counts of the columns that seem like they contain categorical values:

# -retailer

# -sales_method

# -product

# -region

# -state

# -city

cols = ['retailer', 'sales_method', 'product', 'region', 'state', 'city']
for name in cols:
    print(name,':')
    print(object_columns_df[name].value_counts(),'\n')

retailer :
Foot Locker      2637
West Gear        2374
Sports Direct    2032
Kohl's           1030
Amazon            949
Walmart           626
Name: retailer, dtype: int64 

sales_method :
Online      4889
Outlet      3019
In-store    1740
Name: sales_method, dtype: int64 

product :
Men's Street Footwear        1610
Men's Athletic Footwear      1610
Women's Street Footwear      1608
Women's Apparel              1608
Women's Athletic Footwear    1606
Men's Apparel                1606
Name: product, dtype: int64 

region :
West         2448
Northeast    2376
Midwest      1872
South        1728
Southeast    1224
Name: region, dtype: int64 

state :
California        432
Texas             432
New York          360
Florida           360
Mississippi       216
Oregon            216
Louisiana         216
Idaho             216
New Mexico        216
Georgia           216
Arkansas          216
Virginia          216
Oklahoma          216
Connecticut       216
Rhode Island      216
Massachusetts  

In [17]:
# Most of these columns contain discrete categorical values which we can encode as dummy variables and keep.

# Convert Categorical Columns to Numeric Features

# We’ll use the LabelEncoder class from the scikit-learn library to convert the categorical columns to numeric columns.

# importing the necessary libraries

from sklearn.preprocessing import LabelEncoder

# creating an instance of the LabelEncoder class

le = LabelEncoder()

In [43]:
# We’ll use the LabelEncoder class to convert the categorical columns to numeric columns.

# We’ll use the fit_transform method to convert the values in each column to numeric values.

# We’ll then use the DataFrame method assign to add the new columns to the DataFrame.

# We’ll also use the DataFrame method drop to remove the original columns from the DataFrame.

# We’ll use the DataFrame method head to display the first 5 rows of the DataFrame.

In [18]:
# Converting Categorical Columns to Numeric Features

df['retailer'] = le.fit_transform(df['retailer'])

df['sales_method'] = le.fit_transform(df['sales_method'])

df['product'] = le.fit_transform(df['product'])

df['region'] = le.fit_transform(df['region'])

df['state'] = le.fit_transform(df['state'])

df['city'] = le.fit_transform(df['city'])

In [45]:
# We’ll use the DataFrame method head to display the first 5 rows of the DataFrame.

print(df.head())

   retailer  retailer_id invoice_date  region  state  city  product  \
0         1      1185732   2020-01-01       1     31    35        2   
1         1      1185732   2020-01-02       1     31    35        1   
2         1      1185732   2020-01-03       1     31    35        5   
3         1      1185732   2020-01-04       1     31    35        4   
4         1      1185732   2020-01-05       1     31    35        0   

   price_per_unit  units_sold  total_sales  operating_profit  \
0              50        1200       600000            300000   
1              50        1000       500000            150000   
2              40        1000       400000            140000   
3              45         850       382500            133875   
4              60         900       540000            162000   

   operating_margin  sales_method  year  month  
0                 0             0  2020      1  
1                 0             0  2020      1  
2                 0             0  2020  

In [46]:
# We’ll use the DataFrame method describe to display the summary statistics for each column.

# We’ll also use the DataFrame method info to display the data types for each column.

# Displaying Summary Statistics

print(df.describe())

          retailer   retailer_id       region        state         city  \
count  9648.000000  9.648000e+03  9648.000000  9648.000000  9648.000000   
mean      2.608520  1.173850e+06     2.000000    24.223881    25.768657   
std       1.726698  2.636038e+04     1.471191    14.742644    14.883855   
min       0.000000  1.128299e+06     0.000000     0.000000     0.000000   
25%       1.000000  1.185732e+06     1.000000    10.000000    12.000000   
50%       3.000000  1.185732e+06     2.000000    25.000000    26.000000   
75%       4.000000  1.185732e+06     4.000000    37.000000    39.000000   
max       5.000000  1.197831e+06     4.000000    49.000000    51.000000   

           product  price_per_unit   units_sold    total_sales  \
count  9648.000000     9648.000000  9648.000000    9648.000000   
mean      2.499793       45.184287   256.930037   93273.415216   
std       1.707549       14.694704   214.252030  141916.006699   
min       0.000000        7.000000     0.000000       0.0000

In [47]:
# Displaying Data Types

print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9648 entries, 0 to 9647
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   retailer          9648 non-null   int32         
 1   retailer_id       9648 non-null   int64         
 2   invoice_date      9648 non-null   datetime64[ns]
 3   region            9648 non-null   int32         
 4   state             9648 non-null   int32         
 5   city              9648 non-null   int32         
 6   product           9648 non-null   int32         
 7   price_per_unit    9648 non-null   int64         
 8   units_sold        9648 non-null   int64         
 9   total_sales       9648 non-null   int64         
 10  operating_profit  9648 non-null   int64         
 11  operating_margin  9648 non-null   int64         
 12  sales_method      9648 non-null   int32         
 13  year              9648 non-null   int64         
 14  month             9648 n

In [48]:
# We’ll use the DataFrame method corr to display the correlation matrix for the columns in the DataFrame.

# We’ll also use the DataFrame method corr to display the correlation matrix for the columns in the DataFrame.

# Displaying Correlation Matrix

print(df.corr())

                  retailer  retailer_id    region     state      city  \
retailer          1.000000    -0.258306  0.267482  0.020066  0.074579   
retailer_id      -0.258306     1.000000 -0.565978  0.136125 -0.157449   
region            0.267482    -0.565978  1.000000 -0.158290 -0.016701   
state             0.020066     0.136125 -0.158290  1.000000  0.095303   
city              0.074579    -0.157449 -0.016701  0.095303  1.000000   
product          -0.000256    -0.000869 -0.000165  0.000191  0.000194   
price_per_unit    0.005499    -0.392477  0.211493 -0.078254  0.004039   
units_sold        0.094452    -0.003194  0.163450  0.002320 -0.068049   
total_sales       0.058114    -0.083580  0.121705 -0.013876 -0.032194   
operating_profit  0.047003    -0.027137  0.085980 -0.003930 -0.060216   
operating_margin       NaN          NaN       NaN       NaN       NaN   
sales_method     -0.073660    -0.004176  0.136477 -0.098631 -0.008978   
year             -0.230813     0.176849 -0.156513 -

In [49]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9648 entries, 0 to 9647
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   retailer          9648 non-null   int32         
 1   retailer_id       9648 non-null   int64         
 2   invoice_date      9648 non-null   datetime64[ns]
 3   region            9648 non-null   int32         
 4   state             9648 non-null   int32         
 5   city              9648 non-null   int32         
 6   product           9648 non-null   int32         
 7   price_per_unit    9648 non-null   int64         
 8   units_sold        9648 non-null   int64         
 9   total_sales       9648 non-null   int64         
 10  operating_profit  9648 non-null   int64         
 11  operating_margin  9648 non-null   int64         
 12  sales_method      9648 non-null   int32         
 13  year              9648 non-null   int64         
 14  month             9648 n

In [50]:
# check for null values

df.isnull().sum()

retailer            0
retailer_id         0
invoice_date        0
region              0
state               0
city                0
product             0
price_per_unit      0
units_sold          0
total_sales         0
operating_profit    0
operating_margin    0
sales_method        0
year                0
month               0
dtype: int64

In [19]:
# The data is clean, in a structured format and is ready to be used for machine learning.

# Import the necessary libraries

from sklearn.linear_model import LinearRegression

from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error

from sklearn.metrics import r2_score


In [20]:
# Select the features and the target variable

X = df[['retailer', 'region', 'state', 'city', 'product', 'price_per_unit', 'units_sold', 'sales_method']]
y = df['total_sales']

# Split the data into a training set and a test set

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Linear Regression model

model = LinearRegression()

# Train the model on the training data

model.fit(X_train, y_train)

# Make predictions on the test data

y_pred = model.predict(X_test)

# Evaluate the model performance

print(f'R2 Score: {model.score(X_test, y_test):.2f}')

R2 Score: 0.88


1. Fine Tuning Our Machine Learning Model

In [21]:
# This code will use grid search to fine-tune the linear regression model by trying different combinations of the fit_intercept and normalize hyperparameters. 

# The model with the best performance on the training data will be selected as the best model and used to make predictions on the test data.

In [22]:
# importing the necessary libraries

from sklearn.model_selection import train_test_split, GridSearchCV

# Select the features and the target variable

X = df[['retailer', 'region', 'state', 'city', 'product', 'price_per_unit', 'units_sold', 'sales_method']]
y = df['total_sales']

# Split the data into a training set and a test set

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Linear Regression model

model = LinearRegression()

# Define the hyperparameter grid

param_grid = {'fit_intercept': [True, False],
              'normalize': [True, False]}

# Create a GridSearchCV object

grid_search = GridSearchCV(model, param_grid, cv=5)

# Train the model using the grid search object

grid_search.fit(X_train, y_train)

# Get the best model

best_model = grid_search.best_estimator_

# Make predictions on the test data

y_pred = best_model.predict(X_test)

# Evaluate the model performance

print(f'R2 Score: {best_model.score(X_test, y_test):.2f}')

R2 Score: 0.88


A R2 score of 0.88 means that the model explains 88% of the variance in the target variable.

Our target variable is the total sales of a retail store and the input features are the location, the products sold, and the sales method, a R2 score of 0.88 would mean that the model can explain 88% of the variance in the total sales based on these features.

2. Feature Engineering

In [55]:
# This code will create a new DataFrame with one-hot encoded versions of the retailer, region, state, city, product, and sales_method columns. 

#The one-hot encoded features and the total_sales column will be used to train a linear regression model, and the model will be evaluated using the R2 score.

In [23]:
# Import the necessary libraries

from sklearn.preprocessing import OneHotEncoder

# Create a new DataFrame with the one-hot encoded features

one_hot_df = pd.get_dummies(df, columns=['retailer', 'region', 'state', 'city', 'product', 'sales_method'])

# Select the encoded features and the target variable

X = one_hot_df.drop(columns=['total_sales', 'invoice_date'])
y = df['total_sales']

# Split the data into a training set and a test set

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Linear Regression model

model = LinearRegression()

# Train the model on the training data

model.fit(X_train, y_train)

# Make predictions on the test data

y_pred = model.predict(X_test)

# Evaluate the model performance

print(f'R2 Score: {model.score(X_test, y_test):.2f}')


R2 Score: 0.95


A R2 score of 0.95 means that the model explains 95% of the variance in the target variable.

3. Ensemble Learning

In [57]:
# This will combine multiple models to create a stronger, more robust model. 

# This can involve techniques such as bagging, boosting, or stacking.

In [58]:
# This code will use a random forest regressor to make predictions on the total_sales column based on the features in the X DataFrame (retailer, region, state, city, product, price_per_unit, units_sold, and sales_method). 

# The model will be trained on 80% of the data and tested on the remaining 20%. The performance of the model will be evaluated using the R2 score.

In [24]:
# Import the necessary libraries

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Select the features and the target variable

X = df[['retailer', 'region', 'state', 'city', 'product', 'price_per_unit', 'units_sold', 'sales_method']]
y = df['total_sales']

# Split the data into a training set and a test set

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Random Forest Regressor

model = RandomForestRegressor()

# Train the model on the training data

model.fit(X_train, y_train)

# Make predictions on the test data

y_pred = model.predict(X_test)

# Evaluate the model performance

print(f'R2 Score: {model.score(X_test, y_test):.2f}')


R2 Score: 1.00


An R2 score of 1.0 means that the model perfectly fits the data

4. Validation: Evaluating the model on a separate validation set to get a more accurate assessment of its performance.

In [25]:
# Select the features and the target variable

X = df[['retailer', 'region', 'state', 'city', 'product', 'price_per_unit', 'units_sold', 'sales_method']]
y = df['total_sales']

# Split the data into a training set, a validation set, and a test set

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Create a Linear Regression model

model = LinearRegression()

# Train the model on the training data

model.fit(X_train, y_train)

# Make predictions on the validation data

y_pred = model.predict(X_val)

# Evaluate the model performance on the validation data

print(f'R2 Score: {model.score(X_val, y_val):.2f}')

# Make predictions on the test data

y_pred = model.predict(X_test)

# Evaluate the model performance on the test data

print(f'R2 Score: {model.score(X_test, y_test):.2f}')


R2 Score: 0.88
R2 Score: 0.88


5. Deployment: Deploying the model in a production environment and monitoring its performance over time to ensure that it continues to perform well.

In [26]:
# Select the features and the target variable
X = df[['retailer', 'region', 'state', 'city', 'product', 'price_per_unit', 'units_sold', 'sales_method']]
y = df['total_sales']

# Split the data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Random Forest Regressor
model = RandomForestRegressor()

# Train the model on the training data
model.fit(X_train, y_train)

# Save the model to a file
import pickle
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)

# Load the model from a file
with open('model.pkl', 'rb') as f:
    model = pickle.load(f)

# Use the model to make predictions on new data
X_new = [[1, 2, 3, 4, 5, 6, 7, 8]]
y_pred = model.predict(X_new)
print(y_pred)

[178.03]


The output 178.03 is the prediction made by the model for the target variable (total sales) based on the input features provided in X_new.

In this case, X_new is a 2D array with a single row of data, which corresponds to a single example or observation in the dataset. The values in the array represent the values of the input features for that example.

For example, if the input features in X_new represent the retailer, the region, the state, the city, the product, the price per unit, the units sold, and the sales method, then the model's prediction of 175.42 for total sales would be based on these input features.

It is important to note that the prediction made by the model is based on the data and assumptions made during the training process, and may not necessarily reflect the true value of the target variable for that example. The accuracy of the model's prediction will depend on the quality of the data and the complexity of the problem.