# Multiple Linear Regression on Toy Sales dataset

## Importing the libraries

In [11]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

## Importing the dataset

To keep things simple, we usually structure the dataset so that the target variable is the last column in the table.

*  **X typically denotes the feature variables**, which are all the columns in the table except the last one
* **y typically denotes the single target variable**, which is the last column in the table

In [12]:
# Load the toy sales data from the CSV file.
df = pd.read_csv('Toy-Sales.csv')
# Renumber the rows from 1 instead of 0, purely so the printed table is easier to read.
df.index += 1
# Positional split: every column but the last is a feature (X), the last column is the target (y).
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [13]:
# Show a heading, the full feature table, and its Python type, one per line.
print("Feature variables of the entire dataset", X, type(X), sep="\n")

Feature variables of the entire dataset
    Price  Adexp  Promexp
1    8.75  50.04    61.13
2    8.99  50.74    60.19
3    7.50  50.14    59.16
4    7.25  50.27    60.38
5    7.40  51.25    59.71
6    8.50  50.65    59.88
7    8.40  50.87    60.14
8    7.90  50.15    60.08
9    7.25  48.24    59.90
10   8.70  50.19    59.68
11   8.40  51.11    59.83
12   8.10  51.49    59.77
13   8.40  50.10    59.29
14   7.40  49.24    60.40
15   8.00  50.04    59.89
16   8.30  49.46    60.06
17   8.10  51.62    60.51
18   8.20  49.78    58.93
19   8.99  48.60    60.09
20   7.99  49.00    61.00
21   8.50  48.00    59.00
22   7.90  54.00    59.50
23   7.99  48.70    58.00
24   8.25  50.00    60.50
<class 'pandas.core.frame.DataFrame'>


In [14]:
# Show a heading, the full target column, and its Python type, one per line.
print("Target variable of the entire dataset", y, type(y), sep="\n")

Target variable of the entire dataset
1     73959
2     71544
3     78587
4     80364
5     78771
6     71986
7     74885
8     73345
9     76659
10    71880
11    73598
12    74893
13    69003
14    78542
15    72543
16    74247
17    76253
18    72582
19    69022
20    76200
21    69701
22    77005
23    70987
24    75643
Name: UnitSales, dtype: int64
<class 'pandas.core.series.Series'>


## Training the Multiple Linear Regression model on the entire dataset

Normally we would first split the data into training and test sets, but to keep things simple here we train the linear regression model directly on the entire dataset.


In [15]:
# Fit a linear regression model on every row of the dataset (no train/test split).
# LinearRegression is imported in the notebook's import cell at the top, so that
# all dependencies are visible in one place and survive Restart & Run All.
regressor = LinearRegression()
regressor.fit(X, y)

## The model parameters are the coefficients and the intercept

#### Determining the coefficients 

In [16]:
# One fitted coefficient per feature column, in the same order as the columns of X.
coef_array = regressor.coef_

# Messages are listed in column order so that position i pairs with coef_array[i].
coefficient_messages = (
    "Coefficient for the first feature variable: Price($) is ",
    "Coefficient for the second feature variable: AdExp ($000) is ",
    "Coefficient for the third feature variable: PromExp ($000) is ",
)
for position, message in enumerate(coefficient_messages):
    print(message, coef_array[position])


Coefficient for the first feature variable: Price($) is  -5055.26986592085
Coefficient for the second feature variable: AdExp ($000) is  648.6121402597224
Coefficient for the third feature variable: PromExp ($000) is  1802.6109561246012


#### Determining the intercept

In [17]:
# Report the fitted intercept, i.e. the constant term of the regression equation.
print(
    "Intercept for the regression model is ",
    regressor.intercept_,
)

Intercept for the regression model is  -25096.83292187017


## Generating predictions for 3 different scenarios

Each scenario involves a different combination of values for the feature variables

In [18]:
# Each inner list is one scenario; values follow the column order given below.
scenarios = [
    [9.10, 52.00, 61.00],  # Scenario 1
    [7.10, 48.00, 57.00],  # Scenario 2
    [8.10, 50.00, 60.00],  # Scenario 3
]

# The model was trained on a DataFrame with column labels, so the input to
# predict() is built as a DataFrame carrying the same column labels.
# The index starts at 1 only to make the table a bit more readable.
scenarios_df = pd.DataFrame(
    scenarios,
    columns=['Price', 'Adexp', 'Promexp'],
    index=pd.RangeIndex(start=1, stop=len(scenarios) + 1),
)
scenarios_df



Unnamed: 0,Price,Adexp,Promexp
1,9.1,52.0,61.0
2,7.1,48.0,57.0
3,8.1,50.0,60.0


In [19]:

# Alternative construction of the same three scenarios: a dict that maps each
# column label to that column's values (one entry per scenario).
scenarios_df = pd.DataFrame(
    {
        'Price': [9.1, 7.1, 8.1],
        'Adexp': [52.0, 48.0, 50.0],
        'Promexp': [61.0, 57.0, 60.0],
    }
)
# Shift the index to start at 1; not required, it just reads more naturally.
scenarios_df.index += 1
scenarios_df
    

Unnamed: 0,Price,Adexp,Promexp
1,9.1,52.0,61.0
2,7.1,48.0,57.0
3,8.1,50.0,60.0


In [20]:
# Predict unit sales for every scenario row in one call.
scenario_pred = regressor.predict(scenarios_df)

# Number the scenarios from 1 so the printed labels match the 1-based table index.
for scenario_number, prediction in enumerate(scenario_pred, start=1):
    print(f"Prediction for scenario {scenario_number} is {prediction}")


Prediction for scenario 1 is 72587.31091535633
Prediction for scenario 2 is 72892.95826166074
Prediction for scenario 3 is 74542.74554463314
