In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as mse

In [3]:
df = pd.read_csv('./data/Dummy Data HSS.xls')
df.head()

Unnamed: 0,TV,Radio,Social Media,Influencer,Sales
0,16.0,6.566231,2.907983,Mega,54.732757
1,13.0,9.237765,2.409567,Mega,46.677897
2,41.0,15.886446,2.91341,Mega,150.177829
3,83.0,30.020028,6.922304,Mega,298.24634
4,15.0,8.437408,1.405998,Micro,56.594181


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4572 entries, 0 to 4571
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   TV            4562 non-null   float64
 1   Radio         4568 non-null   float64
 2   Social Media  4566 non-null   float64
 3   Influencer    4572 non-null   object 
 4   Sales         4566 non-null   float64
dtypes: float64(4), object(1)
memory usage: 178.7+ KB


In [5]:
df = pd.get_dummies(df) # updates the df again

# see the updated df

df

Unnamed: 0,TV,Radio,Social Media,Sales,Influencer_Macro,Influencer_Mega,Influencer_Micro,Influencer_Nano
0,16.0,6.566231,2.907983,54.732757,False,True,False,False
1,13.0,9.237765,2.409567,46.677897,False,True,False,False
2,41.0,15.886446,2.913410,150.177829,False,True,False,False
3,83.0,30.020028,6.922304,298.246340,False,True,False,False
4,15.0,8.437408,1.405998,56.594181,False,False,True,False
...,...,...,...,...,...,...,...,...
4567,26.0,4.472360,0.717090,94.685866,False,False,True,False
4568,71.0,20.610685,6.545573,249.101915,False,False,False,True
4569,44.0,19.800072,5.096192,163.631457,False,False,True,False
4570,71.0,17.534640,1.940873,253.610411,True,False,False,False


In [6]:
df.columns # getting the column names

Index(['TV', 'Radio', 'Social Media', 'Sales', 'Influencer_Macro',
       'Influencer_Mega', 'Influencer_Micro', 'Influencer_Nano'],
      dtype='object')

### Now we have a dataframe that has no missing values. Now, we want to encode the categorical variable to dummy variables, as follows:

In [7]:
df = df[['TV', 'Radio', 'Social Media', 'Influencer_Macro',
       'Influencer_Mega', 'Influencer_Micro', 'Influencer_Nano', 'Sales']]

# see the updated df

df.head()

Unnamed: 0,TV,Radio,Social Media,Influencer_Macro,Influencer_Mega,Influencer_Micro,Influencer_Nano,Sales
0,16.0,6.566231,2.907983,False,True,False,False,54.732757
1,13.0,9.237765,2.409567,False,True,False,False,46.677897
2,41.0,15.886446,2.91341,False,True,False,False,150.177829
3,83.0,30.020028,6.922304,False,True,False,False,298.24634
4,15.0,8.437408,1.405998,False,False,True,False,56.594181




### Out of curiosity, we just want to explore whether there is any correlation of Sales with its predictors:


In [8]:
df.corr()

Unnamed: 0,TV,Radio,Social Media,Influencer_Macro,Influencer_Mega,Influencer_Micro,Influencer_Nano,Sales
TV,1.0,0.86946,0.528168,0.021375,-0.012642,-0.004868,-0.003648,0.999497
Radio,0.86946,1.0,0.607452,0.009523,-0.005072,0.004215,-0.008605,0.869105
Social Media,0.528168,0.607452,1.0,0.011644,0.013083,-0.013315,-0.011357,0.528906
Influencer_Macro,0.021375,0.009523,0.011644,1.0,-0.332131,-0.331171,-0.328482,0.019277
Influencer_Mega,-0.012642,-0.005072,0.013083,-0.332131,1.0,-0.338211,-0.335465,-0.01171
Influencer_Micro,-0.004868,0.004215,-0.013315,-0.331171,-0.338211,1.0,-0.334495,-0.004101
Influencer_Nano,-0.003648,-0.008605,-0.011357,-0.328482,-0.335465,-0.334495,1.0,-0.003292
Sales,0.999497,0.869105,0.528906,0.019277,-0.01171,-0.004101,-0.003292,1.0


### Now we have a clean data. Now, we will create the independent and dependent variables (x and y), as follows:

In [9]:
x = df.iloc[:,0:-1].values
y = df.iloc[:,-1:].values




Let's see some preview of x and y:


In [10]:
print(x)
print(y)


[[16.0 6.566230788 2.907982773 ... True False False]
 [13.0 9.237764567 2.409567204 ... True False False]
 [41.0 15.88644602 2.913410175 ... True False False]
 ...
 [44.0 19.80007236 5.096191875 ... False True False]
 [71.0 17.5346403 1.94087322 ... False False False]
 [42.0 15.96668752 5.046547629 ... False True False]]
[[ 54.73275715]
 [ 46.67789698]
 [150.1778288 ]
 ...
 [163.6314574 ]
 [253.6104113 ]
 [148.2024141 ]]




Looks good. Now, let's split our data for training and testing:


In [11]:
x_train, x_test, y_train, y_test = train_test_split(x, y)



Let's see some previews:


In [12]:
print(x_train)
print(y_train)

[[66.0 34.39442709 3.764872969 ... True False False]
 [62.0 24.31903081 0.722076955 ... False False True]
 [65.0 24.23537069 3.319695807 ... False False False]
 ...
 [39.0 16.43039265 0.629930468 ... False True False]
 [17.0 1.161547765 0.126207259 ... False False True]
 [85.0 29.82247184 7.304243425 ... False False True]]
[[240.8906045 ]
 [220.6699758 ]
 [233.6433429 ]
 ...
 [140.2820132 ]
 [ 60.62366983]
 [303.2443019 ]]


We want to know the number of data used for training and testing, as follows:

In [13]:
print(len(x_train),len(x_test))

3429 1143



### Analysis¶

Now we will train and predict the data based on several regression models:


- **Régression Linéaire (Linear Regression)** : Un algorithme de régression simple qui tente de modéliser la relation entre une variable dépendante et une ou plusieurs variables indépendantes en ajustant une ligne droite à travers les données.

- **Forêt Aléatoire (Random Forest)** : Un ensemble de méthodes d'apprentissage basées sur des arbres de décision, où plusieurs arbres sont construits pour améliorer la précision et réduire le surapprentissage.

- **Arbre de Décision (Decision Tree)** : Un modèle d'apprentissage supervisé qui divise les données en branches basées sur des critères décisionnels pour effectuer une prédiction ou une classification.

- **Machine à Vecteurs de Support (Support Vector Machine)** : Un algorithme utilisé pour les tâches de classification qui trouve l'hyperplan optimal séparant les différentes classes de manière maximale.

- **Régression Polynomiale (Polynomial Regression)** : Un algorithme de régression qui modélise la relation entre la variable dépendante et les variables indépendantes sous forme d'équation polynomiale.


For each regression model, we will evaluate its r2_score and root mean squared error (RMSE). The higher r2_score the better; the lower RMSE, the better.
### Linear Regression

In [14]:
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score, mean_squared_error as mse

In [22]:
from sklearn.impute import SimpleImputer

# Impute missing values in x_train and y_train with the mean of the respective columns
imputer_x = SimpleImputer(strategy='mean')
imputer_y = SimpleImputer(strategy='mean')

# Fit and transform x_train and y_train
x_train_imputed = imputer_x.fit_transform(x_train)
y_train_imputed = imputer_y.fit_transform(y_train)

# Now, fit the model with the imputed data
lr_regressor = LinearRegression()
lr_regressor.fit(x_train_imputed, y_train_imputed)


In [26]:
# Impute missing values in x_test with the mean of the respective columns
x_test_imputed = imputer_x.transform(x_test)  # Use the same imputer as for x_train
# Fit the model and predict using x_test_imputed
svr_regressor = SVR()
svr_regressor.fit(x_train_imputed, y_train_imputed)
# Perform prediction
y_pred_svr = svr_regressor.predict(x_test_imputed)
# Evaluate the model
r2_svr = r2_score(y_test, y_pred_svr)
rmse_svr = mse(y_test, y_pred_svr) ** 0.5
print(f"SVR R2 score: {r2_svr}")
print(f"SVR RMSE: {rmse_svr}")


  y = column_or_1d(y, warn=True)


SVR R2 score: 0.9964567667778762
SVR RMSE: 5.501433871929206


In [27]:
# Print the coefficients and intercept of the linear regression model
print("Coefficients: ", lr_regressor.coef_)
print("Intercept: ", lr_regressor.intercept_)


Coefficients:  [[ 3.49833129  0.16198314  0.08503987 -0.46895771  0.18783963  0.06181933
   0.21929875]]
Intercept:  [0.09832431]


In [28]:
df.columns

Index(['TV', 'Radio', 'Social Media', 'Influencer_Macro', 'Influencer_Mega',
       'Influencer_Micro', 'Influencer_Nano', 'Sales'],
      dtype='object')

In [29]:
print(lr_regressor.coef_, lr_regressor.intercept_)

[[ 3.49833129  0.16198314  0.08503987 -0.46895771  0.18783963  0.06181933
   0.21929875]] [0.09832431]


Random Forest Regression