In [133]:
from linear_regression_model import CustomLinearRegression
from utils import *

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

In [134]:
df = pd.read_csv('data/SalesPrediction.csv')
df

Unnamed: 0,TV,Radio,Social Media,Influencer,Sales
0,16.0,6.566231,2.907983,Mega,54.732757
1,13.0,9.237765,2.409567,Mega,46.677897
2,41.0,15.886446,2.913410,Mega,150.177829
3,83.0,30.020028,6.922304,Mega,298.246340
4,15.0,8.437408,1.405998,Micro,56.594181
...,...,...,...,...,...
4567,26.0,4.472360,0.717090,Micro,94.685866
4568,71.0,20.610685,6.545573,Nano,249.101915
4569,44.0,19.800072,5.096192,Micro,163.631457
4570,71.0,17.534640,1.940873,Macro,253.610411


In [135]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4572 entries, 0 to 4571
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   TV            4562 non-null   float64
 1   Radio         4568 non-null   float64
 2   Social Media  4566 non-null   float64
 3   Influencer    4572 non-null   object 
 4   Sales         4566 non-null   float64
dtypes: float64(4), object(1)
memory usage: 178.7+ KB


In [136]:
df.describe()

Unnamed: 0,TV,Radio,Social Media,Sales
count,4562.0,4568.0,4566.0,4566.0
mean,54.066857,18.160356,3.323956,192.466602
std,26.125054,9.676958,2.21267,93.133092
min,10.0,0.000684,3.1e-05,31.199409
25%,32.0,10.525957,1.527849,112.322882
50%,53.0,17.859513,3.055565,189.231172
75%,77.0,25.64973,4.807558,272.507922
max,100.0,48.871161,13.981662,364.079751


In [137]:
df = pd.get_dummies(df)
df

Unnamed: 0,TV,Radio,Social Media,Sales,Influencer_Macro,Influencer_Mega,Influencer_Micro,Influencer_Nano
0,16.0,6.566231,2.907983,54.732757,False,True,False,False
1,13.0,9.237765,2.409567,46.677897,False,True,False,False
2,41.0,15.886446,2.913410,150.177829,False,True,False,False
3,83.0,30.020028,6.922304,298.246340,False,True,False,False
4,15.0,8.437408,1.405998,56.594181,False,False,True,False
...,...,...,...,...,...,...,...,...
4567,26.0,4.472360,0.717090,94.685866,False,False,True,False
4568,71.0,20.610685,6.545573,249.101915,False,False,False,True
4569,44.0,19.800072,5.096192,163.631457,False,False,True,False
4570,71.0,17.534640,1.940873,253.610411,True,False,False,False


In [138]:
df = df.fillna(0)
df.isnull().sum()

TV                  0
Radio               0
Social Media        0
Sales               0
Influencer_Macro    0
Influencer_Mega     0
Influencer_Micro    0
Influencer_Nano     0
dtype: int64

In [139]:
df = df.fillna(df.mean())
df.isnull().sum()

TV                  0
Radio               0
Social Media        0
Sales               0
Influencer_Macro    0
Influencer_Mega     0
Influencer_Micro    0
Influencer_Nano     0
dtype: int64

In [140]:
df[['TV', 'Radio', 'Social Media', 'Sales']].corr()

Unnamed: 0,TV,Radio,Social Media,Sales
TV,1.0,0.860518,0.522565,0.98857
Radio,0.860518,1.0,0.60445,0.86379
Social Media,0.522565,0.60445,1.0,0.526777
Sales,0.98857,0.86379,0.526777,1.0


### Define features and target

In [141]:
X_features = df[['TV', 'Radio', 'Social Media', 'Influencer_Macro', 'Influencer_Mega', 
        'Influencer_Micro', 'Influencer_Nano']]
y_sales = df[['Sales']]

X_train, X_test, y_train, y_test = train_test_split(X_features, y_sales, test_size=0.33, random_state=0)

In [142]:
X_train.dtypes, y_train.dtypes

(TV                  float64
 Radio               float64
 Social Media        float64
 Influencer_Macro       bool
 Influencer_Mega        bool
 Influencer_Micro       bool
 Influencer_Nano        bool
 dtype: object,
 Sales    float64
 dtype: object)

In [143]:
y_train.shape

(3063, 1)

In [144]:
scaler = StandardScaler()
X_train_preprocessed = scaler.fit_transform(X_train)
X_test_preprocessed = scaler.transform(X_test)
scaler.mean_

array([53.9970617 , 18.22209011,  3.33487105,  0.24779628,  0.25138753,
        0.25008162,  0.25073457])

### Test R2 Score

In [145]:
y_pred = np.array([1, 2, 3, 4, 5])
y = np.array([1, 2, 3, 4, 5])

r2_score(y_pred, y)

np.float64(1.0)

In [146]:
y = np.array([3, 5, 5, 2, 4])
r2_score(y_pred, y)

np.float64(-1.2000000000000002)

### Train Linear Regression

In [147]:
from sklearn.linear_model import LinearRegression

lr_model = LinearRegression()
lr_model.fit(X_train_preprocessed, y_train)

In [148]:
y_pred = lr_model.predict(X_test_preprocessed)
r2_score(y_pred, y_test)

np.float64(0.9815459611095066)

### Create polynomial features

In [149]:
X = np.array([[1, 2], [2, 3], [3, 4]])
degree = 2

X_poly = create_polynomial_features(X, degree)
X_poly

array([[ 1,  1,  2,  4],
       [ 2,  4,  3,  9],
       [ 3,  9,  4, 16]])

### Train Polynomial Regression

In [150]:
from sklearn.preprocessing import PolynomialFeatures

poly_features = PolynomialFeatures(degree=2)
X_train_poly = poly_features.fit_transform(X_train_preprocessed)
X_test_poly = poly_features.transform(X_test_preprocessed)

In [151]:
poly_model = LinearRegression()
poly_model.fit(X_train_poly, y_train)

preds = poly_model.predict(X_test_poly)
r2_score(preds, y_test)

np.float64(0.9781759375214123)

### Train Custom Polynomial Regression

In [152]:
X_train_preprocessed.shape

(3063, 7)

In [153]:
X_train_preprocessed[:, 2:3]

array([[-0.17117575],
       [-1.47454833],
       [-0.55726535],
       ...,
       [ 0.58703816],
       [-1.22457248],
       [-1.04684805]])

In [154]:
X_train_poly = create_polynomial_features(X_train_preprocessed[:, 2:3], degree=2)
X_train_poly

array([[-0.17117575,  0.02930114],
       [-1.47454833,  2.17429276],
       [-0.55726535,  0.31054467],
       ...,
       [ 0.58703816,  0.3446138 ],
       [-1.22457248,  1.49957777],
       [-1.04684805,  1.09589085]])

In [155]:
X_test_poly = create_polynomial_features(X_test_preprocessed[:, 2:3], degree=2)
X_test_poly = np.hstack((X_test_preprocessed, X_test_poly[:, 1:]))
X_test_poly

array([[-0.34283858, -0.11361891, -0.84351677, ...,  1.73167391,
        -0.57848122,  0.71152054],
       [ 0.76222428,  1.17276695, -0.45136527, ..., -0.57747593,
         1.72866459,  0.20373061],
       [ 1.14328044,  1.04152694,  1.06570814, ..., -0.57747593,
        -0.57848122,  1.13573384],
       ...,
       [ 0.95275236,  1.11773825,  1.06866838, ..., -0.57747593,
        -0.57848122,  1.14205211],
       [-0.41904981, -0.32445517,  1.20063234, ..., -0.57747593,
        -0.57848122,  1.44151802],
       [-0.91442282, -1.25598448, -0.32806086, ..., -0.57747593,
        -0.57848122,  0.10762393]])

In [156]:
X_train_preprocessed.shape, y_train.shape

((3063, 7), (3063, 1))

In [157]:
custom_poly_model = CustomLinearRegression(X_train_preprocessed, y_train, lr=0.1, epochs=300)
custom_poly_model.fit()

Epoch: 0 - Loss: 46095.039002692974
Epoch: 50 - Loss: 236.18208262026488
Epoch: 100 - Loss: 213.7285858789419
Epoch: 150 - Loss: 212.25027120594578
Epoch: 200 - Loss: 212.152878500321
Epoch: 250 - Loss: 212.14646217989534


{'loss': np.float64(629.1270171218272),
 'weight': array([[ 1.92762097e+02],
        [ 8.60392588e+01],
        [ 6.88768452e+00],
        [ 2.13059687e-01],
        [ 1.27537193e-01],
        [ 5.91277063e-02],
        [ 1.06167699e-02],
        [-2.07845438e-01]])}

In [158]:
preds = custom_poly_model.predict(X_test_preprocessed)
r2_score(preds, y_test)

np.float64(0.9815410449296709)

In [159]:
print(np.__version__)

2.1.2
