In [98]:
import numpy as np
import random
import pandas as pd

# Load the raw sales data from a CSV file in the working directory.
# Later cells re-bind `df` to its one-hot-encoded and mean-imputed versions.
df = pd.read_csv("SalesPrediction.csv")


***1. Linear Regression***

In [16]:
# Split the frame by position: every column except the last becomes the
# feature matrix, and the last column becomes the target.
# NOTE(review): this assumes the target is the last column of `df` and that the
# remaining columns are numeric without NaN. Later cells apply get_dummies and
# fillna to `df`, so confirm which version of `df` this cell is meant to see.
X_data = np.array(df.iloc[:,:-1])
# Selecting a single column by integer position yields a 1-D array.
y_target = np.array(df.iloc[:,-1])

In [17]:
class CustomLinearRegression:
    """Linear regression trained with full-batch gradient descent on the MSE.

    A bias column of ones is prepended to the features, so ``theta`` has shape
    ``(n_features + 1, 1)`` with the intercept in row 0. Weights are drawn from
    ``np.random.rand``; seed NumPy beforehand for reproducible runs.

    Args:
        X_data: Feature matrix of shape ``(n_samples, n_features)``.
        y_target: Targets, either ``(n_samples,)`` or ``(n_samples, 1)``.
        learning_rate: Step size of every gradient-descent update.
        num_epochs: Number of full passes over the data performed by ``fit``.
    """

    def __init__(self, X_data, y_target, learning_rate = 0.01, num_epochs = 10000):
        self.num_samples = X_data.shape[0]
        self.X_data = np.c_[np.ones((self.num_samples,1)),X_data]

        # Keep the target as a column vector. Predictions are (n, 1); a flat
        # (n,) target would broadcast the residual to (n, n) and silently
        # corrupt both the loss and the gradient.
        y_target = np.asarray(y_target)
        if y_target.ndim == 1:
            y_target = y_target.reshape(-1, 1)
        self.y_target = y_target

        self.learning_rate = learning_rate
        self.num_epochs = num_epochs

        # Initial weights, uniform in [0, 1)
        self.theta = np.random.rand(self.X_data.shape[1],1)
        # Loss recorded at every epoch; accumulates across repeated fit() calls.
        self.losses = []

    def compute_loss(self, y_pred, y_target):
        """Return the mean squared error between predictions and targets."""
        y_pred = np.asarray(y_pred)
        y_target = np.asarray(y_target)
        # Same element count but different rank (e.g. (n, 1) vs (n,)): align
        # the shapes so the difference stays element-wise.
        if y_pred.ndim != y_target.ndim and y_pred.size == y_target.size:
            y_target = y_target.reshape(y_pred.shape)
        error = y_pred - y_target
        loss = np.multiply(error, error).mean()
        return loss

    def predict(self, X_data):
        """Return ``X_data @ theta``.

        ``X_data`` may already contain the leading bias column (as the matrix
        stored by ``__init__`` does) or be a raw 2-D feature matrix with one
        column fewer, in which case the bias column is added here.
        """
        if getattr(X_data, 'ndim', None) == 2 and X_data.shape[1] == self.theta.shape[0] - 1:
            X_data = np.c_[np.ones((X_data.shape[0], 1)), X_data]
        y_pred = X_data.dot(self.theta)
        return y_pred

    def fit(self):
        """Run gradient descent for ``num_epochs`` epochs.

        Returns:
            dict with ``'loss'`` (mean of the per-epoch losses recorded so far,
            NaN when none were recorded) and ``'weight'`` (the final ``theta``).
        """
        for epoch in range(self.num_epochs):
            # predict
            y_pred = self.predict(self.X_data)

            # compute loss
            loss = self.compute_loss(y_pred, self.y_target)
            self.losses.append(loss)

            # compute gradients: d(MSE)/d(theta) = X^T . 2 (y_pred - y) / n
            loss_grd = 2*(y_pred-self.y_target)/self.num_samples
            gradients = self.X_data.T.dot(loss_grd)

            # update weights
            self.theta = self.theta - self.learning_rate*gradients

            if (epoch%50) == 0:
                print(f'Epoch: {epoch} - Loss: {loss}')

        # Guard against num_epochs == 0, which would otherwise divide by zero.
        mean_loss = sum(self.losses)/len(self.losses) if self.losses else float('nan')
        return {
            'loss': mean_loss,
            'weight': self.theta
        }

In [None]:
# Question 1: A
# Question 2: C
# Question 3: D

In [22]:
def r2score(y_pred, y):
    """Compute the coefficient of determination (R^2) of a set of predictions.

    Args:
        y_pred: Predicted values (array-like).
        y: Ground-truth values (array-like) with the same number of elements.

    Returns:
        ``1 - RSS / TSS``. When ``y`` is constant (``TSS == 0``) the ratio is
        undefined; the score is then 1.0 for a perfect prediction and 0.0
        otherwise, the same convention scikit-learn's ``r2_score`` uses by
        default. Empty input yields NaN.
    """
    # Flatten both inputs so an (n, 1) column and an (n,) vector are compared
    # element-wise instead of broadcasting to an (n, n) matrix.
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    y = np.asarray(y, dtype=float).ravel()
    if y.size == 0:
        return float('nan')

    rss = np.sum( (y_pred-y)**2 )
    tss = np.sum( (y-y.mean())**2 )
    if tss == 0:
        # Constant target: avoid dividing by zero.
        return 1.0 if rss == 0 else 0.0
    r2 = 1 - (rss/tss)
    return r2

# Question 4: B
# Case 1: predictions equal the targets, so the residual sum of squares is 0
# and the score is exactly 1.
y_pred = np.array([1,2,3,4,5])
y = np.array([1,2,3,4,5])
print('Case 1: ', r2score(y_pred,y))

# Case 2: predictions that fit worse than always predicting the mean of y,
# which makes the score negative.
y_pred = np.array([1,2,3,4,5])
y = np.array([3,5,5,2,4])
print('Case 2: ', r2score(y_pred,y))


Case 1:  1.0
Case 2:  -2.235294117647059


In [None]:
# Question 5: B
# Question 6: B

***2. Polynomial Regression:***

In [39]:
def create_polynomial_features(X, degree=2):
    """Append element-wise powers of ``X`` as additional columns.

    Args:
        X: Array holding the input data.
        degree: Integer, the highest power to generate.

    Returns:
        ``X`` itself when ``degree`` is below 2; otherwise the column-wise
        concatenation ``[X, X**2, ..., X**degree]``.
    """
    higher_powers = [X**power for power in range(2, degree+1)]
    if not higher_powers:
        # Nothing to append: hand back the input untouched.
        return X
    return np.c_[(X, *higher_powers)]

# Single-feature check: the result columns should be x, x^2 and x^3.
X = np.array([[1],[2],[3]])
create_polynomial_features(X, 3)

array([[ 1,  1,  1],
       [ 2,  4,  8],
       [ 3,  9, 27]])

In [None]:
# Question 7: A
# Question 8: A

In [96]:
def create_polynomial_features (X , degree=2):
    """Create polynomial terms for every feature, grouped feature by feature.

    For each column ``x_j`` of ``X`` the output holds ``x_j, x_j**2, ...,
    x_j**degree`` side by side, followed by the same powers of the next
    column. No cross terms between different features are generated.

    This definition shares its name with the single-feature version defined
    earlier in the notebook and replaces it once this cell has run.

    Args:
        X: Data array of shape ``(n_samples, n_features)``. A 1-D array is
            treated as a single sample (one row).
        degree: Integer, the highest power to generate. Values below 1 behave
            like 1, i.e. the features are returned without extra powers.

    Returns:
        A new array of shape ``(n_samples, n_features * max(degree, 1))``.
    """
    features = np.asarray(X)
    if features.ndim == 1:
        features = features.reshape(1, -1)

    max_power = max(degree, 1)
    columns = []
    for column in features.T:
        # The first power is the column itself; this also makes degree=1 return
        # the features unchanged instead of a flattened single row.
        columns.append(column)
        columns.extend(np.power(column, power) for power in range(2, max_power + 1))

    if not columns:
        # No features at all: nothing to expand.
        return features.copy()
    return np.c_[tuple(columns)]

# Two-feature check: the powers of the first column come first, then the
# powers of the second column.
X = np.array([[1,2],[2,3],[3,4]])
create_polynomial_features(X, 3)

array([[ 1,  1,  1,  2,  4,  8],
       [ 2,  4,  8,  3,  9, 27],
       [ 3,  9, 27,  4, 16, 64]])

In [101]:
# Question 9: A
# One-hot encode the categorical data; in the df.info() output that follows
# this shows up as the four boolean Influencer_* columns.
# `df` is re-bound here, so the raw frame is no longer available afterwards.
df = pd.get_dummies(df)

In [102]:
# Inspect dtypes and non-null counts after encoding to see which columns still
# contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4572 entries, 0 to 4571
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   TV                4562 non-null   float64
 1   Radio             4568 non-null   float64
 2   Social Media      4566 non-null   float64
 3   Sales             4566 non-null   float64
 4   Influencer_Macro  4572 non-null   bool   
 5   Influencer_Mega   4572 non-null   bool   
 6   Influencer_Micro  4572 non-null   bool   
 7   Influencer_Nano   4572 non-null   bool   
dtypes: bool(4), float64(4)
memory usage: 160.9 KB


In [103]:
# Handle null values: replace each missing entry with the mean of its column.
# NOTE(review): the means are computed over the whole frame, before the
# train/test split further down, and the `Sales` target is imputed as well.
# Rows that later land in the test split therefore influence the fill values;
# confirm this is intended.
df = df.fillna(df.mean())

In [104]:
# Re-inspect the frame to confirm that no null values remain after imputation.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4572 entries, 0 to 4571
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   TV                4572 non-null   float64
 1   Radio             4572 non-null   float64
 2   Social Media      4572 non-null   float64
 3   Sales             4572 non-null   float64
 4   Influencer_Macro  4572 non-null   bool   
 5   Influencer_Mega   4572 non-null   bool   
 6   Influencer_Micro  4572 non-null   bool   
 7   Influencer_Nano   4572 non-null   bool   
dtypes: bool(4), float64(4)
memory usage: 160.9 KB


In [122]:
# Get features: every column except the target. Selecting by name instead of
# by hard-coded positions keeps this correct if the column order produced by
# get_dummies ever changes.
X = df.drop(columns=['Sales'])
# Double brackets keep the target as a one-column DataFrame.
y = df[['Sales']]

# Train Test Split: hold out 33% of the rows; the fixed random_state makes the
# split reproducible.

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.33, random_state=0)

In [135]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training split only and reuse the fitted scaler on the
# test split, so the scaling statistics come from training rows alone.
# NOTE(review): the boolean Influencer_* columns are part of X and are scaled
# along with the numeric ones — confirm that is intended.
# NOTE(review): the name is spelled `scalar`; a later cell reads `scalar.mean_`,
# so any rename must cover both places.
scalar = StandardScaler()
X_train_processed = scalar.fit_transform(X_train)
X_test_processed = scalar.transform(X_test)

In [128]:
# First entry of the statistics the scaler stored when it was fitted on
# X_train; index 0 corresponds to the first feature column (TV).
scalar.mean_[0]
# Question 10: A

54.173577723283785

In [144]:
from sklearn.preprocessing import PolynomialFeatures

# Degree-2 polynomial expansion of the standardized features.
# NOTE(review): with its default settings PolynomialFeatures presumably also
# emits a constant column and cross terms between features, unlike the
# hand-written create_polynomial_features above — confirm against the docs.
poly_features = PolynomialFeatures(degree=2)

# Fit on the training matrix, then apply the same expansion to the test matrix.
X_train_poly = poly_features.fit_transform(X_train_processed)
X_test_poly = poly_features.transform(X_test_processed)

In [148]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Fit a linear model on the polynomial-expanded training features.
poly_model = LinearRegression()
poly_model.fit(X_train_poly, y_train)

# Score the held-out split. r2_score is called with the true targets first,
# the opposite argument order of the hand-written r2score(y_pred, y).
preds = poly_model.predict(X_test_poly)
r2_score(y_test, preds)


0.9951771444754912