# Sales Prediction - Taniya Naskar

In [1]:
# imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load

In [2]:
# Read the advertising data from the working directory into a DataFrame,
# then leave the frame as the cell's last expression so it is displayed.
df = pd.read_csv(filepath_or_buffer='Advertising.csv')
df

Unnamed: 0.1,Unnamed: 0,TV,Radio,Newspaper,Sales
0,1,230.1,37.8,69.2,22.1
1,2,44.5,39.3,45.1,10.4
2,3,17.2,45.9,69.3,9.3
3,4,151.5,41.3,58.5,18.5
4,5,180.8,10.8,58.4,12.9
...,...,...,...,...,...
195,196,38.2,3.7,13.8,7.6
196,197,94.2,4.9,8.1,9.7
197,198,177.0,9.3,6.4,12.8
198,199,283.6,42.0,66.2,25.5


In [3]:
# Remove the leftover 'Unnamed: 0' column (it looks like a row counter carried
# over from the CSV -- confirm against the file).
# Reassigning instead of using inplace=True, together with errors='ignore',
# makes this cell safe to re-run: a second execution is a no-op rather than a
# KeyError for the already-removed column.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [4]:
# Show the frame again after the column drop.
df

Unnamed: 0,TV,Radio,Newspaper,Sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9
...,...,...,...,...
195,38.2,3.7,13.8,7.6
196,94.2,4.9,8.1,9.7
197,177.0,9.3,6.4,12.8
198,283.6,42.0,66.2,25.5


# Split the data 

In [5]:
# 'Sales' is the response; every other column is used as a predictor.
target = df['Sales']
features = df.drop(columns=['Sales'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# repeatable between runs.
(features_train, features_test,
 target_train, target_test) = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=33,
)

# Model

In [19]:
# Ordinary least-squares regression fitted on the training split only.
# The fit call stays the last expression so the cell still shows its result.
model = LinearRegression()
model.fit(X=features_train, y=target_train)

In [20]:
# Predict the held-out rows, then score those predictions against the true
# targets with mean squared error and the coefficient of determination (R^2).
ypred = model.predict(features_test)
mse = mean_squared_error(y_true=target_test, y_pred=ypred)
r2 = r2_score(y_true=target_test, y_pred=ypred)

In [21]:
# Report the hold-out metrics, one per line; R^2 is shown as a percentage
# with two decimals.
print(
    f"Mean Squared Error: {mse}",
    f"R-squared: {r2*100:.2f}%",
    sep="\n",
)

Mean Squared Error: 2.321980700384382
R-squared: 89.37%


# Cross Validation

In [23]:
# K-fold cross-validation of the linear model on the full dataset.
# Each fold fits a fresh LinearRegression on the training rows and scores it
# on the held-out rows; the per-fold MSE and R^2 are then averaged.
num_folds = 5

# Shuffle before splitting; the fixed seed keeps the fold assignment repeatable.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=42)
mse_scores = []
r2_scores = []

for train_index, test_index in kfold.split(features):
    # KFold.split yields positional row indices, so select with .iloc on both
    # the features and the target. Plain target[...] would look rows up by
    # index label, which only works while the index is the default 0..n-1.
    xtrain, xtest = features.iloc[train_index], features.iloc[test_index]
    ytrain, ytest = target.iloc[train_index], target.iloc[test_index]

    # Fold-specific names keep the hold-out `model`, `mse` and `r2` computed
    # in the earlier cells from being overwritten by the last fold.
    fold_model = LinearRegression()
    fold_model.fit(xtrain, ytrain)

    # Score the fold from a single set of predictions, using the same metric
    # functions as the hold-out evaluation.
    y_pred = fold_model.predict(xtest)
    fold_mse = mean_squared_error(ytest, y_pred)
    fold_r2 = r2_score(ytest, y_pred)

    mse_scores.append(fold_mse)
    r2_scores.append(fold_r2)

avg_mse = np.mean(mse_scores)
avg_r2 = np.mean(r2_scores)

In [25]:
# Report the cross-validated averages; R^2 is shown as a percentage.
# The format spec is ':.2f' (no space flag), so the value follows the colon
# with a single space, matching the hold-out report's "R-squared" line.
print(f"Average MSE: {avg_mse}")
print(f"Average R-squared: {avg_r2*100:.2f}%")

Average MSE: 2.9650878042681628
Average R-squared:  88.27%
