## Multiple Linear Regression

In [2]:
import pandas as pd
ad = pd.read_csv("Advertising.csv", usecols = [1,2,3,4])
df = ad.copy()
df.head()

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


In [3]:
X = df.drop("sales", axis = 1)

In [4]:
X[:10]

Unnamed: 0,TV,radio,newspaper
0,230.1,37.8,69.2
1,44.5,39.3,45.1
2,17.2,45.9,69.3
3,151.5,41.3,58.5
4,180.8,10.8,58.4
5,8.7,48.9,75.0
6,57.5,32.8,23.5
7,120.2,19.6,11.6
8,8.6,2.1,1.0
9,199.8,2.6,21.2


In [5]:
y = df["sales"]

In [6]:
y

0      22.1
1      10.4
2       9.3
3      18.5
4      12.9
       ... 
195     7.6
196     9.7
197    12.8
198    25.5
199    13.4
Name: sales, Length: 200, dtype: float64

In [7]:
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)

In [9]:
X_train.shape

(160, 3)

In [10]:
y_train.shape

(160,)

In [11]:
X_test.shape

(40, 3)

In [12]:
y_test.shape

(40,)

In [13]:
training = df.copy()

In [14]:
training.shape

(200, 4)

### Statsmodels

In [16]:
import statsmodels.api as sm

In [19]:
lm = sm.OLS(y_train, X_train)

In [20]:
model = lm.fit()
model.summary()

0,1,2,3
Dep. Variable:,sales,R-squared (uncentered):,0.982
Model:,OLS,Adj. R-squared (uncentered):,0.982
Method:,Least Squares,F-statistic:,2935.0
Date:,"Sun, 25 Jun 2023",Prob (F-statistic):,1.28e-137
Time:,21:43:30,Log-Likelihood:,-336.65
No. Observations:,160,AIC:,679.3
Df Residuals:,157,BIC:,688.5
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
TV,0.0531,0.001,36.467,0.000,0.050,0.056
radio,0.2188,0.011,20.138,0.000,0.197,0.240
newspaper,0.0239,0.008,3.011,0.003,0.008,0.040

0,1,2,3
Omnibus:,11.405,Durbin-Watson:,1.895
Prob(Omnibus):,0.003,Jarque-Bera (JB):,15.574
Skew:,-0.432,Prob(JB):,0.000415
Kurtosis:,4.261,Cond. No.,13.5


In [21]:
model.summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
TV,0.0531,0.001,36.467,0.000,0.050,0.056
radio,0.2188,0.011,20.138,0.000,0.197,0.240
newspaper,0.0239,0.008,3.011,0.003,0.008,0.040


## scikit-learn model 

In [23]:
from sklearn.linear_model import LinearRegression

In [24]:
lm = LinearRegression()
model = lm.fit(X_train, y_train)

In [25]:
model.intercept_

2.979067338122631

In [26]:
model.coef_

array([0.04472952, 0.18919505, 0.00276111])

# Guess

## Model Equation

Sales = 2.97 + TV x 0.04 + radio x 0.18 + newspaper x 0.002

For example, if you spend 30 units on TV, 10 units on radio, 40 units on newspaper, what is the estimated value of sales?

In [27]:
new_data = [[30],[10],[40]]

In [29]:
new_data = pd.DataFrame(new_data).T

In [30]:
model.predict(new_data)



array([6.32334798])

In [32]:
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

In [39]:
rmse_train = np.sqrt(mean_squared_error(y_train, model.predict(X_train)))

In [40]:
rmse_train

1.6447277656443373

In [41]:
rmse_test = np.sqrt(mean_squared_error(y_test, model.predict(X_test)))

In [42]:
rmse_test

1.7815996615334497

# Model Tuning

In [43]:
df.head()

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


In [44]:
X = df.drop('sales', axis = 1)

In [45]:
y = df["sales"]

In [51]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size = 0.20,
                                                    random_state = 99)

In [52]:
lm = LinearRegression()

In [53]:
model = lm.fit(X_train, y_train)

In [54]:
np.sqrt(mean_squared_error(y_train, model.predict(X_train)))

1.723682482265075

In [67]:
np.sqrt(mean_squared_error(y_test, model.predict(X_test)))

1.4312783138301637

In [55]:
model.score(X_train, y_train)

0.890628886292566

In [56]:
cross_val_score(model, X, y, cv = 10, scoring = "r2")

array([0.87302696, 0.8581613 , 0.92968723, 0.89013272, 0.93146498,
       0.93138735, 0.7597901 , 0.91217097, 0.83891753, 0.92882311])

In [57]:
cross_val_score(model, X, y, cv = 10, scoring = "r2").mean()

0.8853562237979616

In [59]:
cross_val_score(model, X_train, y_train, cv = 10, scoring = "r2")

array([0.91245644, 0.92453248, 0.85814127, 0.92005307, 0.8783077 ,
       0.93204044, 0.87656032, 0.5495448 , 0.90023107, 0.79649734])

In [60]:
cross_val_score(model, X_train, y_train, cv = 10, scoring = "r2").mean()

0.8548364931257495

In [65]:
np.sqrt(-cross_val_score(model,
                X_train,
                y_train,
                cv = 10,
                scoring = "neg_mean_squared_error")).mean()

1.774545157598883

In [66]:
np.sqrt(-cross_val_score(model,
                X_test,
                y_test,
                cv = 10,
                scoring = "neg_mean_squared_error")).mean()

1.475233977312039