In [8]:
import pandas as pd
advertising = pd.read_csv('Advertising.csv', usecols=[1, 2, 3, 4])
df = advertising.copy()
df.head()

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


In [9]:
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict

In [10]:
X = df.drop('sales', axis=1)
y = df['sales']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
X_train.shape

(160, 3)

In [12]:
y_train.shape

(160,)

In [13]:
training = df.copy()

### Modelling with statsmodel

In [14]:
import statsmodels.api as sm;

In [15]:
lm = sm.OLS(y_train, X_train)

In [16]:
model = lm.fit()

In [17]:
model.summary()

0,1,2,3
Dep. Variable:,sales,R-squared:,0.982
Model:,OLS,Adj. R-squared:,0.982
Method:,Least Squares,F-statistic:,2935.0
Date:,"Wed, 17 Jul 2019",Prob (F-statistic):,1.28e-137
Time:,00:16:35,Log-Likelihood:,-336.65
No. Observations:,160,AIC:,679.3
Df Residuals:,157,BIC:,688.5
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
TV,0.0531,0.001,36.467,0.000,0.050,0.056
radio,0.2188,0.011,20.138,0.000,0.197,0.240
newspaper,0.0239,0.008,3.011,0.003,0.008,0.040

0,1,2,3
Omnibus:,11.405,Durbin-Watson:,1.895
Prob(Omnibus):,0.003,Jarque-Bera (JB):,15.574
Skew:,-0.432,Prob(JB):,0.000415
Kurtosis:,4.261,Cond. No.,13.5


In [18]:
model.summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
TV,0.0531,0.001,36.467,0.000,0.050,0.056
radio,0.2188,0.011,20.138,0.000,0.197,0.240
newspaper,0.0239,0.008,3.011,0.003,0.008,0.040


### Modelling with sklearn

In [19]:
from sklearn.linear_model import LinearRegression;

In [20]:
lm = LinearRegression()
model_sk = lm.fit(X_train, y_train)

In [21]:
model_sk.intercept_

2.979067338122629

In [22]:
model_sk.coef_

array([0.04472952, 0.18919505, 0.00276111])

### Prediction
#### Prediction for 30 units of TV, 10 units of radio and 40 units of newspaper
#### Equation for sales from sklearn' model => *** sales = 2.979067338122629 + TV * 0.04472952 + radio * 0.18919505 + newpaper * 0.00276111***

In [23]:
prediction_data = pd.DataFrame([[30], [10], [40]]).T

In [24]:
prediction_data

Unnamed: 0,0,1,2
0,30,10,40


In [25]:
model_sk.predict(prediction_data)

array([6.32334798])

In [26]:
import numpy as np
from sklearn.metrics import mean_squared_error

In [27]:
rmse = np.sqrt(mean_squared_error(y_train, model_sk.predict(X_train)))

In [28]:
rmse

1.644727765644337

In [29]:
rmse_test = np.sqrt(mean_squared_error(y_test, model_sk.predict(X_test)))

In [30]:
rmse_test

1.7815996615334502

## Model tuning

In [32]:
cross_val_score(model_sk, X, y, cv=10, scoring='r2')

array([0.87302696, 0.8581613 , 0.92968723, 0.89013272, 0.93146498,
       0.93138735, 0.7597901 , 0.91217097, 0.83891753, 0.92882311])

In [34]:
-cross_val_score(model_sk, X, y, cv=10, scoring='neg_mean_squared_error')

array([3.56038438, 3.29767522, 2.08943356, 2.82474283, 1.3027754 ,
       1.74163618, 8.17338214, 2.11409746, 3.04273109, 2.45281793])

In [56]:
print('Validated train model error', np.sqrt(-cross_val_score(
    model_sk,
    X_train,
    y_train,
    cv=10,
    scoring='neg_mean_squared_error'
)).mean())

Validated train model error 1.6513523730313335


In [54]:
print('Validated test model error', np.sqrt(-cross_val_score(
    model_sk,
    X_test,
    y_test,
    cv=10,
    scoring='neg_mean_squared_error'
)).mean())

Validated test model error 1.8462778823997095
