In [49]:
import pandas as pd

import pandas as pd
import numpy as np
import seaborn as sns
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict

ad = pd.read_csv("../input/advertissing/Advertising.csv", usecols = [1,2,3,4])

df =  ad.copy()
df.head()

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


In [34]:
X = df.drop("sales", axis=1) # we choose independent variables

y = df["sales"]  # we took our dependent variable

# divided our dataset into test and training
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size =0.20, random_state =42)


In [35]:
X_train.shape

(160, 3)

In [36]:
y_train.shape

(160,)

In [37]:
X_test.shape

(40, 3)

In [38]:
y_test.shape

(40,)

In [39]:
training = df.copy() 
training.shape

(200, 4)

## Stats Models

In [40]:
lm = sm.OLS(y_train,X_train) # create model in train dataset

In [41]:
model = lm.fit() # fitted model
model.summary()

0,1,2,3
Dep. Variable:,sales,R-squared (uncentered):,0.982
Model:,OLS,Adj. R-squared (uncentered):,0.982
Method:,Least Squares,F-statistic:,2935.0
Date:,"Sat, 24 Apr 2021",Prob (F-statistic):,1.28e-137
Time:,14:49:26,Log-Likelihood:,-336.65
No. Observations:,160,AIC:,679.3
Df Residuals:,157,BIC:,688.5
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
TV,0.0531,0.001,36.467,0.000,0.050,0.056
radio,0.2188,0.011,20.138,0.000,0.197,0.240
newspaper,0.0239,0.008,3.011,0.003,0.008,0.040

0,1,2,3
Omnibus:,11.405,Durbin-Watson:,1.895
Prob(Omnibus):,0.003,Jarque-Bera (JB):,15.574
Skew:,-0.432,Prob(JB):,0.000415
Kurtosis:,4.261,Cond. No.,13.5


In [42]:
model.summary().tables[1]
# we have coefficient each independent variable

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
TV,0.0531,0.001,36.467,0.000,0.050,0.056
radio,0.2188,0.011,20.138,0.000,0.197,0.240
newspaper,0.0239,0.008,3.011,0.003,0.008,0.040


 ## Scikit-learn model

In [43]:

lm = LinearRegression()
model = lm.fit(X_train, y_train)

In [44]:
model.intercept_ # constant coefficent 

2.979067338122629

In [45]:
model.coef_ # ıt gives us in order TV , radio and newspaper coefficient

array([0.04472952, 0.18919505, 0.00276111])

### Prediction 

Model Equation:

Sales = 2.97 + TV0.04 + radio0.18 + newspaper*0.002

30 units of TV expenditure, 10 units of TV expenditure, 40 units of TV expenditure

In [46]:

new_data = [[30], [10], [40]]
new_data = pd.DataFrame(new_data).T

In [47]:
model.predict(new_data) # predicting y values according to given new_data

array([6.32334798])

In [53]:
rmse_train = np.sqrt(mean_squared_error(y_train, model.predict(X_train)))
# we find the train error

In [54]:
rmse

1.644727765644337

In [55]:
rmse_rest = np.sqrt(mean_squared_error(y_test, model.predict(X_test)))

In [56]:
rmse_rest

1.78159966153345

### Model Tunnig

In [58]:
df.head()

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


In [61]:
X = df.drop('sales', axis=1)
y = df["sales"]
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.20, 
                                                    random_state=144)
# A certain part of the model will be selected, but we do not know which part will be chosen. we are going to use cross validation
lm = LinearRegression() 
model = lm.fit(X_train, y_train)

In [62]:
np.sqrt(mean_squared_error(y_train, model.predict(X_train)))
# train error

1.6748559274650712

In [63]:
np.sqrt(mean_squared_error(y_test, model.predict(X_test)))
#test error

1.6640263686701031

In [64]:
model.score(X_train, y_train)
# R-squared

0.8971614078663419

In [70]:
cross_val_score(model, X_train, y_train, cv =10, scoring = "r2").mean()
# We cross validated our model R-squared value by dividing it into 10 layers

0.8733783298422942

In [71]:
np.sqrt(-cross_val_score(model, 
                X_train, 
                y_train, 
                cv =10, 
                scoring = "neg_mean_squared_error")).mean()
# We cross validated our model  train error by dividing it into 10 layers

1.6649345607872932

In [72]:
np.sqrt(-cross_val_score(model, 
                X_test, 
                y_test, 
                cv =10, 
                scoring = "neg_mean_squared_error")).mean()
 # We cross validated our model  test error by dividing it into 10 layers

1.7399924960346644