In [2]:
import pandas as pd
advertising = pd.read_csv('Advertising.csv', usecols=[1, 2, 3, 4])
df = advertising.copy()
df.head()

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


In [3]:
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict

In [4]:
X = df.drop('sales', axis=1)
y = df['sales']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
X_train.shape

(160, 3)

In [6]:
y_train.shape

(160,)

In [7]:
training = df.copy()

### Modelling with statsmodel

In [8]:
import statsmodels.api as sm;

In [9]:
lm = sm.OLS(y_train, X_train)

In [10]:
model = lm.fit()

In [11]:
model.summary()

0,1,2,3
Dep. Variable:,sales,R-squared:,0.982
Model:,OLS,Adj. R-squared:,0.982
Method:,Least Squares,F-statistic:,2935.0
Date:,"Wed, 17 Jul 2019",Prob (F-statistic):,1.28e-137
Time:,23:32:33,Log-Likelihood:,-336.65
No. Observations:,160,AIC:,679.3
Df Residuals:,157,BIC:,688.5
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
TV,0.0531,0.001,36.467,0.000,0.050,0.056
radio,0.2188,0.011,20.138,0.000,0.197,0.240
newspaper,0.0239,0.008,3.011,0.003,0.008,0.040

0,1,2,3
Omnibus:,11.405,Durbin-Watson:,1.895
Prob(Omnibus):,0.003,Jarque-Bera (JB):,15.574
Skew:,-0.432,Prob(JB):,0.000415
Kurtosis:,4.261,Cond. No.,13.5


In [12]:
model.summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
TV,0.0531,0.001,36.467,0.000,0.050,0.056
radio,0.2188,0.011,20.138,0.000,0.197,0.240
newspaper,0.0239,0.008,3.011,0.003,0.008,0.040


### Modelling with sklearn

In [13]:
from sklearn.linear_model import LinearRegression;

In [14]:
lm = LinearRegression()
model_sk = lm.fit(X_train, y_train)

In [15]:
model_sk.intercept_

2.979067338122629

In [16]:
model_sk.coef_

array([0.04472952, 0.18919505, 0.00276111])

### Prediction
#### Prediction for 30 units of TV, 10 units of radio and 40 units of newspaper
#### Equation for sales from sklearn' model => *** sales = 2.979067338122629 + TV * 0.04472952 + radio * 0.18919505 + newpaper * 0.00276111***

In [17]:
prediction_data = pd.DataFrame([[30], [10], [40]]).T

In [18]:
prediction_data

Unnamed: 0,0,1,2
0,30,10,40


In [19]:
model_sk.predict(prediction_data)

array([6.32334798])

In [20]:
import numpy as np
from sklearn.metrics import mean_squared_error

In [21]:
rmse = np.sqrt(mean_squared_error(y_train, model_sk.predict(X_train)))

In [22]:
rmse

1.644727765644337

In [23]:
rmse_test = np.sqrt(mean_squared_error(y_test, model_sk.predict(X_test)))

In [24]:
rmse_test

1.7815996615334502

## Model tuning

In [60]:
cross_val_score(model_sk, X, y, cv=10, scoring='r2')

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [26]:
-cross_val_score(model_sk, X, y, cv=10, scoring='neg_mean_squared_error')

array([3.56038438, 3.29767522, 2.08943356, 2.82474283, 1.3027754 ,
       1.74163618, 8.17338214, 2.11409746, 3.04273109, 2.45281793])

In [27]:
print('Validated train model error', np.sqrt(-cross_val_score(
    model_sk,
    X_train,
    y_train,
    cv=10,
    scoring='neg_mean_squared_error'
)).mean())

Validated train model error 1.6513523730313335


In [28]:
print('Validated test model error', np.sqrt(-cross_val_score(
    model_sk,
    X_test,
    y_test,
    cv=10,
    scoring='neg_mean_squared_error'
)).mean())

Validated test model error 1.8462778823997095


## PCR (Principal component regression)

In [29]:
hitters = pd.read_csv('Hitters.csv')
df = hitters.copy()

In [30]:
df.dropna(inplace=True)

In [31]:
df.shape

(263, 20)

In [32]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
AtBat,263.0,403.642586,147.307209,19.0,282.5,413.0,526.0,687.0
Hits,263.0,107.828897,45.125326,1.0,71.5,103.0,141.5,238.0
HmRun,263.0,11.619772,8.757108,0.0,5.0,9.0,18.0,40.0
Runs,263.0,54.745247,25.539816,0.0,33.5,52.0,73.0,130.0
RBI,263.0,51.486692,25.882714,0.0,30.0,47.0,71.0,121.0
Walks,263.0,41.114068,21.718056,0.0,23.0,37.0,57.0,105.0
Years,263.0,7.311787,4.793616,1.0,4.0,6.0,10.0,24.0
CAtBat,263.0,2657.543726,2286.582929,19.0,842.5,1931.0,3890.5,14053.0
CHits,263.0,722.186312,648.199644,4.0,212.0,516.0,1054.0,4256.0
CHmRun,263.0,69.239544,82.197581,0.0,15.0,40.0,92.5,548.0


In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 263 entries, 1 to 321
Data columns (total 20 columns):
AtBat        263 non-null int64
Hits         263 non-null int64
HmRun        263 non-null int64
Runs         263 non-null int64
RBI          263 non-null int64
Walks        263 non-null int64
Years        263 non-null int64
CAtBat       263 non-null int64
CHits        263 non-null int64
CHmRun       263 non-null int64
CRuns        263 non-null int64
CRBI         263 non-null int64
CWalks       263 non-null int64
League       263 non-null object
Division     263 non-null object
PutOuts      263 non-null int64
Assists      263 non-null int64
Errors       263 non-null int64
Salary       263 non-null float64
NewLeague    263 non-null object
dtypes: float64(1), int64(16), object(3)
memory usage: 43.1+ KB


In [39]:
df.head()

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,Salary,NewLeague
1,315,81,7,24,38,39,14,3449,835,69,321,414,375,N,W,632,43,10,475.0,N
2,479,130,18,66,72,76,3,1624,457,63,224,266,263,A,W,880,82,14,480.0,A
3,496,141,20,65,78,37,11,5628,1575,225,828,838,354,N,E,200,11,3,500.0,N
4,321,87,10,39,42,30,2,396,101,12,48,46,33,N,E,805,40,4,91.5,N
5,594,169,4,74,51,35,11,4408,1133,19,501,336,194,A,W,282,421,25,750.0,A


In [42]:
df[['NewLeague', 'League', 'Division']].head()

Unnamed: 0,NewLeague,League,Division
1,N,N,W
2,A,A,W
3,N,N,E
4,N,N,E
5,A,A,W


In [47]:
dummies = pd.get_dummies(df[['NewLeague', 'League', 'Division']])
dummies.head()

Unnamed: 0,NewLeague_A,NewLeague_N,League_A,League_N,Division_E,Division_W
1,0,1,0,1,0,1
2,1,0,1,0,0,1
3,0,1,0,1,1,0
4,0,1,0,1,1,0
5,1,0,1,0,0,1


In [48]:
y = df['Salary']

In [54]:
X_ = df.drop(['NewLeague', 'League', 'Division'], axis=1)
X_.head(3)

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,PutOuts,Assists,Errors,Salary
1,315,81,7,24,38,39,14,3449,835,69,321,414,375,632,43,10,475.0
2,479,130,18,66,72,76,3,1624,457,63,224,266,263,880,82,14,480.0
3,496,141,20,65,78,37,11,5628,1575,225,828,838,354,200,11,3,500.0


In [57]:
X = pd.concat([X_ ,dummies[['NewLeague_A', 'League_A', 'Division_E']]], axis=1)

In [58]:
X.head(2)

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,PutOuts,Assists,Errors,Salary,NewLeague_A,League_A,Division_E
1,315,81,7,24,38,39,14,3449,835,69,321,414,375,632,43,10,475.0,0,0,0
2,479,130,18,66,72,76,3,1624,457,63,224,266,263,880,82,14,480.0,1,1,0


### Create splitted test and train data

In [103]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [104]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

In [116]:
pca = PCA()

In [106]:
X_reduced_train = pca.fit_transform(scale(X_train))

In [107]:
X_reduced_train[2,:3]

array([ 1.69783368, -1.70777988, -1.67952504])

### The function call below is explanation power of variance for each variable with the variables before it.

In [108]:
np.cumsum(np.round(pca.explained_variance_ratio_, decimals=4) * 100)

array([ 38.72,  59.43,  69.92,  77.6 ,  82.78,  86.94,  90.42,  93.11,
        95.15,  96.55,  97.44,  98.13,  98.77,  99.24,  99.54,  99.78,
        99.92,  99.97,  99.99, 100.  ])

In [109]:
lm = LinearRegression()
pcr_model = lm.fit(X_reduced_train, y_train)

In [110]:
pcr_model.coef_

array([115.92142305, -28.91578191,  31.93922498, -52.00296864,
        95.67836544,  90.66899253,  71.51687446, -72.5930174 ,
       387.34751162, -16.36884117,  -8.08426435, -18.65583613,
       -12.51939607,  -4.70261095,  15.79271544,  -2.55643772,
       -28.62267876,  11.75082783,  -5.73121032,  -0.73038188])

In [111]:
pcr_model.intercept_

543.4834416243655

In [143]:
y_predictions = pcr_model.predict(X_reduced_train)

In [144]:
np.sqrt(mean_squared_error(y_train, y_predictions)) 

5.786435718659358e-13