In [1]:
# Main libraries
import pandas as pd
import numpy as np

# Machine Learning libraries
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split

# Preprocessing libraries
from sklearn.preprocessing import MinMaxScaler

# Visual libraries
import plotly.graph_objs as go

# Statistical library
import statsmodels.api as sm

from statistics import stdev

# Upload data library
from google.colab import drive


pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.



In [None]:
# Attach Google Drive to the Colab filesystem at /content/drive.
# (To detach it again later, use drive.flush_and_unmount.)
drive.mount('/content/drive')

In [None]:
# Advertising data set: sales (in thousands of units) for a particular product
# as a function of advertising budgets (in thousands of dollars).
adv_csv_path = 'drive/MyDrive/Aulas do Doutorado/Ciencia de dados/dados_erico/advertising.csv'
adv = pd.read_csv(adv_csv_path)

# Quick sanity check: dimensions first, then the leading rows
print(adv.shape)
adv.head()

(200, 5)


Unnamed: 0.1,Unnamed: 0,TV,Radio,Newspaper,Sales
0,1,230.1,37.8,69.2,22.1
1,2,44.5,39.3,45.1,10.4
2,3,17.2,45.9,69.3,9.3
3,4,151.5,41.3,58.5,18.5
4,5,180.8,10.8,58.4,12.9


# **Simple Linear Regression**

In [None]:
# Select predictors and target by column name rather than by position, so the
# extraction does not depend on how many index-like columns the CSV carries.
# Column order in X is TV, Radio, Newspaper (relied upon by later cells).
X = adv[['TV', 'Radio', 'Newspaper']].values
y = adv['Sales'].values

In [None]:
# Display the target array (taken from the Sales column of `adv`)
y

array([22.1, 10.4,  9.3, 18.5, 12.9,  7.2, 11.8, 13.2,  4.8, 10.6,  8.6,
       17.4,  9.2,  9.7, 19. , 22.4, 12.5, 24.4, 11.3, 14.6, 18. , 12.5,
        5.6, 15.5,  9.7, 12. , 15. , 15.9, 18.9, 10.5, 21.4, 11.9,  9.6,
       17.4,  9.5, 12.8, 25.4, 14.7, 10.1, 21.5, 16.6, 17.1, 20.7, 12.9,
        8.5, 14.9, 10.6, 23.2, 14.8,  9.7, 11.4, 10.7, 22.6, 21.2, 20.2,
       23.7,  5.5, 13.2, 23.8, 18.4,  8.1, 24.2, 15.7, 14. , 18. ,  9.3,
        9.5, 13.4, 18.9, 22.3, 18.3, 12.4,  8.8, 11. , 17. ,  8.7,  6.9,
       14.2,  5.3, 11. , 11.8, 12.3, 11.3, 13.6, 21.7, 15.2, 12. , 16. ,
       12.9, 16.7, 11.2,  7.3, 19.4, 22.2, 11.5, 16.9, 11.7, 15.5, 25.4,
       17.2, 11.7, 23.8, 14.8, 14.7, 20.7, 19.2,  7.2,  8.7,  5.3, 19.8,
       13.4, 21.8, 14.1, 15.9, 14.6, 12.6, 12.2,  9.4, 15.9,  6.6, 15.5,
        7. , 11.6, 15.2, 19.7, 10.6,  6.6,  8.8, 24.7,  9.7,  1.6, 12.7,
        5.7, 19.6, 10.8, 11.6,  9.5, 20.8,  9.6, 20.7, 10.9, 19.2, 20.1,
       10.4, 11.4, 10.3, 13.2, 25.4, 10.9, 10.1, 16

In [None]:
# Split X into one single-column 2-D array per advertising channel
TV = X[:, 0].reshape(-1, 1)
Radio = X[:, 1].reshape(-1, 1)
News = X[:, 2].reshape(-1, 1)

# Fit one single-feature linear model per channel against the same target
TV_lr = LinearRegression().fit(TV, y)
Radio_lr = LinearRegression().fit(Radio, y)

# Kept as a bare expression on the last line so the cell still displays the estimator
News_lr = LinearRegression()
News_lr.fit(News, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [None]:
# Plot sales against each channel's budget together with its fitted regression line
channels = [
    ('tv', TV, TV_lr),
    ('radio', Radio, Radio_lr),
    ('newspaper', News, News_lr),
]

fig = go.Figure()

# Observed points first, so the trace order stays: three scatters, then three lines
for label, budget, _ in channels:
    fig.add_trace(go.Scatter(x=budget[:, 0], y=y, name=f'{label} train', mode='markers'))

# Fitted values of each single-feature model, drawn as lines
for label, budget, model in channels:
    fig.add_trace(go.Scatter(x=budget[:, 0], y=model.predict(budget), mode='lines', name=f'{label} regression'))

fig.update_layout(
    xaxis={"title": "Budgets"},
    yaxis={"title": "Sales"},
)

fig.show()

In [None]:
# Slope and intercept of each single-feature model, in the order TV, Radio, Newspaper
for model in (TV_lr, Radio_lr, News_lr):
    print(model.coef_[0], model.intercept_)

0.04753664043301975 7.032593549127695
0.20249578339243965 9.311638095158283
0.05469309847227336 12.35140706927816


An additional $1,000 spent on TV advertising is associated with selling approximately 47.5 additional units of the product.

# **Accuracy of the Coefficient**

In [None]:
# How to read the OLS summary table:
# https://jyotiyadav99111.medium.com/statistics-how-should-i-interpret-results-of-ols-3bde1ebeec01

# statsmodels does not add an intercept by itself, hence the explicit constant column
TV2 = sm.add_constant(TV)
TV_OLS = sm.OLS(endog=y, exog=TV2).fit()
print(TV_OLS.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.612
Model:                            OLS   Adj. R-squared:                  0.610
Method:                 Least Squares   F-statistic:                     312.1
Date:                Sat, 17 Apr 2021   Prob (F-statistic):           1.47e-42
Time:                        19:49:52   Log-Likelihood:                -519.05
No. Observations:                 200   AIC:                             1042.
Df Residuals:                     198   BIC:                             1049.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          7.0326      0.458     15.360      0.0

In [None]:
# OLS of sales on the Radio budget, with an explicit intercept column
Radio2 = sm.add_constant(Radio)
Radio_OLS = sm.OLS(endog=y, exog=Radio2).fit()
print(Radio_OLS.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.332
Model:                            OLS   Adj. R-squared:                  0.329
Method:                 Least Squares   F-statistic:                     98.42
Date:                Sat, 17 Apr 2021   Prob (F-statistic):           4.35e-19
Time:                        19:50:01   Log-Likelihood:                -573.34
No. Observations:                 200   AIC:                             1151.
Df Residuals:                     198   BIC:                             1157.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          9.3116      0.563     16.542      0.0

In [None]:
# OLS of sales on the Newspaper budget, with an explicit intercept column
News2 = sm.add_constant(News)
News_OLS = sm.OLS(endog=y, exog=News2).fit()
print(News_OLS.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.052
Model:                            OLS   Adj. R-squared:                  0.047
Method:                 Least Squares   F-statistic:                     10.89
Date:                Sat, 17 Apr 2021   Prob (F-statistic):            0.00115
Time:                        19:50:06   Log-Likelihood:                -608.34
No. Observations:                 200   AIC:                             1221.
Df Residuals:                     198   BIC:                             1227.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         12.3514      0.621     19.876      0.0

# **Accuracy of the Model**

In [None]:
# Residual standard error of each single-feature fit (square root of the
# results' `scale`), printed in the order TV, Radio, Newspaper
for fitted in (TV_OLS, Radio_OLS, News_OLS):
    print(np.sqrt(fitted.scale))

3.258656368650463
4.27494435490106
5.092480366520192


# **Multiple Linear Regression**

In [None]:
# Multiple regression on all predictors in X at once; the summary reports
# both the coefficient estimates and the model-level fit statistics
X2 = sm.add_constant(X)
X2_OLS = sm.OLS(endog=y, exog=X2).fit()
print(X2_OLS.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.897
Model:                            OLS   Adj. R-squared:                  0.896
Method:                 Least Squares   F-statistic:                     570.3
Date:                Sat, 17 Apr 2021   Prob (F-statistic):           1.58e-96
Time:                        19:50:13   Log-Likelihood:                -386.18
No. Observations:                 200   AIC:                             780.4
Df Residuals:                     196   BIC:                             793.6
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          2.9389      0.312      9.422      0.0

In [None]:
# Pairwise correlations between the three budgets and sales
adv.loc[:, ['TV', 'Radio', 'Newspaper', 'Sales']].corr()

Unnamed: 0,TV,Radio,Newspaper,Sales
TV,1.0,0.054809,0.056648,0.782224
Radio,0.054809,1.0,0.354104,0.576223
Newspaper,0.056648,0.354104,1.0,0.228299
Sales,0.782224,0.576223,0.228299,1.0


In [None]:
# Refit using only the first three columns of X2 (the constant plus the first
# two predictors), i.e. leaving out the last predictor
X3 = X2[:, :3]
X3_OLS = sm.OLS(endog=y, exog=X3).fit()
print(X3_OLS.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.897
Model:                            OLS   Adj. R-squared:                  0.896
Method:                 Least Squares   F-statistic:                     859.6
Date:                Sat, 17 Apr 2021   Prob (F-statistic):           4.83e-98
Time:                        13:36:07   Log-Likelihood:                -386.20
No. Observations:                 200   AIC:                             778.4
Df Residuals:                     197   BIC:                             788.3
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          2.9211      0.294      9.919      0.0

# **Ridge Regression**

In [None]:
# Load the crime data set from the mounted Drive. The path must live under
# /content/drive, which is where drive.mount() attaches the Drive in this
# notebook. '?' entries in the file are parsed as missing values.
crime = pd.read_csv('/content/drive/MyDrive/DPES/Data/CommViolPredUnnormalizedData.txt', sep=',', na_values='?')

In [None]:
# Target - ViolentCrimesPerPop: total number of violent crimes per 100K population.
# Keep a positional subset of the columns and discard every row that has a
# missing value in any of them.
# NOTE: this rebinds `crime`, so reload the data before re-running this cell.
columns_to_keep = [5, 6, *range(11, 26), *range(32, 103), 145]

crime = crime.iloc[:, columns_to_keep].dropna()

In [None]:
# Features are every retained column except the target. Dropping the target by
# name (instead of slicing the first 88 columns) keeps this correct even if the
# list of retained columns changes.
X_crime = crime.drop(columns='ViolentCrimesPerPop')
y_crime = crime['ViolentCrimesPerPop']

# Hold out one third of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X_crime, y_crime, test_size = 1/3, random_state = 0)

In [None]:
# Baseline: unregularized linear regression on all training features
lr = LinearRegression().fit(X=X_train, y=y_train)

In [None]:
# Score of the baseline model on the training split, then on the test split
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(lr.score(features, target))

0.6785447201296639
0.5537191295804051


In [None]:
# Coefficients of the linear model fitted on the unscaled training features
lr.coef_

array([ 8.94619849e-04, -1.62199792e+02,  1.64150270e+01, -3.33332483e+01,
       -7.71939606e-01, -1.60912101e+01, -2.16090961e-03,  1.42731544e+00,
       -1.31113975e-02, -6.24606403e+00,  2.22729834e+01, -5.24145307e+00,
        9.93782811e+00, -2.50974967e+00, -7.27196258e+00,  9.71547088e-03,
        4.68720116e-03,  7.77723388e-03, -3.64492557e+00, -1.61157029e+01,
        9.23282132e+00, -2.10340117e+00,  1.78291423e-01,  7.42997644e+00,
       -4.07038405e+00, -2.20663708e+00,  5.64727521e+00,  9.96948583e+00,
        1.93984475e+02,  1.11316580e+01,  1.81201136e+02, -3.80499284e+02,
        2.33997393e+01, -3.46387716e+00, -2.36772638e+01,  5.44851381e+00,
        7.00089174e-01,  4.45768909e+00, -1.06838362e+01, -1.31311319e-02,
        4.79984098e+01, -1.49785985e-04,  3.37797218e-02,  2.32415023e+00,
       -3.03434838e+00,  1.61134157e+00,  1.34056380e+01, -4.60587199e+01,
        2.48178402e+01, -7.30881821e+00, -1.20530459e+00, -4.03336774e+01,
       -2.13056437e+00, -

In [None]:
# Ridge regression (alpha = 20) fitted on the unscaled training features
lr_ridge = Ridge(alpha=20.0).fit(X=X_train, y=y_train)

In [None]:
# Score of the ridge model on the training split, then on the test split
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(lr_ridge.score(features, target))

0.6759501647391476
0.5538877870487899


In [None]:
# Coefficients of the ridge model fitted on the unscaled training features
lr_ridge.coef_

array([ 1.21301626e-03,  7.84554498e+00,  1.31130807e+01, -3.72758098e+01,
        2.96768556e+00, -1.87888441e+01, -2.54993992e-03,  1.56122991e+00,
       -5.65792316e-03, -6.08784447e+00,  1.55861168e+01, -5.37749929e+00,
        8.71328666e+00, -2.41823041e+00, -7.26014440e+00,  7.02801978e-03,
        1.15327580e-04,  8.06611416e-03, -3.36073813e+00, -1.83409408e+01,
        9.55881743e+00, -2.79433490e+00,  3.05723638e+00,  1.05396354e+01,
       -3.99201858e+00, -2.47650825e+00,  5.95800815e+00,  9.03342731e+00,
        2.67991175e+01,  1.15272641e+01, -3.58889793e-01, -3.69626677e+01,
        2.11606815e+01, -2.63592209e+00, -2.12316256e+01,  4.97253809e+00,
        6.50502859e-01,  4.13416796e+00, -1.17247302e+01, -1.30721244e-02,
        4.91505768e+01, -2.00487169e-04,  5.11495872e-01,  1.37366500e+00,
       -2.56677635e+00,  1.29567485e+00, -3.32237285e+00, -2.29802329e+01,
        1.09834279e+01, -1.74161452e+00, -1.29060280e+00, -4.00600750e+01,
        4.26390746e+00, -

In [None]:
# Min-max feature scaling:
#   X_scaled = (X - X.min) / (X.max - X.min)
# The scaler learns min/max from the training split only and the same
# transformation is then applied to both splits.
scaler = MinMaxScaler()
scaler.fit(X_train)

X_train_scaler = scaler.transform(X_train)
X_test_scaler = scaler.transform(X_test)

In [None]:
# Unregularized linear regression refitted on the min-max scaled features
lr_scaler = LinearRegression()
lr_scaler.fit(X_train_scaler, y_train)

# Score on the scaled training split, then on the scaled test split
for features, target in ((X_train_scaler, y_train), (X_test_scaler, y_test)):
    print(lr_scaler.score(features, target))

0.6785464370655274
0.5536999901517023


In [None]:
# Coefficients of the linear model fitted on the min-max scaled training features
lr_scaler.coef_

array([ 3.10914751e+03, -4.81455810e+02,  7.57058588e+02, -1.95141490e+03,
       -4.93493615e+01, -7.46895851e+02, -7.53033346e+03,  1.42140916e+02,
       -1.45387856e+03, -3.50283705e+02,  1.41321779e+02, -4.22163253e+02,
        6.77764827e+02, -6.69593854e+01, -3.06097257e+02,  1.12988028e+03,
        2.69435629e+02,  5.00759483e+03, -1.76985599e+02, -7.49217655e+02,
        6.14982416e+02, -1.51054922e+02,  3.30555081e+00,  4.46012882e+02,
       -1.88483999e+02, -1.12155372e+02,  2.26806909e+02,  5.57734904e+02,
        3.10733392e+03,  6.86682326e+02,  3.65157030e+03, -6.20739468e+03,
        6.07585815e+01, -2.10289047e+02, -1.58677821e+03,  3.76132836e+02,
        4.23434870e+01,  2.85522798e+02, -5.09562116e+02, -2.78008231e+03,
        1.15302170e+03, -2.09612049e+02, -4.71272767e-01,  1.78333160e+02,
       -2.45737565e+02,  1.44720750e+02,  1.56722448e+02, -8.15255108e+02,
        5.87981223e+02, -2.35136153e+02, -1.11927322e+02, -1.46362099e+03,
       -6.77934301e+01, -

In [None]:
# Ridge regression (alpha = 5) on the min-max scaled features
lr_ridge_scaler = Ridge(alpha=5.0)
lr_ridge_scaler.fit(X_train_scaler, y_train)

# Score on the scaled training split, then on the scaled test split
for features, target in ((X_train_scaler, y_train), (X_test_scaler, y_test)):
    print(lr_ridge_scaler.score(features, target))

0.6454722126938968
0.594522771462054


In [None]:
# Lasso regression (alpha = 2, up to 10000 iterations) on the min-max scaled features
lr_lasso = Lasso(alpha=2.0, max_iter=10000).fit(X=X_train_scaler, y=y_train)