In [None]:
!pip install statsmodels

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
%matplotlib inline

In [None]:
# Read the advertising data from a CSV file in the working directory and preview the first rows.
df = pd.read_csv("Advertising.csv")
df.head()

In [None]:
# Remove the 'Unnamed: 0' column (presumably the row index saved with the CSV - verify).
# Reassigning with errors='ignore' instead of dropping in place keeps this cell
# re-runnable: a second execution no longer raises KeyError once the column is gone.
df = df.drop(columns='Unnamed: 0', errors='ignore')
df.head()

In [None]:
# (rows, columns) of the data frame.
df.shape

In [None]:
# Target is the 'sales' column; the features are every remaining column.
y = df['sales']
X = df.drop(columns=['sales'])

# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=4,
)
(X_train.shape, X_test.shape)

In [None]:
# Fit an ordinary least-squares model on the training split.
# `fit` returns the estimator itself, so it can be chained and then displayed.
lr = LinearRegression().fit(X_train, y_train)
lr

In [None]:
# Coefficients of the fitted linear model.
lr.coef_

In [None]:
# Intercept term of the fitted linear model.
lr.intercept_

In [None]:
# First five rows of the held-out features.
X_test.head(5)

In [None]:
# Predictions of the linear model for the held-out features.
y_pred_test = lr.predict(X_test)

In [None]:
# First five predictions, for comparison with the rows previewed from X_test.
y_pred_test[:5]

# Evaluation of Linear Regression

In [None]:
# Error metrics for the test-set predictions:
#   MSE  - mean squared error
#   RMSE - root mean squared error (square root of the MSE)
#   MAE  - mean absolute error
# r2_score is imported here as well because the R2 cells below use it.

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
print(f"MSE is {mean_squared_error(y_test,y_pred_test)}")
# RMSE is computed as sqrt(MSE) instead of passing `squared=False`: that flag is
# deprecated in recent scikit-learn releases and removed in newer ones.
print(f"RMSE is {np.sqrt(mean_squared_error(y_test,y_pred_test))}")
print(f"MAE is {mean_absolute_error(y_test,y_pred_test)}")

In [None]:
# R2 score: the proportion of the target's variance that is explained by the given features.
print(f"R2-score is {r2_score(y_test,y_pred_test)}")

In [None]:
# Regularization of Linear Regression
# Lasso regularization (L1):
#   adds the absolute value of the magnitude of each coefficient as a penalty
#   term to the loss function, which tends to push coefficients to exactly zero.
from sklearn.linear_model import Lasso

# Fit with the estimator's default settings, then score on the held-out split.
las = Lasso().fit(X_train, y_train)
y_pred_test = las.predict(X_test)
print(f"R2-score is {r2_score(y_test,y_pred_test)}")

In [None]:
# Ridge Regularization (L2)
# Adds the squared magnitude of the coefficients as a penalty term to the loss function.
# Shrinks coefficients towards zero but does not, in general, set them exactly to zero.
from sklearn.linear_model import Ridge
ridge = Ridge()
ridge.fit(X_train,y_train)
# Predict with the Ridge model fitted above; predicting with the earlier Lasso
# model `las` would just report the Lasso score again.
y_pred_test = ridge.predict(X_test)
print(f"R2-score is {r2_score(y_test,y_pred_test)}")

In [None]:
# ElasticNet regularization (L1 + L2):
#   combines the absolute-value (L1) penalty and the squared-magnitude (L2)
#   penalty on the coefficients in a single loss function. Because of the L1
#   part, coefficients can still be driven to exactly zero.
from sklearn.linear_model import ElasticNet

# Fit with the estimator's default settings, then score on the held-out split.
elasticnet = ElasticNet().fit(X_train, y_train)
y_pred_test = elasticnet.predict(X_test)
print(f"R2-score is {r2_score(y_test,y_pred_test)}")

In [None]:
# Assumptions of Linear Regression
# Assumption: linear relationship between each feature and the target.
# Preview the data again as a reminder of the available columns.
df.head()

In [None]:
import seaborn as sns

# One scatter panel per feature against 'sales' to eyeball the linearity assumption.
sns.pairplot(
    data=df,
    x_vars=['TV', 'radio', 'newspaper'],
    y_vars=['sales'],
)
plt.show()

In [None]:
# Pairwise correlation matrix of the data frame's columns.
corr = df.corr()
corr

In [None]:
# Assumption: the residuals have a mean of (approximately) zero.
# The residuals are computed from the plain linear model `lr` directly, because
# `y_pred_test` is reassigned by the regularised-model cells above and therefore
# no longer holds the linear-regression predictions at this point.
residual = y_test - lr.predict(X_test)
np.mean(residual)

In [None]:
# Assumption: the error terms are approximately normally distributed.
# `sns.distplot` is deprecated; `histplot` with a KDE overlay is its replacement,
# and stat="density" keeps the y-axis on the density scale distplot used.
sns.histplot(residual, kde=True, stat="density")
plt.show()

In [None]:
# Multicollinearity check
# VIF (variance inflation factor) score per feature

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
def vif_score(X, add_intercept=True):
    """Compute the variance inflation factor (VIF) of every column of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; one VIF is reported per column.
    add_intercept : bool, default True
        If True, a column of ones is prepended to the design matrix before the
        VIFs are computed. ``variance_inflation_factor`` regresses each column
        on the other columns exactly as given, so without an intercept column
        the auxiliary regressions are uncentred and the scores are inflated
        for features whose mean is not zero. Pass False to use ``X`` as is.

    Returns
    -------
    pandas.DataFrame
        Columns ``'vif_score'`` and ``'Features'``, one row per column of ``X``
        (the intercept column, when added, is not reported).
    """
    exog = X.values
    offset = 0
    if add_intercept:
        # Prepend the constant so feature i sits at position i + 1.
        exog = np.column_stack([np.ones(exog.shape[0]), exog])
        offset = 1
    scores = [
        variance_inflation_factor(exog, position + offset)
        for position in range(X.shape[1])
    ]
    # Build the result in one step; the local name no longer shadows the function.
    return pd.DataFrame({'vif_score': scores, 'Features': list(X.columns)})

In [None]:
# VIF score of every feature column.
vif_score(X)

In [None]:
# Rule of thumb used here:
# VIF score >  4 --> multicollinearity in the dataset (consider dropping the column)
# VIF score <= 4 --> no problematic multicollinearity in the dataset