In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error

In [None]:
# Load advertising.csv (resolved relative to the working directory) into `data`.
# Later cells read its 'TV', 'Radio', 'Newspaper' and 'Sales' columns.
data=pd.read_csv('advertising.csv')

In [None]:
# Print a concise summary of `data`: column names, dtypes and non-null counts.
data.info()

In [None]:
# Grid of pairwise scatter plots (with per-column distributions on the diagonal)
# for every numeric column of `data`.
sns.pairplot(data)
# tight_layout acts on the current figure, i.e. the pairplot grid created above.
plt.tight_layout()
plt.show()

In [None]:
# Annotated correlation matrix of the advertising columns.
# `linewidths` is the keyword seaborn.heatmap documents for the cell borders;
# the singular `linewidth` only works as an alias forwarded to matplotlib.
sns.heatmap(data.corr(),annot=True,cmap='coolwarm',linewidths=2)
plt.title('Correlation matrix - advertising data')
plt.show()

In [None]:
# One scatter panel per advertising channel, each plotted against Sales.
fig,ax=plt.subplots(1,3,figsize=(18,8))
for panel,channel in zip(ax,('TV','Radio','Newspaper')):
    sns.scatterplot(data=data,x=channel,y='Sales',ax=panel)
    panel.set_title(channel+' Vs. Sales')
plt.show()

In [None]:
# Box plot of every numeric column of `data` to eyeball spread and outliers.
# Pass the frame by keyword: older seaborn releases bind the first positional
# argument to `x`, not `data`.
sns.boxplot(data=data)
plt.title('Distribution of advertising columns')
plt.show()

In [None]:
# Predictors: every column except the target.
X=data.drop(columns='Sales')
# Target: the Sales column as a Series.
Y=data.loc[:,'Sales']

In [None]:
!pip install statsmodels

In [None]:
# Sanity check: predictor matrix and target must have the same number of rows.
print(X.shape,Y.shape,sep='\n')

In [None]:
import statsmodels.api as sm

# statsmodels does not add an intercept on its own, so prepend a constant column.
X_const=sm.add_constant(X)
# Ordinary least squares on all rows (no train/test split at this stage).
model=sm.OLS(endog=Y,exog=X_const).fit()
# In-sample fitted values and residuals, reused by the diagnostic plots below.
residuals=model.resid
predictions=model.predict(X_const)
# Last expression: render the full regression summary table.
model.summary()

R-squared: 90.3% of the variance in Sales is explained by the model.
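Adjusted R-squared: 90.1%. It penalizes R-squared for the number of predictors, so models with different numbers of predictors (or different sample sizes) can be compared fairly. It is also used to check for overfitting: a large gap between R-squared and adjusted R-squared suggests unnecessary predictors.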
F-statistic: 605.4 with p-value < 0.05, so the model as a whole is statistically significant.
$$\text{Sales} = 4.6251 + 0.0544\,\text{TV} + 0.1070\,\text{Radio} + 0.0003\,\text{Newspaper}$$
A feature is statistically significant for predicting Sales when its t-statistic is large in absolute value and its p-value is < 0.05. Therefore TV and Radio are statistically significant; Newspaper is not.

Model Diagnostics:
Omnibus and Jarque-Bera: residuals are not normally distributed.
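Durbin-Watson: 2.25. This statistic checks the residuals for autocorrelation: a value of 2 means no autocorrelation, < 2 positive autocorrelation, > 2 negative autocorrelation.
There is no notable autocorrelation, since the value is close to 2.
Cond. No.: 454. A value > 30 suggests multicollinearity.
There exists multicollinearity (the condition number is sensitive to feature scale, so this should be confirmed with VIF).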

In [None]:
# Normality check of the OLS residuals: QQ plot on the left, histogram on the right.
fig,ax=plt.subplots(nrows=1,ncols=2,figsize=(18,8))

# fit=True standardises the residuals, so the 45-degree line is the reference
# for a normal distribution.
sm.qqplot(residuals,line='45',fit=True,ax=ax[0])
ax[0].set(title='QQ Plot')

# Histogram with a kernel density estimate overlaid.
sns.histplot(residuals,kde=True,ax=ax[1])
ax[1].set(title='Histogram')

plt.show()

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train,X_test,Y_train,Y_test=train_test_split(
    X,Y,test_size=0.2,random_state=42)
# One shape per line: train/test predictors, then train/test targets.
print(X_train.shape,X_test.shape,Y_train.shape,Y_test.shape,sep='\n')

In [None]:
# Fit ordinary least squares on the training split (fit() returns the estimator itself).
model_lin=LinearRegression().fit(X_train,Y_train)
# Predictions for the held-out rows, scored in the following cells.
y_pred=model_lin.predict(X_test)

In [None]:
# Fitted intercept of the scikit-learn linear model (trained on the training split only).
model_lin.intercept_

In [None]:
# Fitted coefficients, one per predictor, in the column order of X_train.
model_lin.coef_

In [None]:
# Score the linear model on the held-out split.
r2=r2_score(Y_test,y_pred)
mse=mean_squared_error(Y_test,y_pred)
mae=mean_absolute_error(Y_test,y_pred)

# Report in the same order: R squared, then MSE, then MAE.
print('R Squared is',r2)
print('MSE',mse)
print('MAE',mae)

In [None]:
# RMSE is the square root of the MSE, which puts the error back in the units of Sales.
# Relies on `mse` still holding the linear-regression test MSE from the metrics cell.
rmse=np.sqrt(mse)
print('RMSE',rmse)

In [None]:
# Residuals vs. fitted values for the statsmodels OLS fit on all rows
# (`predictions` / `residuals` come from that model, not from `model_lin`).
# A patternless band around zero supports linearity and constant variance.
resid_ax=sns.scatterplot(x=predictions,y=residuals)
resid_ax.axhline(y=0,color='r',linestyle='--')
# Both inputs are unnamed Series, so label the axes explicitly.
resid_ax.set(xlabel='Fitted Sales',ylabel='Residual',title='Residuals vs. Fitted')
plt.show()

In [None]:
#K fold cross validation for limited dataset
from sklearn.model_selection import cross_val_score,cross_val_predict
# scikit-learn scorers follow "greater is better", so MSE comes back negated.
cv_score=cross_val_score(LinearRegression(),X,Y,cv=5,
                         scoring='neg_mean_squared_error')
#other scoring options -> r2 or explained_variance
print(cv_score) # 5 scores, one per fold (negative MSE)
print(cv_score.mean())
# Flip the sign to report the mean MSE on its natural, positive scale.
print('Mean CV MSE',-cv_score.mean())

In [None]:
# Out-of-fold predictions from 10-fold CV: each row is predicted by a model
# fitted on the folds that do not contain it.
cv_pred=cross_val_predict(LinearRegression(),X,Y,cv=10)
# One prediction per row of X.
cv_pred.shape

In [None]:
from sklearn.linear_model import Ridge,Lasso
# Ridge adds an L2 penalty that shrinks coefficients; used here against multicollinearity.
model_ridge=Ridge(alpha=1.0)
model_ridge.fit(X_train,Y_train)
y_pred_ridge=model_ridge.predict(X_test)

# Model-specific names so the linear-regression `r2` / `mse` are not overwritten.
r2_ridge=r2_score(Y_test,y_pred_ridge)
print(r2_ridge)
mse_ridge=mean_squared_error(Y_test,y_pred_ridge)
print(mse_ridge)

# Predicted vs. actual Sales on the held-out split.
ridge_ax=sns.scatterplot(x=y_pred_ridge,y=Y_test,color='red',label='Ridge Regression')
ridge_ax.set(xlabel='Predicted Sales',ylabel='Actual Sales',title='Ridge: predicted vs. actual')
plt.show()

In [None]:
#high dimension data and perform feature selection by penalizing not important feature
#Lasso: the L1 penalty can drive the coefficients of unhelpful features to exactly zero
model_lasso=Lasso(alpha=0.01)
model_lasso.fit(X_train,Y_train)
y_pred_lasso=model_lasso.predict(X_test)

# Model-specific names so the linear-regression `r2` / `mse` are not overwritten.
r2_lasso=r2_score(Y_test,y_pred_lasso)
print(r2_lasso)
mse_lasso=mean_squared_error(Y_test,y_pred_lasso)
print(mse_lasso)

# Predicted vs. actual Sales on the held-out split.
lasso_ax=sns.scatterplot(x=y_pred_lasso,y=Y_test,color='blue',label='Lasso')
lasso_ax.set(xlabel='Predicted Sales',ylabel='Actual Sales',title='Lasso: predicted vs. actual')
plt.show()

FEATURE SELECTION REGRESSION

In [None]:
import pandas as pd
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.feature_selection import f_regression,RFE

In [None]:
# Load Admission_Predict.csv (resolved relative to the working directory) into `df`.
df=pd.read_csv('Admission_Predict.csv')
# Show the raw column labels so stray whitespace in them can be spotted.
df.columns

In [None]:
# Summarise the admissions frame just loaded into `df`
# (`data` is the advertising frame from the first part of the notebook).
df.info()

In [None]:
# Clean the column labels: strip leading/trailing whitespace so columns can be
# addressed by their plain names (e.g. 'Chance of Admit') in later cells.
df.columns=df.columns.str.strip()
df.columns

In [None]:
# Preview the first rows of the admissions data.
df.head()

In [None]:
import statsmodels.api as sm

# Predictors: every column except the target.
# NOTE(review): if the CSV carries a row-identifier column (e.g. a serial number)
# it is still part of X here - confirm whether it should be a predictor.
X=df.drop(columns='Chance of Admit')
# Add the intercept column before computing VIFs.
X_const=sm.add_constant(X)

# One variance inflation factor per column of X_const (constant included).
vif_features=pd.DataFrame({
    'features':X_const.columns,
    'VIF':[variance_inflation_factor(X_const.values,col_idx)
           for col_idx in range(X_const.shape[1])],
})
vif_features

In [None]:
from sklearn.model_selection import train_test_split

# Target for the admissions data.
Y=df['Chance of Admit']
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train,X_test,Y_train,Y_test=train_test_split(
    X,Y,test_size=0.2,random_state=42)

# One shape per line: train/test predictors, then train/test targets.
print(X_train.shape,X_test.shape,Y_train.shape,Y_test.shape,sep='\n')

In [None]:
from sklearn.linear_model import LinearRegression

# Filter method: univariate F-test of each feature against the target.
f_score,p_value=f_regression(X_train,Y_train)

# Wrapper method: recursive feature elimination down to 5 features
# (fit() returns the fitted selector itself).
rfe=RFE(estimator=LinearRegression(),n_features_to_select=5).fit(X_train,Y_train)

# Put both views side by side, one row per feature.
features_selected=pd.DataFrame(
    {
        'Features':X_train.columns,
        'F_Score':f_score,
        'P_Value':p_value,
        'RFE Ranking':rfe.ranking_,
    }
)
# Rank 1 marks the features RFE kept.
features_selected.sort_values('RFE Ranking')

In [None]:
# Feature names followed by the boolean mask of features RFE kept, in the same order.
print(X_train.columns,rfe.support_,sep='\n')

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Annotated correlation matrix of the admissions columns.
sns.heatmap(
    data=df.corr(),
    cmap='coolwarm',
    annot=True,
    linewidths=1,
)
plt.show()