## Real World Example
[Dataset](https://data.world/exercises/linear-regression-exercise-1)

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
df = pd.read_csv('cancer_reg.csv')
df.head().T

Unnamed: 0,0,1,2,3,4
avganncount,1397,173,102,427,57
avgdeathsperyear,469,70,50,202,26
target_deathrate,164.9,161.3,174.7,194.8,144.4
incidencerate,489.8,411.6,349.7,430.4,350.1
medincome,61898,48127,49348,44243,49955
popest2015,260131,43269,21026,75882,10321
povertypercent,11.2,18.6,14.6,17.1,12.5
studypercap,499.748,23.1112,47.5602,342.637,0
binnedinc,"(61494.5, 125635]","(48021.6, 51046.4]","(48021.6, 51046.4]","(42724.4, 45201]","(48021.6, 51046.4]"
medianage,39.3,33,45,42.8,48.3


In [11]:
X = df[['povertypercent','medincome','medianage','medianagemale','pctemployed16_over','pctunemployed16_over','pctprivatecoverage','pctprivatecoveragealone']]
X.head()
X.shape

0.19986872333442732

In [4]:
y = df['avgdeathsperyear']

In [8]:
for col in X.columns:
    print((col, sum(X[col].isnull())))

('povertypercent', 0)
('medincome', 0)
('medianage', 0)
('medianagemale', 0)
('pctemployed16_over', 152)
('pctunemployed16_over', 0)
('pctprivatecoverage', 0)
('pctprivatecoveragealone', 609)


## Dealing With Null Values
[missingpy](https://pypi.org/project/missingpy/)

In [12]:
from missingpy import MissForest
imputer = MissForest(max_iter=10)
X_imputed = imputer.fit_transform(X)

Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5


In [13]:
col_names = ['povertypercent','medincome','medianage','medianagemale','pctemployed16_over','pctunemployed16_over','pctprivatecoverage','pctprivatecoveragealone']
x = pd.DataFrame(X_imputed, columns= col_names)


In [14]:
from sklearn import preprocessing
x = preprocessing.scale(x)

## Polynomial Regression

In [15]:
from sklearn.preprocessing import PolynomialFeatures
polynomial_features= PolynomialFeatures(degree=3,interaction_only=True, include_bias=True)
x_p = polynomial_features.fit_transform(x)

In [16]:
import statsmodels.api as sm
model = sm.OLS(y, x_p).fit()
ypred = model.predict(x_p) 

In [17]:
model.summary()

0,1,2,3
Dep. Variable:,avgdeathsperyear,R-squared:,0.264
Model:,OLS,Adj. R-squared:,0.241
Method:,Least Squares,F-statistic:,11.51
Date:,"Thu, 31 Oct 2019",Prob (F-statistic):,2.36e-136
Time:,12:09:51,Log-Likelihood:,-22817.0
No. Observations:,3047,AIC:,45820.0
Df Residuals:,2954,BIC:,46380.0
Df Model:,92,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,248.9149,15.916,15.639,0.000,217.708,280.122
x1,196.7033,41.157,4.779,0.000,116.005,277.402
x2,285.3832,35.802,7.971,0.000,215.183,355.583
x3,-25.0732,37.418,-0.670,0.503,-98.440,48.294
x4,77.5618,22.250,3.486,0.000,33.934,121.189
x5,121.5088,26.242,4.630,0.000,70.054,172.963
x6,149.8740,20.502,7.310,0.000,109.675,190.073
x7,-477.0745,63.301,-7.537,0.000,-601.193,-352.956
x8,470.0397,66.523,7.066,0.000,339.603,600.476

0,1,2,3
Omnibus:,5379.334,Durbin-Watson:,1.913
Prob(Omnibus):,0.0,Jarque-Bera (JB):,9719354.917
Skew:,12.149,Prob(JB):,0.0
Kurtosis:,278.618,Cond. No.,1740.0


In [18]:
MSE=sum((ypred-y)**2)/len(y)
print('MSE = ', MSE)

MSE =  187037.8490958749


## Regularization
[Statsmodel_Implementation](https://www.statsmodels.org/stable/generated/statsmodels.regression.linear_model.OLS.fit_regularized.html)

In [34]:
import statsmodels.api as sm
model = sm.OLS(y,x).fit_regularized(method='elastic_net', alpha=1.0, L1_wt=0.0)
ypred = model.predict(x) 

In [32]:
regularized_regression_parameters = model.params
print(regularized_regression_parameters)

[-2.01801494  9.47003903 -1.0248997  -6.56098305  4.63145137  4.233208
  1.38702849  4.8227606 ]


In [33]:
MSE=sum((ypred-y)**2)/len(y)
print('MSE = ', MSE)

MSE =  284197.33541865455
