## Real World Example
[Dataset](https://data.world/exercises/linear-regression-exercise-1)

In [162]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [163]:
df = pd.read_csv('cancer_reg.csv')
df.head().T

Unnamed: 0,0,1,2,3,4
avganncount,1397,173,102,427,57
avgdeathsperyear,469,70,50,202,26
target_deathrate,164.9,161.3,174.7,194.8,144.4
incidencerate,489.8,411.6,349.7,430.4,350.1
medincome,61898,48127,49348,44243,49955
popest2015,260131,43269,21026,75882,10321
povertypercent,11.2,18.6,14.6,17.1,12.5
studypercap,499.748,23.1112,47.5602,342.637,0
binnedinc,"(61494.5, 125635]","(48021.6, 51046.4]","(48021.6, 51046.4]","(42724.4, 45201]","(48021.6, 51046.4]"
medianage,39.3,33,45,42.8,48.3


In [164]:
X = df[['povertypercent','medincome','medianage','medianagemale','pctemployed16_over','pctunemployed16_over','pctprivatecoverage','pctprivatecoveragealone']]
X.head()

Unnamed: 0,povertypercent,medincome,medianage,medianagemale,pctemployed16_over,pctunemployed16_over,pctprivatecoverage,pctprivatecoveragealone
0,11.2,61898,39.3,36.9,51.9,8.0,75.1,
1,18.6,48127,33.0,32.2,55.9,7.8,70.2,53.8
2,14.6,49348,45.0,44.0,45.9,7.0,63.7,43.5
3,17.1,44243,42.8,42.2,48.3,12.1,58.4,40.3
4,12.5,49955,48.3,47.8,48.2,4.8,61.6,43.9


In [165]:
y = df['avgdeathsperyear']

In [166]:
for col in X.columns:
    print((col, sum(X[col].isnull())))

('povertypercent', 0)
('medincome', 0)
('medianage', 0)
('medianagemale', 0)
('pctemployed16_over', 152)
('pctunemployed16_over', 0)
('pctprivatecoverage', 0)
('pctprivatecoveragealone', 609)


## Dealing With Null Values
[missingpy](https://pypi.org/project/missingpy/)

In [167]:
from missingpy import MissForest
imputer = MissForest(max_iter=10)
X_imputed = imputer.fit_transform(X)

Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4


In [222]:
col_names = ['povertypercent','medincome','medianage','medianagemale','pctemployed16_over','pctunemployed16_over','pctprivatecoverage','pctprivatecoveragealone']
x = pd.DataFrame(X_imputed, columns= col_names)


In [227]:
from sklearn import preprocessing
x = preprocessing.scale(x)

## Polynomial Regression

In [223]:
from sklearn.preprocessing import PolynomialFeatures
polynomial_features= PolynomialFeatures(degree=3,interaction_only=True, include_bias=True)
x_p = polynomial_features.fit_transform(x)

In [224]:
import statsmodels.api as sm
model = sm.OLS(y, x_p).fit()
ypred = model.predict(x_p) 

In [225]:
model.summary()

0,1,2,3
Dep. Variable:,avgdeathsperyear,R-squared:,0.265
Model:,OLS,Adj. R-squared:,0.242
Method:,Least Squares,F-statistic:,11.56
Date:,"Thu, 31 Oct 2019",Prob (F-statistic):,4.98e-137
Time:,10:39:01,Log-Likelihood:,-22816.0
No. Observations:,3047,AIC:,45820.0
Df Residuals:,2954,BIC:,46380.0
Df Model:,92,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,248.5048,15.892,15.637,0.000,217.344,279.666
x1,194.3330,40.678,4.777,0.000,114.573,274.093
x2,285.0924,35.713,7.983,0.000,215.068,355.117
x3,-22.5155,35.461,-0.635,0.526,-92.047,47.016
x4,76.5344,22.396,3.417,0.001,32.622,120.447
x5,118.8054,26.103,4.551,0.000,67.623,169.988
x6,149.1568,20.421,7.304,0.000,109.116,189.198
x7,-479.2555,64.001,-7.488,0.000,-604.746,-353.765
x8,471.8836,67.358,7.006,0.000,339.811,603.956

0,1,2,3
Omnibus:,5367.78,Durbin-Watson:,1.913
Prob(Omnibus):,0.0,Jarque-Bera (JB):,9619846.742
Skew:,12.093,Prob(JB):,0.0
Kurtosis:,277.202,Cond. No.,1740.0


In [226]:
MSE=sum((ypred-y)**2)/len(y)
print('MSE = ', MSE)

MSE =  186822.91683157967


## Regularization
[Statsmodel_Implementation](https://www.statsmodels.org/stable/generated/statsmodels.regression.linear_model.OLS.fit_regularized.html)

In [219]:
import statsmodels.api as sm
model = sm.OLS(y,x).fit_regularized(method='elastic_net', alpha=1.0, L1_wt=0.8)
ypred = model.predict(x) 

In [220]:
regularized_regression_parameters = model.params
print(regularized_regression_parameters)

x1      0.000000
x2    106.884080
x3     -4.473559
x4    -34.862059
x5     30.052602
x6     74.254708
x7    -32.170321
x8     20.473217
dtype: float64


In [221]:
MSE=sum((ypred-y)**2)/len(y)
print('MSE = ', MSE)

MSE =  264698.727726802
