# In Regression Part


## Let's Implement Backward Elimination

    - Backward elimination starts from a full regression model with all variables and then removes variables one by one
    
============================================================================

### Importing Modules

In [1]:
from sklearn import linear_model
import numpy as np
import scipy
import pandas as pd
from scipy.stats import f

### Importing Dataset - Cement.txt

In [2]:
fname='dataset_cement.txt'
df=pd.read_table(fname, sep='\t',names=['aluminate','silcate','ferrite','dical_sil','hardening'])
y=df['hardening'].values

df.head()

Unnamed: 0,aluminate,silcate,ferrite,dical_sil,hardening
0,7,26,6,60,78.5
1,1,29,15,52,74.3
2,11,56,8,20,104.3
3,11,31,8,47,87.6
4,7,52,6,33,95.9


### Define User's Functions : SSR, MSE and Partial F

In [3]:
def SSR(y_pred, y_real):
    y_mean = y_real.mean()
    
    ssr = sum((y_pred-y_mean)**2)
    
    return ssr

def MSE(y_pred, y_real, p_num):
    
    sse = sum((y_pred-y_real)**2)
    mse = sse/(len(y_pred)-p_num-1)
    
    return mse

def Partial_F(before_ssr, after_ssr, mse):
    
    f_value = (before_ssr -after_ssr)/mse
    
    return f_value

### Modeling Linear Regression and Specifying p-value as variable removal condition

In [4]:
clf=linear_model.LinearRegression(normalize=True)
p_remove=0.15

### Implementing Backward Elimination

In [5]:
selected_cols = ['aluminate','silcate','ferrite','dical_sil']

while len(selected_cols)>1: 

    p_num= len(selected_cols)
    clf.fit(df.loc[:,selected_cols], y)
    y_pred = clf.predict(df.loc[:,selected_cols])

    intial_f = (SSR(y_pred, y)/p_num) / MSE(y_pred, y, p_num) # MSR/MSE = F
    intial_ssr = SSR(y_pred, y)
    intial_mse = MSE(y_pred, y, p_num)

    partial_f_values=[]
    ssr_list=[]
    f_p_value = []

    for col in selected_cols:

        inputs = df.loc[:,selected_cols].drop(col,axis=1,inplace=False).values

        clf.fit(inputs, y)
        y_pred=clf.predict(inputs)

        ssr_list.append(SSR(y_pred, y))

        partial_f_values.append((intial_ssr-ssr_list[-1])/intial_mse)        
        f_p_value.append(f.cdf(partial_f_values[-1], p_num-1 ,len(y_pred)-p_num-2))

    if p_remove > f_p_value[np.argmin(partial_f_values)]:
        selected_cols.remove(selected_cols[np.argmin(partial_f_values)])
    else :
        print("Backward Elimination is over. '"+ str(" & ".join(selected_cols))+"' are final selected variables")
        break  

Backward Elimination is over. 'aluminate & silcate & dical_sil' are final selected variables
