# MPC Project : Predicting house prices 

In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import math
from sklearn.model_selection import train_test_split
from math import exp
from sklearn.model_selection import KFold, cross_val_score
%matplotlib inline

In [34]:
# Loading data
houses = pd.read_csv('houses.csv', sep = ',',index_col=0)
houses_new = pd.read_csv('houses_competition.csv', sep = ',',index_col=0)
print (houses)
# Train / Test split for houses:
train , test = train_test_split(houses, test_size = 0.30,random_state=5)


        price  bedrooms  bathrooms  sqft_living  sqft_lot  floors  waterfront  \
6681    3.500         3       2.25         1860      8378     2.0         0.0   
17798   5.925         4       3.00         2170      8240     1.0         0.0   
18854   2.555         2       1.00         1440     43560     1.0         0.0   
13478  13.300         4       2.25         3260      4640     2.0         0.0   
10509   3.891         2       1.00          840      5400     1.0         0.0   
...       ...       ...        ...          ...       ...     ...         ...   
16125   2.900         2       1.00          930      7740     1.0         0.0   
19004   3.150         3       2.50         1730      6368     2.0         0.0   
9094    6.850         3       2.50         3450      8000     3.0         0.0   
3537    3.260         6       1.50         1930      8400     1.0         0.0   
10054   3.150         2       2.25         1290      2436     2.0         0.0   

       view  condition  gra

### ALL FUNCTIONS NEEDED FOR MODELLING, SELECTING VARIABLES AND PREDICTING

In [35]:
"""
Function which creates regression model
"""
def my_regression(data,idx_p,idx_t):
    X = data.iloc[:,idx_p]
    X = sm.add_constant(X) 
    Y = data[data.columns.values.tolist()[idx_t]]
    model = sm.OLS(Y, X).fit()
    return model

"""
Function which makes prediction by any model on some data
"""
def my_prediction(my_model,data):
    ranges=[]
    for col in my_model.model.exog_names:
        if(col!='const'):
            ranges.append(data.columns.get_loc(col))
    X_new = data.iloc[:,ranges]
    X_new = sm.add_constant(X_new) # add the constant column
    return my_model.predict(X_new)

"""
Function which computes generalization error
"""
def generalization_error_split(train,test,idx_p,idx_t):
    model = my_regression(train,idx_p,idx_t)
    prediction = my_prediction(model,test)
    return np.mean((test[test.columns.values.tolist()[idx_t]] - prediction)**2)

# FORWARD SELECTION

## Rsquared


In [36]:
#Strict stopping
"""
Function which implements first part of step selection for rsquared
"""
def step_selection_adj(train,v_s,v_nu,idx_t):
    rsquareds=[] #To gather all the rsquareds adjusted.
    for var in v_nu:
        selects=[]
        for local in v_s:
            selects.append(local)
        selects.append(var)
        model = my_regression(train,selects,idx_t)
        rsquareds.append(model.rsquared_adj)
    index_max = np.argmax(rsquareds) #We find the highest rsquared_adj
    return v_nu[index_max]

"""
Function which implements second part of step selection for rsquared
"""
def foward_selection_adj(train,idx_p,idx_t):
    v_s=[]
    v_nu=idx_p.copy()
    perf=-1
    stop = False
    while stop == False and len(v_nu)>0:
        var = step_selection_adj(train,v_s,v_nu,idx_t)
        selects = v_s.copy()
        selects.append(var)
        if my_regression(train,selects,idx_t).rsquared_adj>perf:
            perf=my_regression(train,selects,idx_t).rsquared_adj
            v_s.append(var)
            v_nu.remove(var)
        else :
            stop = True
    v_s.sort()
    return(v_s)

In [37]:
#Delta
"""
Function which implements second part of step selection for rsquared
"""
def delta_foward_selection_adj(train,idx_p,idx_t):
    v_s=[]
    result = []
    v_nu=idx_p.copy()
    perf=-1
    stop = False
    delta = 3 #Number of iterations
    i=0
    while (stop == False and len(v_nu)>0 and len(v_s)>0) or i<delta:
        var = step_selection_adj(train,v_s,v_nu,idx_t)
        selects = v_s.copy()
        selects.append(var)
        if my_regression(train,selects,idx_t).rsquared_adj>perf:
            perf=my_regression(train,selects,idx_t).rsquared_adj
            v_s.append(var)
            v_nu.remove(var)
            i=0
            result=v_s.copy()
        else :
            i+=1
            v_s.append(var)
            v_nu.remove(var)
            if(i==delta):
                stop=True
    result.sort()
    return(result)

In [38]:
#Continuous
"""
Function which implements second part of step selection for rsquared
We gather all the best models by rsquaredadj and find the one with the best rsquared adj
"""
def c_foward_selection_adj(train,idx_p,idx_t):
    v_s=[]
    v_nu=idx_p.copy()
    models=[]
    models_dict={}
    while len(v_nu)>0:
        var = step_selection_adj(train,v_s,v_nu,idx_t)
        v_s.append(var)
        v_nu.remove(var)
        model = v_s.copy()
        models.append(model)
        models_dict[my_regression(train,model,idx_t).rsquared_adj]=model
    rsquareds=[]
    for model in models:
        rsquareds.append(my_regression(train,model,idx_t).rsquared_adj)
    print(models_dict)
    index_max = np.argmax(rsquareds)
    result=models[index_max]
    result.sort()
    return(result)

## Generalization error

In [39]:
#Strict stopping
"""
Function which implements first part of step selection for generalization error
"""
def step_selection_gen(train, valid, v_s,v_nu,idx_t):
    gens=[]
    for var in v_nu:
        selects=[]
        for local in v_s:
            selects.append(local)
        selects.append(var)
        gens.append(generalization_error_split(train,valid,selects,idx_t))
    index_min = np.argmin(gens)
    return v_nu[index_min]

"""
Function which implements second part of step selection for generalization error
"""
def foward_selection_gen(train,idx_p,idx_t):
    train1 , valid = train_test_split(train, test_size = 0.30,random_state=5) 
    v_s=[]
    v_nu=idx_p.copy()
    perf=np.inf
    stop = False
    while stop == False and len(v_nu)>0:
        var = step_selection_gen(train1,valid,v_s,v_nu,idx_t)
        selects = v_s.copy()
        selects.append(var)
        if generalization_error_split(train1,valid,selects,idx_t) < perf:
            perf=generalization_error_split(train1,valid,selects,idx_t)
            v_s.append(var)
            v_nu.remove(var)
        else :
            stop = True
    v_s.sort()
    return(v_s)

In [40]:
#Delta
"""
Function which implements second part of step selection for generalization error
"""
def delta_foward_selection_gen(train,idx_p,idx_t):
    train1 , valid = train_test_split(train, test_size = 0.30,random_state=5) 
    v_s=[]
    result = []
    v_nu=idx_p.copy()
    perf=np.inf
    stop = False
    delta = 3
    i=0
    while (stop == False and len(v_nu)>0 and len(v_s)>0) or i<delta:
        var = step_selection_gen(train1,valid,v_s,v_nu,idx_t)
        selects = v_s.copy()
        selects.append(var)
        if generalization_error_split(train1,valid,selects,idx_t) < perf:
            perf=generalization_error_split(train1,valid,selects,idx_t)
            v_s.append(var)
            v_nu.remove(var)
            i=0
            result=v_s.copy()
        else :
            i+=1
            v_s.append(var)
            v_nu.remove(var)
            if(i==delta):
                stop=True
    result.sort()
    return(result)

In [41]:
#Continuous
"""
Function which implements second part of step selection for generalization error
"""
def c_foward_selection_gen(train,idx_p,idx_t):
    train1 , valid = train_test_split(train, test_size = 0.30,random_state=5) 
    v_s=[]
    models=[]
    v_nu=idx_p.copy()
    models_dict={}
    while len(v_nu)>0:
        var = step_selection_gen(train1,valid,v_s,v_nu,idx_t)
        v_s.append(var)
        v_nu.remove(var)
        model=v_s.copy()
        models.append(model)
        models_dict[generalization_error_split(train1,valid,model,idx_t)]=model
    gens=[]
    for model in models:
            gens.append(generalization_error_split(train1,valid,model,idx_t))
    index_min = np.argmin(gens)
    print(models_dict)
    result = models[index_min]
    result.sort()
    return(result)

## Pvalues

In [42]:
#Strict stopping
"""
Function which implements first part of step selection for generalization error
We find the best variable by lowest pvalue. We cannot filter by pvalue<=0.05 yet.
"""
def step_selection_p(train,v_s,v_nu,idx_t):
    pvalues=[]
    for var in v_nu:
        selects=[]
        for local in v_s:
            selects.append(local)
        selects.append(var)
        model = my_regression(train,selects,idx_t)
        pvalues.append(model.pvalues[1])
    index_min = np.argmin(pvalues)
    print(v_nu[index_min],pvalues[index_min])
    return v_nu[index_min]

"""
Function which implements second part of step selection for generalization error
We can filter by pvalue<=0.05. In order to find the best, we stop where the next one is significant
"""
def foward_selection_p(train,idx_p,idx_t):
    v_s=[]
    v_nu=idx_p.copy()
    perf=False
    stop = False
    good = False
    while stop == False and len(v_nu)>0:
        var = step_selection_p(train,v_s,v_nu,idx_t)
        selects = v_s.copy()
        selects.append(var)
        model = my_regression(train,selects,idx_t)
        perf = model.pvalues[[i for i in range(1,len(selects))]]<=0.05
        print(perf)
        if False in perf.values:
            good=False
        else:
            good=True
        print(good)
        if(good==True):
            v_s.append(var)
            v_nu.remove(var)
        else:
            stop=True
    v_s.sort()
    return(v_s)

In [43]:
#Delta
"""
Function which implements second part of step selection for generalization error
"""
def delta_foward_selection_p(train,idx_p,idx_t):
    train1 , valid = train_test_split(train, test_size = 0.30) 
    v_s=[]
    v_nu=idx_p.copy()
    perf=np.inf
    stop = False
    while stop == False and len(v_nu)>0:
        var = step_selection_p(train1,valid,v_s,v_nu,idx_t)
        selects = v_s.copy()
        selects.append(var)
        if generalization_error_split(train1,valid,selects,idx_t) < perf:
            perf=generalization_error_split(train1,valid,selects,idx_t)
            v_s.append(var)
            v_nu.remove(var)
        else :
            stop = True
    v_s.sort()
    return(v_s)

In [44]:
#Continuous
"""
Function which implements second part of step selection for generalization error
In order to find the best, we gather all the models. 
First, we filter by choosing models who don't contain no significant variable.
Second, we choose the best model by generalisation error. To do so, we use the test split from the real houses data (named train).
"""
def c_foward_selection_p(train,test,idx_p,idx_t):
    v_s=[]
    models=[]
    v_nu=idx_p.copy()
    while len(v_nu)>0:
        var = step_selection_p(train,v_s,v_nu,idx_t)
        v_s.append(var)
        v_nu.remove(var)
        model=v_s.copy()
        models.append(model)
    print(models)
    gens=[]
    perfs=[]
    finals=[]
    for model in models:
        perf=my_regression(train,model,idx_t).pvalues[[i for i in range(1,len(model))]]<=0.05
        perfs.append(perf)
    for model in models:
        if (False not in perfs[models.index(model)].values):
            finals.append(model)
    for model in finals:
        gens.append(generalization_error_split(train,test,model,idx_t))
    index_min = np.argmin(gens)
    result = models[index_min]
    result.sort()
    print(my_regression(train,result,idx_t).pvalues[[i for i in range(1,len(result))]]<=0.05)
    return(result)

In [45]:
"""
Function which adds polynomial features to variables that explain target variable
"""
def add_polynomial_feature(data, idx_p, power):
    new_data = data.copy(deep = True)
    for i in range(0, len(idx_p)):
        for j in power:
            for k in range(2, j+1):
                new_data['{}_pow_{}'.format(new_data.columns[idx_p[i]],k)] = new_data.iloc[:,idx_p[i]]**k
    return(new_data)

# DATA FOR NON-LINEAR REGRESSION

## I- SIMPLE LINEAR REGRESSION

In [46]:
gens=[]
for i in range(1,len(test.columns)):
    mse = generalization_error_split(train,test,i,0)
    gens.append(mse)
    print(test.columns[i])
print(gens)
index_mint = np.argmin(gens)+1 #Because we have price which is 1
modelsimple = my_regression(houses,index_mint,0)
modelsimple.summary()

bedrooms
bathrooms
sqft_living
sqft_lot
floors
waterfront
view
condition
grade
sqft_above
sqft_basement
yr_built
yr_renovated
zipcode
lat
long
sqft_living15
sqft_lot15
[11.97152747664599, 9.1912892158212, 6.528861183289393, 12.872168109274229, 11.786838650789518, 12.043192123875176, 10.88388716570048, 12.932567474395945, 7.043386826290524, 7.859229190321266, 11.816525316930369, 12.894049063948652, 12.824287846405896, 12.908346253404964, 11.572645215546652, 12.92671145535201, 8.301113957182567, 12.85473468729375]


0,1,2,3
Dep. Variable:,price,R-squared:,0.496
Model:,OLS,Adj. R-squared:,0.496
Method:,Least Squares,F-statistic:,13200.0
Date:,"Tue, 02 May 2023",Prob (F-statistic):,0.0
Time:,01:08:34,Log-Likelihood:,-32042.0
No. Observations:,13397,AIC:,64090.0
Df Residuals:,13395,BIC:,64100.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.5466,0.057,-9.646,0.000,-0.658,-0.436
sqft_living,0.0029,2.49e-05,114.907,0.000,0.003,0.003

0,1,2,3
Omnibus:,9718.781,Durbin-Watson:,1.981
Prob(Omnibus):,0.0,Jarque-Bera (JB):,441688.826
Skew:,2.993,Prob(JB):,0.0
Kurtosis:,30.485,Cond. No.,5650.0


## II- MULTIPLE LINEAR REGRESSION

In [47]:
modelmultiple = my_regression(houses,[i for i in range(1,len(test.columns))],0)
modelmultiple.summary()
print(generalization_error_split(train,test,[i for i in range(1,len(test.columns))],0))
value = modelmultiple.pvalues[[i for i in range(1,len(test.columns))]]<=0.05

3.828042477695651


## III- VARIABLE SELECTION

### 1- FORWARD SELECTION

In [48]:
# 1-1- BASED ON RSQUARED ADJUSTED

# 1-1-a- Strict stopping criterion
selected_by_rsquared = foward_selection_adj(train,[i for i in range(1,len(train.columns))],0)
print(selected_by_rsquared)
print(generalization_error_split(train,test,selected_by_rsquared,0))

# 1-1-b- Delta iteration criterion
selected_by_delta_rsquared = delta_foward_selection_adj(train,[i for i in range(1,len(train.columns))],0)
print(selected_by_delta_rsquared)
print(generalization_error_split(train,test,selected_by_delta_rsquared,0))

# 1-1-c- Continuous criterion
selected_by_continuous_rsquared = c_foward_selection_adj(train,[i for i in range(1,len(train.columns))],0)
print(selected_by_continuous_rsquared)
print(generalization_error_split(train,test,selected_by_continuous_rsquared,0))

gens=[generalization_error_split(train,test,selected_by_rsquared,0),generalization_error_split(train,test,selected_by_delta_rsquared,0),generalization_error_split(train,test,selected_by_continuous_rsquared,0)]
vars = [selected_by_rsquared,selected_by_delta_rsquared,selected_by_continuous_rsquared]
index_min = np.argmin(gens)
selectedbyrsquared = vars[index_min]
print(selectedbyrsquared)

modelrsquaredforward = my_regression(houses,selectedbyrsquared,0)
modelrsquaredforward.summary()
my_prediction(modelrsquaredforward,houses_new)



[1, 2, 3, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18]
3.828790195822256
[1, 2, 3, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18]
3.828790195822256
{0.4965996647057377: [3], 0.5673234852878521: [3, 15], 0.6052345382611868: [3, 15, 6], 0.6297135358020418: [3, 15, 6, 9], 0.6695178638312591: [3, 15, 6, 9, 12], 0.676184455344699: [3, 15, 6, 9, 12, 7], 0.6806878443804338: [3, 15, 6, 9, 12, 7, 1], 0.6852531888584019: [3, 15, 6, 9, 12, 7, 1, 2], 0.6880300297515674: [3, 15, 6, 9, 12, 7, 1, 2, 14], 0.6913914900047977: [3, 15, 6, 9, 12, 7, 1, 2, 14, 16], 0.6925707406143718: [3, 15, 6, 9, 12, 7, 1, 2, 14, 16, 11], 0.6937087396105793: [3, 15, 6, 9, 12, 7, 1, 2, 14, 16, 11, 8], 0.6941105325700836: [3, 15, 6, 9, 12, 7, 1, 2, 14, 16, 11, 8, 13], 0.694325143286076: [3, 15, 6, 9, 12, 7, 1, 2, 14, 16, 11, 8, 13, 18], 0.6944571087985782: [3, 15, 6, 9, 12, 7, 1, 2, 14, 16, 11, 8, 13, 18, 17], 0.6944560977732497: [3, 15, 6, 9, 12, 7, 1, 2, 14, 16, 11, 8, 13, 18, 17, 10], 0.6944258155351066: [3, 15, 6, 9, 12, 7

15529     3.890133
3233      3.334325
14381     6.162741
3201     12.231634
3425      1.982928
           ...    
7989      4.949858
9973      3.586979
9007      4.250436
16345    13.749974
13582     0.375648
Length: 2365, dtype: float64

In [49]:
# 1-2- BASED ON GENERALISATION ERROR

# 1-1-a- Strict stopping criterion
selected_by_gen = foward_selection_gen(train,[i for i in range(1,len(train.columns))],0)
print(selected_by_gen)
print(generalization_error_split(train,test,selected_by_gen,0))

# 1-1-b- Delta iteration criterion
selected_by_gen_delta = delta_foward_selection_gen(train,[i for i in range(1,len(train.columns))],0)
print(selected_by_gen_delta)
print(generalization_error_split(train,test,selected_by_gen_delta,0))

# 1-1-c- Continuous criterion
selected_by_continuous_gen = c_foward_selection_gen(train,[i for i in range(1,len(train.columns))],0)
print(selected_by_continuous_gen)
print(generalization_error_split(train,test,selected_by_continuous_gen,0))

gens=[generalization_error_split(train,test,selected_by_gen,0),generalization_error_split(train,test,selected_by_gen_delta,0),generalization_error_split(train,test,selected_by_continuous_gen,0)]
vars = [selected_by_gen,selected_by_gen_delta,selected_by_continuous_gen]
index_min = np.argmin(gens)
selectedbygen=vars[index_min]
print(selectedbygen)

modelgenforward=my_regression(houses,selectedbygen,0)
modelgenforward.summary()
#my_prediction(modelgenforward,houses_new)

[1, 2, 3, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17]
3.8306615031281295
[1, 2, 3, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17]
3.8306615031281295
{5.776177880221217: [3], 4.767671875821275: [3, 15], 4.35828884973035: [3, 15, 7], 4.000935108871944: [3, 15, 7, 9], 3.660705109751807: [3, 15, 7, 9, 12], 3.5270855470950715: [3, 15, 7, 9, 12, 6], 3.490067751099924: [3, 15, 7, 9, 12, 6, 2], 3.4487686701085605: [3, 15, 7, 9, 12, 6, 2, 1], 3.41290930783081: [3, 15, 7, 9, 12, 6, 2, 1, 14], 3.3983797357833683: [3, 15, 7, 9, 12, 6, 2, 1, 14, 11], 3.373882421644576: [3, 15, 7, 9, 12, 6, 2, 1, 14, 11, 16], 3.3571030975198566: [3, 15, 7, 9, 12, 6, 2, 1, 14, 11, 16, 8], 3.347490348266162: [3, 15, 7, 9, 12, 6, 2, 1, 14, 11, 16, 8, 13], 3.34409096049492: [3, 15, 7, 9, 12, 6, 2, 1, 14, 11, 16, 8, 13, 17], 3.345963069556161: [3, 15, 7, 9, 12, 6, 2, 1, 14, 11, 16, 8, 13, 17, 10], 3.349688344950397: [3, 15, 7, 9, 12, 6, 2, 1, 14, 11, 16, 8, 13, 17, 10, 5], 3.35696670875732: [3, 15, 7, 9, 12, 6, 2, 1, 14, 11, 16, 8,

0,1,2,3
Dep. Variable:,price,R-squared:,0.698
Model:,OLS,Adj. R-squared:,0.698
Method:,Least Squares,F-statistic:,2209.0
Date:,"Tue, 02 May 2023",Prob (F-statistic):,0.0
Time:,01:08:44,Log-Likelihood:,-28618.0
No. Observations:,13397,AIC:,57270.0
Df Residuals:,13382,BIC:,57380.0
Df Model:,14,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,26.3896,37.204,0.709,0.478,-46.535,99.314
bedrooms,-0.3631,0.024,-15.116,0.000,-0.410,-0.316
bathrooms,0.4775,0.041,11.697,0.000,0.397,0.558
sqft_living,0.0019,4.62e-05,41.588,0.000,0.002,0.002
waterfront,6.4389,0.235,27.457,0.000,5.979,6.899
view,0.5118,0.028,18.340,0.000,0.457,0.566
condition,0.2469,0.030,8.125,0.000,0.187,0.306
grade,0.9345,0.028,33.606,0.000,0.880,0.989
sqft_basement,-0.0004,4.97e-05,-8.294,0.000,-0.001,-0.000

0,1,2,3
Omnibus:,11934.92,Durbin-Watson:,1.994
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1424452.366
Skew:,3.792,Prob(JB):,0.0
Kurtosis:,52.943,Cond. No.,206000000.0


In [50]:
# 1-3- BASED ON CRITICAL PROBABILITY

# 1-1-a- Strict stopping criterion
selected_by_pvalue=foward_selection_p(train,[i for i in range(1,len(train.columns))],0)
print(selected_by_pvalue)
print(generalization_error_split(train,test,selected_by_pvalue,0))
#modelStrictCritcProb = my_regression(houses,foward_selection_p(train,[i for i in range(2,len(train.columns))],1),1)


# 1-1-b- Delta iteration criterion

# 1-1-c- Continuous criterion
selected_by_pvalues_continue = c_foward_selection_p(train,test,[i for i in range(1,len(train.columns))],0)
print(selected_by_pvalues_continue)
print(generalization_error_split(train,test,selected_by_pvalues_continue,0))

gens=[generalization_error_split(train,test,selected_by_pvalue,0),generalization_error_split(train,test,selected_by_pvalues_continue,0),]
vars = [selected_by_pvalue,selected_by_pvalues_continue]
index_min = np.argmin(gens)
selectedbypvalues=vars[index_min]
print(selectedbypvalues)

2 0.0
Series([], dtype: bool)
True
1 0.0
bathrooms    True
dtype: bool
True
4 0.0
bathrooms    True
bedrooms     True
dtype: bool
True
5 0.0
bathrooms    True
bedrooms     True
sqft_lot     True
dtype: bool
True
6 0.0
bathrooms     True
bedrooms      True
sqft_lot      True
floors       False
dtype: bool
False
[1, 2, 4, 5]
9.176074327093731
2 0.0
1 0.0
4 0.0
5 0.0
6 0.0
8 0.0
12 0.0
7 0.0
11 0.0
13 0.0
14 0.0
16 0.0
15 0.0
18 0.0
17 2.7780052005869945e-178
9 2.837186756072864e-100
10 2.3182511500740393e-30
3 2.0016906199978574e-25
[[2], [2, 1], [2, 1, 4], [2, 1, 4, 5], [2, 1, 4, 5, 6], [2, 1, 4, 5, 6, 8], [2, 1, 4, 5, 6, 8, 12], [2, 1, 4, 5, 6, 8, 12, 7], [2, 1, 4, 5, 6, 8, 12, 7, 11], [2, 1, 4, 5, 6, 8, 12, 7, 11, 13], [2, 1, 4, 5, 6, 8, 12, 7, 11, 13, 14], [2, 1, 4, 5, 6, 8, 12, 7, 11, 13, 14, 16], [2, 1, 4, 5, 6, 8, 12, 7, 11, 13, 14, 16, 15], [2, 1, 4, 5, 6, 8, 12, 7, 11, 13, 14, 16, 15, 18], [2, 1, 4, 5, 6, 8, 12, 7, 11, 13, 14, 16, 15, 18, 17], [2, 1, 4, 5, 6, 8, 12, 7, 11, 13, 1

## IV- NON-LINEAR REGRESSION

In [51]:
train_poly = add_polynomial_feature(train,[i for i in range(1,len(train.columns))] ,[j for j in range(2,5)])
test_poly = add_polynomial_feature(test, [i for i in range(1,len(test.columns))] ,[j for j in range(2,5)])
print(train_poly)

        price  bedrooms  bathrooms  sqft_living  sqft_lot  floors  waterfront  \
10928  4.4800         2       1.50         1630      3780     1.0         0.0   
1171   3.5900         3       2.50         1450      3850     2.0         0.0   
6401   4.7000         3       1.00         1460      8227     1.0         0.0   
12572  6.4800         5       2.25         2410     12000     2.0         0.0   
2499   5.6000         4       1.75         2150      8555     1.0         0.0   
...       ...       ...        ...          ...       ...     ...         ...   
13633  4.1850         3       2.00         1800     12440     1.0         0.0   
11496  5.1000         3       2.25         2750    219542     2.0         0.0   
10414  3.8900         3       1.50         2030     10075     1.0         0.0   
871    4.6995         3       2.25         1620      8701     1.0         0.0   
7738   2.7500         4       1.00         1770      7345     1.5         0.0   

       view  condition  gra

In [52]:
# 1- POLYNOMIAL REGRESSION (By Generalisation error only)
#By strict stopping criterion
select_strict = foward_selection_gen(train_poly,[i for i in range(1,len(train_poly.columns))],0)

#By delta iteration
select_delta = delta_foward_selection_gen(train_poly,[i for i in range(1,len(train_poly.columns))],0)

#By continuous criterion
select_continuous = c_foward_selection_gen(train_poly,[i for i in range(1,len(train_poly.columns))],0)

gens=[generalization_error_split(train_poly,test_poly,select_strict,0),generalization_error_split(train_poly,test_poly,select_delta,0),generalization_error_split(train_poly,test_poly,select_continuous,0)]
vars = [select_strict,select_delta,select_continuous]
index_min = np.argmin(gens)
selectpolynomial=vars[index_min]
print(selectpolynomial)
print(generalization_error_split(train_poly,test_poly,selectpolynomial,0))

{5.128364934326332: [25], 4.182052198434985: [25, 15], 3.6985566676584685: [25, 15, 45], 3.3612865368461224: [25, 15, 45, 12], 3.168652704013481: [25, 15, 45, 12, 37], 3.082112731402216: [25, 15, 45, 12, 37, 63], 2.765884901284438: [25, 15, 45, 12, 37, 63, 62], 2.6765921733951523: [25, 15, 45, 12, 37, 63, 62, 6], 2.6200897423492644: [25, 15, 45, 12, 37, 63, 62, 6, 23], 2.5796879320205153: [25, 15, 45, 12, 37, 63, 62, 6, 23, 8], 2.541192237940801: [25, 15, 45, 12, 37, 63, 62, 6, 23, 8, 17], 2.502728205979586: [25, 15, 45, 12, 37, 63, 62, 6, 23, 8, 17, 65], 2.4587626235339743: [25, 15, 45, 12, 37, 63, 62, 6, 23, 8, 17, 65, 14], 2.434545552541056: [25, 15, 45, 12, 37, 63, 62, 6, 23, 8, 17, 65, 14, 13], 2.4208100569971456: [25, 15, 45, 12, 37, 63, 62, 6, 23, 8, 17, 65, 14, 13, 49], 2.4034279114256067: [25, 15, 45, 12, 37, 63, 62, 6, 23, 8, 17, 65, 14, 13, 49, 46], 2.3946839122326997: [25, 15, 45, 12, 37, 63, 62, 6, 23, 8, 17, 65, 14, 13, 49, 46, 67], 2.378847001564399: [25, 15, 45, 12, 37,

In [53]:
poly_houses=add_polynomial_feature(houses,[i for i in range(1,len(houses.columns))] ,[j for j in range(2,5)])
poly_houses_new=add_polynomial_feature(houses_new,[i for i in range(0,len(houses_new.columns))] ,[j for j in range(2,5)])
modelpolynomial = my_regression(poly_houses,selectpolynomial,0)
modelpolynomial.summary()


0,1,2,3
Dep. Variable:,price,R-squared:,0.781
Model:,OLS,Adj. R-squared:,0.78
Method:,Least Squares,F-statistic:,1534.0
Date:,"Tue, 02 May 2023",Prob (F-statistic):,0.0
Time:,01:11:02,Log-Likelihood:,-26478.0
No. Observations:,13397,AIC:,53020.0
Df Residuals:,13365,BIC:,53260.0
Df Model:,31,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.901e+04,1012.136,38.546,0.000,3.7e+04,4.1e+04
sqft_lot,9.751e-07,4.06e-07,2.401,0.016,1.79e-07,1.77e-06
floors,-0.3226,0.100,-3.236,0.001,-0.518,-0.127
waterfront,6.5298,0.229,28.516,0.000,6.081,6.979
view,3.3408,0.555,6.017,0.000,2.252,4.429
condition,0.4116,0.026,15.676,0.000,0.360,0.463
grade,-13.0548,2.552,-5.116,0.000,-18.056,-8.053
sqft_above,0.0007,8.93e-05,8.012,0.000,0.001,0.001
yr_built,-0.5538,0.081,-6.877,0.000,-0.712,-0.396

0,1,2,3
Omnibus:,6700.045,Durbin-Watson:,1.98
Prob(Omnibus):,0.0,Jarque-Bera (JB):,875400.213
Skew:,1.396,Prob(JB):,0.0
Kurtosis:,42.502,Cond. No.,2.68e+16


# V- SUMMARY

In [54]:
predictionsimple = my_prediction(modelsimple,houses_new)
predictionsimple
print(predictionsimple)
predictionsimple =  pd.DataFrame({'ID':predictionsimple.index, 'Price':predictionsimple})
predictionsimple.to_csv('my_submission_simple.csv', index=False)

predictionmultiple = my_prediction(modelmultiple,houses_new)
print(predictionmultiple)
predictionmultiple =  pd.DataFrame({'ID':predictionmultiple.index, 'Price':predictionmultiple})
predictionmultiple.to_csv('my_submission_multi.csv', index=False)

predictionrsquaredforward = my_prediction(my_regression(houses,selectedbyrsquared,0),houses_new)
predictionrsquaredforward =  pd.DataFrame({'ID':predictionrsquaredforward.index, 'Price':predictionrsquaredforward})
predictionrsquaredforward.to_csv('my_submission_rsquared.csv', index=False)

predictiongenforward = my_prediction(my_regression(houses,selectedbygen,0),houses_new)
predictiongenforward =  pd.DataFrame({'ID':predictiongenforward.index, 'Price':predictiongenforward})
predictiongenforward.to_csv('my_submission_gen.csv', index=False)

predictionpvalueforward = my_prediction(my_regression(houses,selectedbypvalues,0),houses_new)
predictionpvalueforward =  pd.DataFrame({'ID':predictionpvalueforward.index, 'Price':predictionpvalueforward})
predictionpvalueforward.to_csv('my_submission_pvalues.csv', index=False)

polyprediction=my_prediction(modelpolynomial,poly_houses_new)
polyprediction =  pd.DataFrame({'ID':polyprediction.index, 'Price':polyprediction})
polyprediction.to_csv('my_submission_poly.csv', index=False)

15529     5.568317
3233      2.996609
14381     4.396761
3201     13.406311
3425      4.911103
           ...    
7989      6.111233
9973      4.796804
9007      3.853845
16345    10.911754
13582     2.196522
Length: 2365, dtype: float64
15529     3.878332
3233      3.338104
14381     6.191181
3201     12.233500
3425      1.996580
           ...    
7989      4.961400
9973      3.593966
9007      4.232455
16345    13.751428
13582     0.376524
Length: 2365, dtype: float64
