In [1]:
from tqdm import tqdm_notebook as tqdm
import numpy as np
import pandas as pd
# Load packages for Machine Learning
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings(action='ignore', category=ConvergenceWarning)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import KFold

In [2]:
# Read the prepared data set; the first CSV column is used as the row index.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
data_apart = pd.read_csv(
    '/Users/sarahkoelemij/Documents/Social Data Science/Projekt/final_data.csv',
    index_col=0,
)

# Columns used for Machine Learning: the target first, then the explanatory variables
model_columns = [
    'log_sqm_price', 'Municipality', 'Floor', 'Land_area', 'Rooms', 'Area',
    'Owner_expense', 'Energy_saving', 'School_dist', 'Metro_dist', 'Jail_dist',
    'Centrum_coor',
]
ML_set = data_apart[model_columns]

# One-hot encode the municipality column
ML_dummy = pd.get_dummies(ML_set, columns=['Municipality'])

# Features: every column of the encoded frame except the first one
# (presumably still the target 'log_sqm_price' after encoding - verify).
X = ML_dummy.iloc[:, 1:]
# Target, kept as a one-column DataFrame
y = ML_set[['log_sqm_price']]

In [3]:
# Hold out one third of the observations as test data; the remaining
# two thirds form the development data
X_dev, X_test, y_dev, y_test = train_test_split(
    X, y, test_size=1/3, random_state=1
)

# Halve the development data, so that training and validation data each
# hold roughly one third of all observations
X_train, X_val, y_train, y_val = train_test_split(
    X_dev, y_dev, test_size=1/2, random_state=1
)

## 3.2 First, fix the polynomial degree at 3 and create the pipelines

In [17]:
# Both baseline models share the same preprocessing steps: polynomial
# features of degree 3 (no bias column) followed by standardisation.

###########################
#        LINEAR           #
###########################
pipe_lr = make_pipeline(
    PolynomialFeatures(degree=3, include_bias=False),
    StandardScaler(),
    LinearRegression(),
)

###########################
#        Lasso            #
###########################
# Lasso() is created without arguments, i.e. with the library's default penalty
pipe_lasso = make_pipeline(
    PolynomialFeatures(degree=3, include_bias=False),
    StandardScaler(),
    Lasso(),
)

###########################
#        LASSO CV         #
###########################
# Choose the Lasso penalty (lambda) by 10-fold cross-validation on the
# development data, keeping the polynomial degree fixed at 3.
lambdas = np.logspace(-4, 4, 12)   # 12 candidates, log-spaced from 1e-4 to 1e4
kfolds = KFold(n_splits=10)
RMSE_lassoCV = []                  # one list of fold RMSEs per candidate lambda

for lambda_ in lambdas:

    pipe_lassoCV = make_pipeline(PolynomialFeatures(degree=3, include_bias=False),
                                 StandardScaler(),
                                 Lasso(alpha=lambda_, random_state=1))
    RMSE_lassoCV_ = []

    for train_idx, val_idx in kfolds.split(X_dev, y_dev):

        # Fold-local names on purpose: binding X_train / y_train / X_val / y_val
        # here would overwrite the train/validation split created earlier, which
        # the polynomial-degree search further down still reads.
        X_fold_train, y_fold_train = X_dev.iloc[train_idx], y_dev.iloc[train_idx]
        X_fold_val, y_fold_val = X_dev.iloc[val_idx], y_dev.iloc[val_idx]

        pipe_lassoCV.fit(X_fold_train, y_fold_train)
        RMSE_lassoCV_.append(mse(y_fold_val, pipe_lassoCV.predict(X_fold_val))**(1/2))
    RMSE_lassoCV.append(RMSE_lassoCV_)

# Average the fold RMSEs for each lambda and keep the single smallest mean
optimalCV = pd.DataFrame(RMSE_lassoCV, index=lambdas).mean(axis=1).nsmallest(1)
print(optimalCV) # This prints optimal lambda and RMSE. 

# Define (but do not yet fit) the pipeline that uses the optimal lambda
pipe_lassoCV = make_pipeline(PolynomialFeatures(degree=3, include_bias=False),
                             StandardScaler(),
                             Lasso(alpha=optimalCV.index[0]))

0.002848    0.164743
dtype: float64


### 3.2.2 Fit on development data

In [18]:
# Estimate the linear and the plain Lasso pipeline on the full development data
for model in (pipe_lr, pipe_lasso):
    model.fit(X_dev, y_dev)

# Kept as the last expression of the cell so that its result is displayed
pipe_lassoCV.fit(X_dev, y_dev)

Pipeline(memory=None,
     steps=[('polynomialfeatures', PolynomialFeatures(degree=3, include_bias=False, interaction_only=False)), ('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('lasso', Lasso(alpha=0.002848035868435802, copy_X=True, fit_intercept=True,
   max_iter=1000, normalize=False, positive=False, precompute=False,
   random_state=None, selection='cyclic', tol=0.0001, warm_start=False))])

### 3.2.3 Get MAE and RMSE from test data and make table of results

In [19]:
# Predict the test data once per model and reuse the predictions for both metrics
pred_lr = pipe_lr.predict(X_test)
pred_lasso = pipe_lasso.predict(X_test)
pred_lasso_CV = pipe_lassoCV.predict(X_test)

# Linear model
MAE_lr = mae(y_test, pred_lr)
RMSE_lr = mse(y_test, pred_lr) ** 0.5

# Lasso model
MAE_lasso = mae(y_test, pred_lasso)
RMSE_lasso = mse(y_test, pred_lasso) ** 0.5

# Lasso CV
MAE_lasso_CV = mae(y_test, pred_lasso_CV)
RMSE_lasso_CV = mse(y_test, pred_lasso_CV) ** 0.5

# Table of results: one row per model, one column per metric
MAE = [MAE_lr, MAE_lasso, MAE_lasso_CV]
RMSE = [RMSE_lr, RMSE_lasso, RMSE_lasso_CV]

Results = pd.DataFrame(
    {'MAE': MAE, 'RMSE': RMSE},
    index=('Linear', 'Lasso', 'Lasso CV'),
)
Results

Unnamed: 0,MAE,RMSE
Linear,1516391000000000.0,5.393387e+16
Lasso,0.2219334,0.2816792
Lasso CV,0.1215846,0.1631624


## 3.3 Optimize on polynomial features

### 3.3.1 Create pipelines

In [4]:
###########################
#        LINEAR           #
###########################
pol = range(1, 6)   # candidate polynomial degrees 1..5
perform_lr = []     # validation RMSE per degree, in the order of `pol`

# Try every degree: fit on the training data, score on the validation data
for degree in pol:
    candidate_lr = make_pipeline(
        PolynomialFeatures(degree=degree, include_bias=False),
        StandardScaler(),
        LinearRegression(),
    )
    candidate_lr.fit(X_train, y_train)
    val_rmse = mse(y_val, candidate_lr.predict(X_val)) ** 0.5
    perform_lr.append(val_rmse)

# One-element Series: index = best degree, value = its validation RMSE
optimal_pol_lr = pd.Series(perform_lr, index=pol).nsmallest(1)

# Fresh, unfitted linear pipeline that uses the selected degree
pipe_lr = make_pipeline(
    PolynomialFeatures(degree=optimal_pol_lr.index[0], include_bias=False),
    StandardScaler(),
    LinearRegression(),
)

###########################
#        Lasso            #
###########################

perform_lasso = []   # validation RMSE per degree, in the order of `pol`

# Loop over the polynomial degrees to find the best-performing Lasso model
for dg in pol:
    pipe_lasso = make_pipeline(PolynomialFeatures(degree=dg, include_bias=False),
                               StandardScaler(),
                               Lasso())
    # Fit on the training data
    pipe_lasso.fit(X_train, y_train)
    # Score the Lasso pipeline fitted just above and record the result in the
    # Lasso list; scoring pipe_lr or appending to another list would leave
    # perform_lasso empty and break the degree selection below.
    perform_lasso.append(mse(y_val, pipe_lasso.predict(X_val))**(1/2))

# One-element Series: index = best degree, value = its validation RMSE
optimal_pol_lasso = pd.Series(perform_lasso, index=pol).nsmallest(1)

# Define (but do not yet fit) the Lasso pipeline with the selected degree
pipe_lasso = make_pipeline(PolynomialFeatures(degree=optimal_pol_lasso.index[0], include_bias=False),
                           StandardScaler(),
                           Lasso())


###########################
#        LASSO CV         #
###########################
# Choose the Lasso penalty (lambda) by 10-fold cross-validation on the
# development data, using the polynomial degree selected for the Lasso model.

lambdas = np.logspace(-4, 4, 12)   # 12 candidates, log-spaced from 1e-4 to 1e4
kfolds = KFold(n_splits=10)
RMSE_lassoCV = []                  # one list of fold RMSEs per candidate lambda

for lambda_ in lambdas:

    pipe_lassoCV = make_pipeline(PolynomialFeatures(degree=optimal_pol_lasso.index[0], include_bias=False),
                                 StandardScaler(),
                                 Lasso(alpha=lambda_, random_state=1))
    RMSE_lassoCV_ = []

    for train_idx, val_idx in kfolds.split(X_dev, y_dev):

        # Fold-local names on purpose: binding X_train / y_train / X_val / y_val
        # here would overwrite the train/validation split that the degree
        # searches in this cell read, so re-running the cell would silently
        # select the degrees on the last CV fold instead.
        X_fold_train, y_fold_train = X_dev.iloc[train_idx], y_dev.iloc[train_idx]
        X_fold_val, y_fold_val = X_dev.iloc[val_idx], y_dev.iloc[val_idx]

        pipe_lassoCV.fit(X_fold_train, y_fold_train)
        RMSE_lassoCV_.append(mse(y_fold_val, pipe_lassoCV.predict(X_fold_val))**(1/2))
    RMSE_lassoCV.append(RMSE_lassoCV_)

# Average the fold RMSEs for each lambda and keep the single smallest mean
optimalCV = pd.DataFrame(RMSE_lassoCV, index=lambdas).mean(axis=1).nsmallest(1)
print(optimalCV) # This prints optimal lambda and RMSE. 

# Lasso CV pipeline: selected degree and optimal lambda, defined but not yet fitted
pipe_lassoCV = make_pipeline(PolynomialFeatures(degree=optimal_pol_lasso.index[0], include_bias=False),
                             StandardScaler(),
                             Lasso(alpha=optimalCV.index[0]))

KeyboardInterrupt: 

### 3.3.2 Fit on development data

In [None]:
# Estimate the linear and the plain Lasso pipeline on the full development data
for model in (pipe_lr, pipe_lasso):
    model.fit(X_dev, y_dev)

# Kept as the last expression of the cell so that its result is displayed
pipe_lassoCV.fit(X_dev, y_dev)

### 3.3.3 Get MAE and RMSE from test data and make table of results

In [None]:
# Predict the test data once per model and reuse the predictions for both metrics
pred_lr = pipe_lr.predict(X_test)
pred_lasso = pipe_lasso.predict(X_test)
pred_lasso_CV = pipe_lassoCV.predict(X_test)

# Linear model
MAE_lr = mae(y_test, pred_lr)
RMSE_lr = mse(y_test, pred_lr) ** 0.5

# Lasso model
MAE_lasso = mae(y_test, pred_lasso)
RMSE_lasso = mse(y_test, pred_lasso) ** 0.5

# Lasso CV
MAE_lasso_CV = mae(y_test, pred_lasso_CV)
RMSE_lasso_CV = mse(y_test, pred_lasso_CV) ** 0.5

# Table of results: one row per model, one column per metric
MAE = [MAE_lr, MAE_lasso, MAE_lasso_CV]
RMSE = [RMSE_lr, RMSE_lasso, RMSE_lasso_CV]

Results2 = pd.DataFrame(
    {'MAE': MAE, 'RMSE': RMSE},
    index=('Linear', 'Lasso', 'Lasso CV'),
)
Results2