In [1]:
##FITTING A REGRESSION MODEL TO TRAINING DATA
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

##Data loading and preprocessing
##One pipeline: read the CSV, normalise the column labels, tidy REMODEL, dummy-encode
housing_df = (
    pd.read_csv('../../.venv/lib/Datasets/WestRoxbury.csv')
    ##trim every column label and turn inner spaces into underscores
    .rename(columns=lambda label: label.strip().replace(' ', '_'))
    ##REMODEL: strip surrounding whitespace, then label missing/empty entries 'None'
    ##NOTE(review): read_csv may itself parse the text 'None' as missing, which
    ##the fillna below would restore - confirm against the pandas version in use
    .assign(
        REMODEL=lambda frame: (
            frame['REMODEL'].str.strip().fillna('None').replace('', 'None')
        )
    )
    ##drop_first=True leaves out the first level of each encoded column
    .pipe(pd.get_dummies, prefix_sep='_', drop_first=True)
)

In [2]:
##Create list of predictors and outcome
##Every column is a predictor except the ones named in excludeColumns
##NOTE(review): TAX is presumably excluded because it is derived from the
##assessed value - confirm
outcome = 'TOTAL_VALUE'
excludeColumns = ('TOTAL_VALUE', 'TAX')
predictors = housing_df.columns[~housing_df.columns.isin(excludeColumns)].tolist()

In [3]:
##Partition Data
##X holds the predictor columns, y the outcome column
X = housing_df.loc[:, predictors]
y = housing_df.loc[:, outcome]
##test_size=0.4 sends 40% of the rows to the validation partition;
##random_state=1 makes the split repeatable
train_X, valid_X, train_y, valid_y = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=1,
)

In [4]:
##Implement linear regression model and fit to training data
##Only the training partition is used for fitting; the validation rows stay unseen
model = LinearRegression()
model.fit(X=train_X, y=train_y)

In [5]:
##Predict the training partition and tabulate actual value, prediction and residual
train_pred = model.predict(train_X)
train_results = pd.DataFrame(
    {'TOTAL_VALUE': train_y, 'predicted': train_pred}
).assign(
    ##residual = actual minus predicted
    residual=lambda frame: frame['TOTAL_VALUE'] - frame['predicted']
)
train_results.head()

Unnamed: 0,TOTAL_VALUE,predicted,residual
2024,392.0,384.95187,7.04813
5140,476.3,431.073414,45.226586
5259,367.4,384.579253,-17.179253
421,350.3,372.446323,-22.146323
1401,348.1,319.312241,28.787759


In [6]:
##APPLYING REGRESSION MODEL TO PREDICT VALIDATION SET
##Same table as for the training partition, built from the held-out rows
valid_pred = model.predict(valid_X)
valid_results = pd.DataFrame(
    {'TOTAL_VALUE': valid_y, 'predicted': valid_pred}
).assign(
    ##residual = actual minus predicted
    residual=lambda frame: frame['TOTAL_VALUE'] - frame['predicted']
)
valid_results.head()

Unnamed: 0,TOTAL_VALUE,predicted,residual
1822,462.0,402.982108,59.017892
1998,370.4,358.914637,11.485363
5126,407.4,385.474466,21.925534
808,316.1,384.594397,-68.494397
4034,393.2,430.560431,-37.360431


In [7]:
##CODE FOR LINEAR REGRESSION MODEL EVALUATION METRICS
##regressionSummary comes from the dmba utility package
from dmba import regressionSummary

##Each call receives the actual values first and the predictions second.
##First summary: training set
regressionSummary(train_results['TOTAL_VALUE'], train_results['predicted'])
##Second summary: validation set
regressionSummary(valid_results['TOTAL_VALUE'], valid_results['predicted'])

no display found. Using non-interactive Agg backend

Regression statistics

                      Mean Error (ME) : 0.0000
       Root Mean Squared Error (RMSE) : 43.7829
            Mean Absolute Error (MAE) : 33.0033
          Mean Percentage Error (MPE) : -1.1690
Mean Absolute Percentage Error (MAPE) : 8.5972

Regression statistics

                      Mean Error (ME) : -0.0893
       Root Mean Squared Error (RMSE) : 43.0420
            Mean Absolute Error (MAE) : 32.2706
          Mean Percentage Error (MPE) : -1.1355
Mean Absolute Percentage Error (MAPE) : 8.4124


In [8]:
##SCORE MODEL ACCORDING TO NEW DATA
##Three hand-entered records, one list per feature column; the REMODEL_* columns
##are 0/1 indicator flags
new_data = pd.DataFrame(dict(
    LOT_SQFT=[4200, 6444, 5035],
    YR_BUILT=[1960, 1940, 1925],
    GROSS_AREA=[2670, 2886, 3264],
    LIVING_AREA=[1710, 1474, 1523],
    FLOORS=[2.0, 1.5, 1.9],
    ROOMS=[10, 6, 6],
    BEDROOMS=[4, 3, 2],
    FULL_BATH=[1, 1, 1],
    HALF_BATH=[1, 1, 0],
    KITCHEN=[1, 1, 1],
    FIREPLACE=[1, 1, 0],
    REMODEL_Old=[0, 0, 0],
    REMODEL_Recent=[0, 0, 1],
))
print(new_data)

   LOT_SQFT  YR_BUILT  GROSS_AREA  LIVING_AREA  FLOORS  ROOMS  BEDROOMS  \
0      4200      1960        2670         1710     2.0     10         4   
1      6444      1940        2886         1474     1.5      6         3   
2      5035      1925        3264         1523     1.9      6         2   

   FULL_BATH  HALF_BATH  KITCHEN  FIREPLACE  REMODEL_Old  REMODEL_Recent  
0          1          1        1          1            0               0  
1          1          1        1          1            0               0  
2          1          0        1          0            0               1  


In [9]:
##Score the new records with the already-fitted model; nothing is refitted here,
##and no evaluation is possible because new_data carries no actual TOTAL_VALUE
##Selecting by `predictors` puts the columns in the order used for training and
##raises a KeyError if new_data is missing one of the training features
print('Predictions: ', model.predict(new_data[predictors]))

Predictions:  [384.99324756 378.87054469 378.43712817]
