In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Installing findspark
!sudo apt-get update

Hit:1 http://us-east-1.ec2.archive.ubuntu.com/ubuntu jammy InRelease
Get:2 http://us-east-1.ec2.archive.ubuntu.com/ubuntu jammy-updates InRelease [114 kB]
Get:3 http://us-east-1.ec2.archive.ubuntu.com/ubuntu jammy-backports InRelease [107 kB]
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]      
Fetched 331 kB in 0s (689 kB/s)    
Reading package lists... Done


In [3]:
# Installing findspark
!pip install findspark



In [4]:
# Installing pyspark
!pip install pyspark



In [5]:
# Installing java
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

In [6]:
# Installing sdv for data synthesis
!pip install sdv



In [7]:
# Installing spark
!wget -nc https://downloads.apache.org/spark/spark-3.3.1/spark-3.3.1-bin-hadoop2.tgz

File ‘spark-3.3.1-bin-hadoop2.tgz’ already there; not retrieving.



In [8]:
import findspark
findspark.init()

In [9]:
# Creating a spark session
from pyspark.sql import DataFrame, SparkSession
spark = SparkSession.builder.appName("House Price Prediction").getOrCreate()
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/02/05 17:05:43 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/02/05 17:05:44 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [10]:
# Specifying the file path
file_location = "data/train.csv"
file_type = "csv"
# CSV options
infer_schema = True
first_row_is_header = True
delimiter = ","

In [11]:
# The applied options are for CSV files. For other file types, these will be ignored.
DF = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .load(file_location)

In [12]:
# Reviewing the dataset
DF.show(10, False)

23/02/05 17:05:50 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
+---+----------+--------+-----------+-------+------+-----+--------+-----------+---------+---------+---------+------------+----------+----------+--------+----------+-----------+-----------+---------+------------+---------+--------+-----------+-----------+----------+----------+---------+---------+----------+--------+--------+------------+------------+----------+------------+----------+---------+-----------+-------+---------+----------+----------+--------+--------+------------+---------+------------+------------+--------+--------+------------+------------+-----------+------------+----------+----------+-----------+----------+-----------+------------+----------+----------+----------+----------+----------+----------+-----------+-------------+---------+-----------+--------+------+-----+-----------+-------+------+---

In [13]:
# Printing the info/schema 
DF.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- MSSubClass: integer (nullable = true)
 |-- MSZoning: string (nullable = true)
 |-- LotFrontage: string (nullable = true)
 |-- LotArea: integer (nullable = true)
 |-- Street: string (nullable = true)
 |-- Alley: string (nullable = true)
 |-- LotShape: string (nullable = true)
 |-- LandContour: string (nullable = true)
 |-- Utilities: string (nullable = true)
 |-- LotConfig: string (nullable = true)
 |-- LandSlope: string (nullable = true)
 |-- Neighborhood: string (nullable = true)
 |-- Condition1: string (nullable = true)
 |-- Condition2: string (nullable = true)
 |-- BldgType: string (nullable = true)
 |-- HouseStyle: string (nullable = true)
 |-- OverallQual: integer (nullable = true)
 |-- OverallCond: integer (nullable = true)
 |-- YearBuilt: integer (nullable = true)
 |-- YearRemodAdd: integer (nullable = true)
 |-- RoofStyle: string (nullable = true)
 |-- RoofMatl: string (nullable = true)
 |-- Exterior1st: string (nullable = true)
 |--

In [14]:
# Checking for null values
from pyspark.sql.functions import isnull, when, count, col
DF.select([count(when(isnull(c), c)).alias(c) for c in DF.columns]).show()

+---+----------+--------+-----------+-------+------+-----+--------+-----------+---------+---------+---------+------------+----------+----------+--------+----------+-----------+-----------+---------+------------+---------+--------+-----------+-----------+----------+----------+---------+---------+----------+--------+--------+------------+------------+----------+------------+----------+---------+-----------+-------+---------+----------+----------+--------+--------+------------+---------+------------+------------+--------+--------+------------+------------+-----------+------------+----------+----------+-----------+----------+-----------+------------+----------+----------+----------+----------+----------+----------+-----------+-------------+---------+-----------+--------+------+-----+-----------+-------+------+------+--------+-------------+---------+
| Id|MSSubClass|MSZoning|LotFrontage|LotArea|Street|Alley|LotShape|LandContour|Utilities|LotConfig|LandSlope|Neighborhood|Condition1|Condition

In [15]:
DF.distinct().show()

+----+----------+--------+-----------+-------+------+-----+--------+-----------+---------+---------+---------+------------+----------+----------+--------+----------+-----------+-----------+---------+------------+---------+--------+-----------+-----------+----------+----------+---------+---------+----------+--------+--------+------------+------------+----------+------------+----------+---------+-----------+-------+---------+----------+----------+--------+--------+------------+---------+------------+------------+--------+--------+------------+------------+-----------+------------+----------+----------+-----------+----------+-----------+------------+----------+----------+----------+----------+----------+----------+-----------+-------------+---------+-----------+--------+------+-----+-----------+-------+------+------+--------+-------------+---------+
|  Id|MSSubClass|MSZoning|LotFrontage|LotArea|Street|Alley|LotShape|LandContour|Utilities|LotConfig|LandSlope|Neighborhood|Condition1|Conditi

In [16]:
# Replacing NA to null values for imputation
import warnings
warnings.filterwarnings('ignore')
def fixing_null_values(DF, col_name):
  if col_name != 'MasVnrType' and col_name != 'Electrical':
    new_DF = DF.withColumn(col_name, when((col(col_name)=='NA'), None).otherwise(col(col_name)).cast("float"))
  else:
    new_DF = DF.withColumn(col_name, when((col(col_name)=='NA'), None).otherwise(col(col_name)))
  return new_DF

DF2 = fixing_null_values(DF, 'LotFrontage')
DF3 = fixing_null_values(DF2, 'MasVnrType')
DF4 = fixing_null_values(DF3, 'MasVnrArea')
DF5 = fixing_null_values(DF4, 'Electrical')
DF6 = fixing_null_values(DF5, 'GarageYrBlt')
DF6.show()

+---+----------+--------+-----------+-------+------+-----+--------+-----------+---------+---------+---------+------------+----------+----------+--------+----------+-----------+-----------+---------+------------+---------+--------+-----------+-----------+----------+----------+---------+---------+----------+--------+--------+------------+------------+----------+------------+----------+---------+-----------+-------+---------+----------+----------+--------+--------+------------+---------+------------+------------+--------+--------+------------+------------+-----------+------------+----------+----------+-----------+----------+-----------+------------+----------+----------+----------+----------+----------+----------+-----------+-------------+---------+-----------+--------+------+-----+-----------+-------+------+------+--------+-------------+---------+
| Id|MSSubClass|MSZoning|LotFrontage|LotArea|Street|Alley|LotShape|LandContour|Utilities|LotConfig|LandSlope|Neighborhood|Condition1|Condition

In [17]:
# Confirming that there are null values in the dataset
DF6.distinct().show()

+----+----------+--------+-----------+-------+------+-----+--------+-----------+---------+---------+---------+------------+----------+----------+--------+----------+-----------+-----------+---------+------------+---------+--------+-----------+-----------+----------+----------+---------+---------+----------+--------+--------+------------+------------+----------+------------+----------+---------+-----------+-------+---------+----------+----------+--------+--------+------------+---------+------------+------------+--------+--------+------------+------------+-----------+------------+----------+----------+-----------+----------+-----------+------------+----------+----------+----------+----------+----------+----------+-----------+-------------+---------+-----------+--------+------+-----+-----------+-------+------+------+--------+-------------+---------+
|  Id|MSSubClass|MSZoning|LotFrontage|LotArea|Street|Alley|LotShape|LandContour|Utilities|LotConfig|LandSlope|Neighborhood|Condition1|Conditi

In [18]:
from pyspark.sql.functions import isnull, when, count, col
DF6.select([count(when(isnull(c), c)).alias(c) for c in DF6.columns]).show()

+---+----------+--------+-----------+-------+------+-----+--------+-----------+---------+---------+---------+------------+----------+----------+--------+----------+-----------+-----------+---------+------------+---------+--------+-----------+-----------+----------+----------+---------+---------+----------+--------+--------+------------+------------+----------+------------+----------+---------+-----------+-------+---------+----------+----------+--------+--------+------------+---------+------------+------------+--------+--------+------------+------------+-----------+------------+----------+----------+-----------+----------+-----------+------------+----------+----------+----------+----------+----------+----------+-----------+-------------+---------+-----------+--------+------+-----+-----------+-------+------+------+--------+-------------+---------+
| Id|MSSubClass|MSZoning|LotFrontage|LotArea|Street|Alley|LotShape|LandContour|Utilities|LotConfig|LandSlope|Neighborhood|Condition1|Condition

In [19]:
# Null value imputation
from pyspark.ml.feature import Imputer

imputer = Imputer(inputCols=['LotFrontage', 'MasVnrArea',  'GarageYrBlt'], outputCols=["{}_imputed".format(c) for c in ['LotFrontage', 'MasVnrArea', 'GarageYrBlt']]).setStrategy("median")

In [20]:
imputer.fit(DF6).transform(DF6).show()

+---+----------+--------+-----------+-------+------+-----+--------+-----------+---------+---------+---------+------------+----------+----------+--------+----------+-----------+-----------+---------+------------+---------+--------+-----------+-----------+----------+----------+---------+---------+----------+--------+--------+------------+------------+----------+------------+----------+---------+-----------+-------+---------+----------+----------+--------+--------+------------+---------+------------+------------+--------+--------+------------+------------+-----------+------------+----------+----------+-----------+----------+-----------+------------+----------+----------+----------+----------+----------+----------+-----------+-------------+---------+-----------+--------+------+-----+-----------+-------+------+------+--------+-------------+---------+-------------------+------------------+-------------------+
| Id|MSSubClass|MSZoning|LotFrontage|LotArea|Street|Alley|LotShape|LandContour|Util

In [21]:
# Replacing the null values in categorical columns
DF7 = DF6.na.fill("None",["MasVnrType"]).na.fill("SBrkr",["Electrical"])

In [22]:
# Dropping these columns post null value imputation
DF8 = DF7.drop('LotFrontage', 'MasVnrArea',  'GarageYrBlt')

In [23]:
DF8.show()

+---+----------+--------+-------+------+-----+--------+-----------+---------+---------+---------+------------+----------+----------+--------+----------+-----------+-----------+---------+------------+---------+--------+-----------+-----------+----------+---------+---------+----------+--------+--------+------------+------------+----------+------------+----------+---------+-----------+-------+---------+----------+----------+--------+--------+------------+---------+------------+------------+--------+--------+------------+------------+-----------+------------+----------+----------+-----------+----------+------------+----------+----------+----------+----------+----------+----------+-----------+-------------+---------+-----------+--------+------+-----+-----------+-------+------+------+--------+-------------+---------+
| Id|MSSubClass|MSZoning|LotArea|Street|Alley|LotShape|LandContour|Utilities|LotConfig|LandSlope|Neighborhood|Condition1|Condition2|BldgType|HouseStyle|OverallQual|OverallCond|Y

In [24]:
# Checking for null values one last time - no null values found
from pyspark.sql.functions import isnull, when, count, col
DF8.select([count(when(isnull(c), c)).alias(c) for c in DF8.columns]).show()

+---+----------+--------+-------+------+-----+--------+-----------+---------+---------+---------+------------+----------+----------+--------+----------+-----------+-----------+---------+------------+---------+--------+-----------+-----------+----------+---------+---------+----------+--------+--------+------------+------------+----------+------------+----------+---------+-----------+-------+---------+----------+----------+--------+--------+------------+---------+------------+------------+--------+--------+------------+------------+-----------+------------+----------+----------+-----------+----------+------------+----------+----------+----------+----------+----------+----------+-----------+-------------+---------+-----------+--------+------+-----+-----------+-------+------+------+--------+-------------+---------+
| Id|MSSubClass|MSZoning|LotArea|Street|Alley|LotShape|LandContour|Utilities|LotConfig|LandSlope|Neighborhood|Condition1|Condition2|BldgType|HouseStyle|OverallQual|OverallCond|Y

In [25]:
# Checking for duplicate values - no duplicate values found
DF8.groupBy(DF8.columns).count().filter("count > 1").show()

+---+----------+--------+-------+------+-----+--------+-----------+---------+---------+---------+------------+----------+----------+--------+----------+-----------+-----------+---------+------------+---------+--------+-----------+-----------+----------+---------+---------+----------+--------+--------+------------+------------+----------+------------+----------+---------+-----------+-------+---------+----------+----------+--------+--------+------------+---------+------------+------------+--------+--------+------------+------------+-----------+------------+----------+----------+-----------+----------+------------+----------+----------+----------+----------+----------+----------+-----------+-------------+---------+-----------+--------+------+-----+-----------+-------+------+------+--------+-------------+---------+-----+
| Id|MSSubClass|MSZoning|LotArea|Street|Alley|LotShape|LandContour|Utilities|LotConfig|LandSlope|Neighborhood|Condition1|Condition2|BldgType|HouseStyle|OverallQual|Overall

In [26]:
pandasDF = DF8.toPandas()
print(pandasDF)

        Id  MSSubClass MSZoning  LotArea Street Alley LotShape LandContour  \
0        1          60       RL     8450   Pave    NA      Reg         Lvl   
1        2          20       RL     9600   Pave    NA      Reg         Lvl   
2        3          60       RL    11250   Pave    NA      IR1         Lvl   
3        4          70       RL     9550   Pave    NA      IR1         Lvl   
4        5          60       RL    14260   Pave    NA      IR1         Lvl   
...    ...         ...      ...      ...    ...   ...      ...         ...   
1455  1456          60       RL     7917   Pave    NA      Reg         Lvl   
1456  1457          20       RL    13175   Pave    NA      Reg         Lvl   
1457  1458          70       RL     9042   Pave    NA      Reg         Lvl   
1458  1459          20       RL     9717   Pave    NA      Reg         Lvl   
1459  1460          20       RL     9937   Pave    NA      Reg         Lvl   

     Utilities LotConfig  ... PoolArea PoolQC  Fence MiscFeatur

In [27]:
from sdv.tabular import GaussianCopula
model = GaussianCopula()
model.fit(pandasDF)

In [28]:
sample = model.sample(20000)
sample.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1408,43,RL,18116,Pave,,IR1,Lvl,AllPub,Inside,...,34,,,,12,3,2009,WD,Normal,174167
1,351,73,RL,7687,Pave,,IR2,Lvl,AllPub,Corner,...,15,,,,1324,4,2007,WD,Normal,142248
2,312,95,RL,4020,Pave,,Reg,Lvl,AllPub,Corner,...,0,,,,406,7,2006,WD,Normal,88813
3,1077,46,RL,4503,Pave,,Reg,Lvl,AllPub,Inside,...,11,,,,183,4,2008,WD,Partial,88861
4,773,27,RL,19858,Pave,,IR1,Lvl,AllPub,Inside,...,13,,,,925,2,2008,New,Normal,424470


In [29]:
import pandas as pd
pandasDF = pd.concat([pandasDF, sample], axis=0)

In [30]:
pandasDF["Id"]

0           1
1           2
2           3
3           4
4           5
         ... 
19995     340
19996     140
19997    1364
19998     165
19999    1020
Name: Id, Length: 21460, dtype: int32

In [31]:
pandasDF.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21460 entries, 0 to 19999
Data columns (total 78 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Id             21460 non-null  int32 
 1   MSSubClass     21460 non-null  int32 
 2   MSZoning       21460 non-null  object
 3   LotArea        21460 non-null  int32 
 4   Street         21460 non-null  object
 5   Alley          21460 non-null  object
 6   LotShape       21460 non-null  object
 7   LandContour    21460 non-null  object
 8   Utilities      21460 non-null  object
 9   LotConfig      21460 non-null  object
 10  LandSlope      21460 non-null  object
 11  Neighborhood   21460 non-null  object
 12  Condition1     21460 non-null  object
 13  Condition2     21460 non-null  object
 14  BldgType       21460 non-null  object
 15  HouseStyle     21460 non-null  object
 16  OverallQual    21460 non-null  int32 
 17  OverallCond    21460 non-null  int32 
 18  YearBuilt      21460 non-n

In [32]:
DF = spark.createDataFrame(pandasDF) 

In [33]:
DF.columns

['Id',
 'MSSubClass',
 'MSZoning',
 'LotArea',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinSF1',
 'BsmtFinType2',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'KitchenQual',
 'TotRmsAbvGrd',
 'Functional',
 'Fireplaces',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageCars',
 'GarageArea',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch

In [34]:
DF.summary()

23/02/05 17:06:20 WARN TaskSetManager: Stage 28 contains a task of very large size (2099 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

DataFrame[summary: string, Id: string, MSSubClass: string, MSZoning: string, LotArea: string, Street: string, Alley: string, LotShape: string, LandContour: string, Utilities: string, LotConfig: string, LandSlope: string, Neighborhood: string, Condition1: string, Condition2: string, BldgType: string, HouseStyle: string, OverallQual: string, OverallCond: string, YearBuilt: string, YearRemodAdd: string, RoofStyle: string, RoofMatl: string, Exterior1st: string, Exterior2nd: string, MasVnrType: string, ExterQual: string, ExterCond: string, Foundation: string, BsmtQual: string, BsmtCond: string, BsmtExposure: string, BsmtFinType1: string, BsmtFinSF1: string, BsmtFinType2: string, BsmtFinSF2: string, BsmtUnfSF: string, TotalBsmtSF: string, Heating: string, HeatingQC: string, CentralAir: string, Electrical: string, 1stFlrSF: string, 2ndFlrSF: string, LowQualFinSF: string, GrLivArea: string, BsmtFullBath: string, BsmtHalfBath: string, FullBath: string, HalfBath: string, BedroomAbvGr: string, Ki

In [35]:
columnList = [item[0] for item in DF.dtypes if item[1].startswith('string')]

In [36]:
columnList

['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']

In [37]:
output_column_list = list(map(lambda x: x+"_index", columnList))

In [38]:
output_column_list

['MSZoning_index',
 'Street_index',
 'Alley_index',
 'LotShape_index',
 'LandContour_index',
 'Utilities_index',
 'LotConfig_index',
 'LandSlope_index',
 'Neighborhood_index',
 'Condition1_index',
 'Condition2_index',
 'BldgType_index',
 'HouseStyle_index',
 'RoofStyle_index',
 'RoofMatl_index',
 'Exterior1st_index',
 'Exterior2nd_index',
 'MasVnrType_index',
 'ExterQual_index',
 'ExterCond_index',
 'Foundation_index',
 'BsmtQual_index',
 'BsmtCond_index',
 'BsmtExposure_index',
 'BsmtFinType1_index',
 'BsmtFinType2_index',
 'Heating_index',
 'HeatingQC_index',
 'CentralAir_index',
 'Electrical_index',
 'KitchenQual_index',
 'Functional_index',
 'FireplaceQu_index',
 'GarageType_index',
 'GarageFinish_index',
 'GarageQual_index',
 'GarageCond_index',
 'PavedDrive_index',
 'PoolQC_index',
 'Fence_index',
 'MiscFeature_index',
 'SaleType_index',
 'SaleCondition_index']

In [39]:
# String indexer
from pyspark.ml.feature import StringIndexer, OneHotEncoder
indexers = StringIndexer(inputCols=columnList, 
                         outputCols=output_column_list)
strindexedDF = indexers.fit(DF).transform(DF)
DF9 = strindexedDF.select("*")


23/02/05 17:06:56 WARN TaskSetManager: Stage 31 contains a task of very large size (2099 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

In [40]:
DF9.show()

23/02/05 17:06:58 WARN TaskSetManager: Stage 34 contains a task of very large size (2099 KiB). The maximum recommended task size is 1000 KiB.
+---+----------+--------+-------+------+-----+--------+-----------+---------+---------+---------+------------+----------+----------+--------+----------+-----------+-----------+---------+------------+---------+--------+-----------+-----------+----------+---------+---------+----------+--------+--------+------------+------------+----------+------------+----------+---------+-----------+-------+---------+----------+----------+--------+--------+------------+---------+------------+------------+--------+--------+------------+------------+-----------+------------+----------+----------+-----------+----------+------------+----------+----------+----------+----------+----------+----------+-----------+-------------+---------+-----------+--------+------+-----+-----------+-------+------+------+--------+-------------+---------+--------------+------------+--------

In [41]:
# Dropping columns post String Indexing
DF10 = DF9.drop(*columnList)

In [42]:
DF10.printSchema()

root
 |-- Id: long (nullable = true)
 |-- MSSubClass: long (nullable = true)
 |-- LotArea: long (nullable = true)
 |-- OverallQual: long (nullable = true)
 |-- OverallCond: long (nullable = true)
 |-- YearBuilt: long (nullable = true)
 |-- YearRemodAdd: long (nullable = true)
 |-- BsmtFinSF1: long (nullable = true)
 |-- BsmtFinSF2: long (nullable = true)
 |-- BsmtUnfSF: long (nullable = true)
 |-- TotalBsmtSF: long (nullable = true)
 |-- 1stFlrSF: long (nullable = true)
 |-- 2ndFlrSF: long (nullable = true)
 |-- LowQualFinSF: long (nullable = true)
 |-- GrLivArea: long (nullable = true)
 |-- BsmtFullBath: long (nullable = true)
 |-- BsmtHalfBath: long (nullable = true)
 |-- FullBath: long (nullable = true)
 |-- HalfBath: long (nullable = true)
 |-- BedroomAbvGr: long (nullable = true)
 |-- KitchenAbvGr: long (nullable = true)
 |-- TotRmsAbvGrd: long (nullable = true)
 |-- Fireplaces: long (nullable = true)
 |-- GarageCars: long (nullable = true)
 |-- GarageArea: long (nullable = true)


In [43]:
DF10.show()

23/02/05 17:06:58 WARN TaskSetManager: Stage 35 contains a task of very large size (2099 KiB). The maximum recommended task size is 1000 KiB.
+---+----------+-------+-----------+-----------+---------+------------+----------+----------+---------+-----------+--------+--------+------------+---------+------------+------------+--------+--------+------------+------------+------------+----------+----------+----------+----------+-----------+-------------+---------+-----------+--------+-------+------+------+---------+--------------+------------+-----------+--------------+-----------------+---------------+---------------+---------------+------------------+----------------+----------------+--------------+----------------+---------------+--------------+-----------------+-----------------+----------------+---------------+---------------+----------------+--------------+--------------+------------------+------------------+------------------+-------------+---------------+----------------+-------------

In [44]:
DF10.columns

['Id',
 'MSSubClass',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold',
 'SalePrice',
 'MSZoning_index',
 'Street_index',
 'Alley_index',
 'LotShape_index',
 'LandContour_index',
 'Utilities_index',
 'LotConfig_index',
 'LandSlope_index',
 'Neighborhood_index',
 'Condition1_index',
 'Condition2_index',
 'BldgType_index',
 'HouseStyle_index',
 'RoofStyle_index',
 'RoofMatl_index',
 'Exterior1st_index',
 'Exterior2nd_index',
 'MasVnrType_index',
 'ExterQual_index',
 'ExterCond_index',
 'Foundation_index',
 'BsmtQual_index',
 'BsmtCond_index',
 'BsmtExposure_index',
 'Bsmt

In [45]:
input_cols = ['MSSubClass',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold']

In [46]:
output_cols = list(map(lambda x: x+"_scaled", input_cols))

In [47]:
# Vectorizing the independent features
from pyspark.ml.feature import VectorAssembler
featureassembler=VectorAssembler(inputCols=input_cols,outputCol="Independent_Features")

In [48]:
DF11 = featureassembler.transform(DF10)

In [49]:
DF11.show()

23/02/05 17:06:59 WARN TaskSetManager: Stage 36 contains a task of very large size (2099 KiB). The maximum recommended task size is 1000 KiB.
+---+----------+-------+-----------+-----------+---------+------------+----------+----------+---------+-----------+--------+--------+------------+---------+------------+------------+--------+--------+------------+------------+------------+----------+----------+----------+----------+-----------+-------------+---------+-----------+--------+-------+------+------+---------+--------------+------------+-----------+--------------+-----------------+---------------+---------------+---------------+------------------+----------------+----------------+--------------+----------------+---------------+--------------+-----------------+-----------------+----------------+---------------+---------------+----------------+--------------+--------------+------------------+------------------+------------------+-------------+---------------+----------------+-------------

In [50]:
# Import StandardScaler from pyspark.ml.feature package
from pyspark.ml.feature import StandardScaler

# Create the StandardScaler object. It only take feature column (dense vector)
stdscaler = StandardScaler(inputCol="Independent_Features", outputCol="Scaled_Features")

# Fit the StandardScaler object on the output of the dense vector data and transform
stdscaledDF = stdscaler.fit(DF11).transform(DF11)
stdscaledDF.select("*").show()

23/02/05 17:07:00 WARN TaskSetManager: Stage 37 contains a task of very large size (2099 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

23/02/05 17:07:01 WARN TaskSetManager: Stage 40 contains a task of very large size (2099 KiB). The maximum recommended task size is 1000 KiB.
+---+----------+-------+-----------+-----------+---------+------------+----------+----------+---------+-----------+--------+--------+------------+---------+------------+------------+--------+--------+------------+------------+------------+----------+----------+----------+----------+-----------+-------------+---------+-----------+--------+-------+------+------+---------+--------------+------------+-----------+--------------+-----------------+---------------+---------------+---------------+------------------+----------------+----------------+--------------+----------------+---------------+--------------+-----------------+-----------------+----------------+---------------+---------------+----------------+--------------+--------------+------------------+------------------+------------------+-------------+---------------+----------------+-------------

In [51]:
DF12 = stdscaledDF.drop(*input_cols, "Independent_Features")

In [52]:
DF12.show()

23/02/05 17:07:01 WARN TaskSetManager: Stage 41 contains a task of very large size (2099 KiB). The maximum recommended task size is 1000 KiB.
+---+---------+--------------+------------+-----------+--------------+-----------------+---------------+---------------+---------------+------------------+----------------+----------------+--------------+----------------+---------------+--------------+-----------------+-----------------+----------------+---------------+---------------+----------------+--------------+--------------+------------------+------------------+------------------+-------------+---------------+----------------+----------------+-----------------+----------------+-----------------+----------------+------------------+----------------+----------------+----------------+------------+-----------+-----------------+--------------+-------------------+--------------------+
| Id|SalePrice|MSZoning_index|Street_index|Alley_index|LotShape_index|LandContour_index|Utilities_index|LotConfig

In [53]:
DF12.columns

['Id',
 'SalePrice',
 'MSZoning_index',
 'Street_index',
 'Alley_index',
 'LotShape_index',
 'LandContour_index',
 'Utilities_index',
 'LotConfig_index',
 'LandSlope_index',
 'Neighborhood_index',
 'Condition1_index',
 'Condition2_index',
 'BldgType_index',
 'HouseStyle_index',
 'RoofStyle_index',
 'RoofMatl_index',
 'Exterior1st_index',
 'Exterior2nd_index',
 'MasVnrType_index',
 'ExterQual_index',
 'ExterCond_index',
 'Foundation_index',
 'BsmtQual_index',
 'BsmtCond_index',
 'BsmtExposure_index',
 'BsmtFinType1_index',
 'BsmtFinType2_index',
 'Heating_index',
 'HeatingQC_index',
 'CentralAir_index',
 'Electrical_index',
 'KitchenQual_index',
 'Functional_index',
 'FireplaceQu_index',
 'GarageType_index',
 'GarageFinish_index',
 'GarageQual_index',
 'GarageCond_index',
 'PavedDrive_index',
 'PoolQC_index',
 'Fence_index',
 'MiscFeature_index',
 'SaleType_index',
 'SaleCondition_index',
 'Scaled_Features']

In [54]:
type(DF12)

pyspark.sql.dataframe.DataFrame

In [55]:
DF12.columns

['Id',
 'SalePrice',
 'MSZoning_index',
 'Street_index',
 'Alley_index',
 'LotShape_index',
 'LandContour_index',
 'Utilities_index',
 'LotConfig_index',
 'LandSlope_index',
 'Neighborhood_index',
 'Condition1_index',
 'Condition2_index',
 'BldgType_index',
 'HouseStyle_index',
 'RoofStyle_index',
 'RoofMatl_index',
 'Exterior1st_index',
 'Exterior2nd_index',
 'MasVnrType_index',
 'ExterQual_index',
 'ExterCond_index',
 'Foundation_index',
 'BsmtQual_index',
 'BsmtCond_index',
 'BsmtExposure_index',
 'BsmtFinType1_index',
 'BsmtFinType2_index',
 'Heating_index',
 'HeatingQC_index',
 'CentralAir_index',
 'Electrical_index',
 'KitchenQual_index',
 'Functional_index',
 'FireplaceQu_index',
 'GarageType_index',
 'GarageFinish_index',
 'GarageQual_index',
 'GarageCond_index',
 'PavedDrive_index',
 'PoolQC_index',
 'Fence_index',
 'MiscFeature_index',
 'SaleType_index',
 'SaleCondition_index',
 'Scaled_Features']

In [56]:
# Import VectorAssembler from pyspark.ml.feature package
from pyspark.ml.feature import VectorAssembler
# Create a list of all the variables that you want to create feature vectors
# These features are then further used for training model
features_col = ['MSZoning_index',
 'Street_index',
 'Alley_index',
 'LotShape_index',
 'LandContour_index',
 'Utilities_index',
 'LotConfig_index',
 'LandSlope_index',
 'Neighborhood_index',
 'Condition1_index',
 'Condition2_index',
 'BldgType_index',
 'HouseStyle_index',
 'RoofStyle_index',
 'RoofMatl_index',
 'Exterior1st_index',
 'Exterior2nd_index',
 'MasVnrType_index',
 'ExterQual_index',
 'ExterCond_index',
 'Foundation_index',
 'BsmtQual_index',
 'BsmtCond_index',
 'BsmtExposure_index',
 'BsmtFinType1_index',
 'BsmtFinType2_index',
 'Heating_index',
 'HeatingQC_index',
 'CentralAir_index',
 'Electrical_index',
 'KitchenQual_index',
 'Functional_index',
 'FireplaceQu_index',
 'GarageType_index',
 'GarageFinish_index',
 'GarageQual_index',
 'GarageCond_index',
 'PavedDrive_index',
 'PoolQC_index',
 'Fence_index',
 'MiscFeature_index',
 'SaleType_index',
 'SaleCondition_index',
 'Scaled_Features']
# Create the VectorAssembler object
assembler = VectorAssembler(inputCols=features_col, outputCol="independent_features")
DF13 = assembler.transform(DF12)
final_DF = DF13.select('independent_features', 'SalePrice')
final_DF_without_sp = DF13.select('independent_features')

In [57]:
final_DF.show()

23/02/05 17:07:02 WARN TaskSetManager: Stage 42 contains a task of very large size (2099 KiB). The maximum recommended task size is 1000 KiB.
+--------------------+---------+
|independent_features|SalePrice|
+--------------------+---------+
|(76,[8,12,17,18,2...|   208500|
|(76,[6,8,9,15,16,...|   181500|
|(76,[3,8,12,17,18...|   223500|
|(76,[3,6,8,12,15,...|   140000|
|(76,[3,6,8,12,17,...|   250000|
|(76,[3,8,12,20,21...|   143000|
|(76,[8,17,18,20,2...|   307000|
|(76,[3,6,8,9,12,1...|   200000|
|(76,[0,8,9,12,15,...|   129900|
|(76,[6,8,9,10,11,...|   118000|
|(76,[8,13,15,16,2...|   129500|
|(76,[3,8,12,13,15...|   345000|
|(76,[3,8,13,15,16...|   144000|
|(76,[3,8,17,18,20...|   279500|
|(76,[3,6,13,15,16...|   157000|
|(76,[0,6,8,12,15,...|   132000|
|(76,[3,6,15,16,17...|   149000|
|(76,[8,11,15,16,2...|    90000|
|(76,[8,9,20,30,33...|   159000|
|(76,[13,15,16,24,...|   139000|
+--------------------+---------+
only showing top 20 rows



In [58]:
# Train - test split
# We spilt the data into 70-30 set
# Training Set - 70% obesevations
# Testing Set - 30% observations
trainDF, testDF =  final_DF.randomSplit([0.7,0.3], seed = 42)

# print the count of observations in each set
print("Observations in training set = ", trainDF.count())
print("Observations in testing set = ", testDF.count())

23/02/05 17:07:02 WARN TaskSetManager: Stage 43 contains a task of very large size (2099 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

Observations in training set =  15163
23/02/05 17:07:04 WARN TaskSetManager: Stage 46 contains a task of very large size (2099 KiB). The maximum recommended task size is 1000 KiB.


[Stage 46:>                                                         (0 + 2) / 2]

Observations in testing set =  6297


                                                                                

In [59]:
# Linear Regression
from pyspark.ml.regression import LinearRegression
regressor=LinearRegression(featuresCol='independent_features', labelCol='SalePrice')
lr_model=regressor.fit(trainDF)

23/02/05 17:07:05 WARN Instrumentation: [566a3ada] regParam is zero, which might cause numerical instability and overfitting.
23/02/05 17:07:05 WARN TaskSetManager: Stage 49 contains a task of very large size (2099 KiB). The maximum recommended task size is 1000 KiB.


[Stage 49:>                                                         (0 + 2) / 2]                                                                                

23/02/05 17:07:06 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/02/05 17:07:06 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
23/02/05 17:07:06 WARN InstanceBuilder$NativeLAPACK: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK
23/02/05 17:07:06 WARN Instrumentation: [566a3ada] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
23/02/05 17:07:06 WARN TaskSetManager: Stage 50 contains a task of very large size (2099 KiB). The maximum recommended task size is 1000 KiB.


[Stage 50:>                                                         (0 + 2) / 2]                                                                                

In [60]:
# Coefficients
lr_model.coefficients

DenseVector([-4291.4193, -31945.9108, 4782.5362, -228.6162, -2056.0086, 0.0, -680.1203, -847.6024, 511.149, -3110.9392, -8643.8307, -216.7968, 920.0321, 2026.0974, -480.1417, 1145.9896, -1269.1715, 1689.4456, 7144.7525, -302.6631, 2280.5995, 5400.3236, -265.7989, 2683.6527, -15.2069, 680.3132, -676.5423, -186.4936, -1819.1989, -997.3479, 6765.1922, -2578.9566, 405.9947, 395.3111, 2746.1192, 1390.7654, 289.583, 1125.6178, 10042.1634, -1026.8634, 2315.4076, 1814.6172, -1425.9112, -5324.2007, 6206.6836, 18570.2219, 6701.6294, 11456.6647, 2940.8505, 5360.2406, -1381.931, -2753.3267, 9299.2581, -1660.5946, -3889.9947, -1373.1651, 31501.1271, 1646.7994, -446.5343, 2387.0482, 1104.8995, -3757.3306, -1289.3614, 2941.9449, 2368.6641, 4349.1175, 3031.629, 1500.0838, -1170.5002, 799.3312, 420.5817, 3650.2159, -2633.6798, -129.4327, -142.7281, -1016.7646])

In [61]:
# Intercepts
lr_model.intercept

489229.2352229924

In [62]:
# Prediction
pred_results_lr=lr_model.evaluate(testDF)

23/02/05 17:07:07 WARN TaskSetManager: Stage 51 contains a task of very large size (2099 KiB). The maximum recommended task size is 1000 KiB.


[Stage 51:>                                                         (0 + 2) / 2]                                                                                

In [63]:
pred_results_lr.predictions.show()

23/02/05 17:07:08 WARN TaskSetManager: Stage 52 contains a task of very large size (2099 KiB). The maximum recommended task size is 1000 KiB.
+--------------------+---------+------------------+
|independent_features|SalePrice|        prediction|
+--------------------+---------+------------------+
|(76,[0,1,4,8,15,1...|    81000|  60142.0059989795|
|(76,[0,2,3,6,8,12...|   265979|261722.26276450203|
|(76,[0,2,3,6,12,1...|    84967|145413.96088471607|
|(76,[0,2,3,8,9,12...|    89500|126058.34276704019|
|(76,[0,2,3,8,11,1...|   196000|201984.73244801577|
|(76,[0,2,3,8,11,1...|   197000|229782.56173038454|
|(76,[0,2,3,8,11,1...|   135387|170228.72357480339|
|(76,[0,2,3,8,12,1...|    74512|  77731.7108701758|
|(76,[0,2,4,7,8,11...|    93000|116276.18922505033|
|(76,[0,2,4,8,9,11...|   200000|231718.64930176496|
|(76,[0,2,4,8,9,12...|   115000|120934.24022642168|
|(76,[0,2,6,8,9,12...|   168000|190658.32308121672|
|(76,[0,2,6,8,9,12...|   124000| 142779.9846316913|
|(76,[0,2,6,8,11,1...|   1

In [64]:
# Printing metrics
print("Linear Regression - RMSE: ",pred_results_lr.meanSquaredError)
print("Linear Regression - MAE: ",pred_results_lr.meanAbsoluteError)
print("Linear Regression - R-squared: ",pred_results_lr.r2)

Linear Regression - RMSE:  1160804080.1256924
Linear Regression - MAE:  26315.61537304066
Linear Regression - R-squared:  0.8193619300677886


In [65]:
# Saving the metrics to a file
with open("data/train_data_metrics.csv",'a') as f:
    print("Training Dataset Metrics ", file=f)
    print("Linear Regression - RMSE: ",pred_results_lr.meanSquaredError, file=f)
    print("Linear Regression - MAE: ",pred_results_lr.meanAbsoluteError, file=f)
    print("Linear Regression - R-squared: ",pred_results_lr.r2, file=f)

In [66]:
# Random Forest Regression
from pyspark.ml.regression import RandomForestRegressor
rf = RandomForestRegressor(featuresCol='independent_features', labelCol='SalePrice',numTrees=20, maxDepth=8)
rf.setSeed(17)
rf_model = rf.fit(trainDF)

23/02/05 17:07:09 WARN TaskSetManager: Stage 53 contains a task of very large size (2099 KiB). The maximum recommended task size is 1000 KiB.


[Stage 53:>                                                         (0 + 1) / 1]                                                                                

23/02/05 17:07:10 WARN TaskSetManager: Stage 54 contains a task of very large size (2099 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

23/02/05 17:07:11 WARN TaskSetManager: Stage 55 contains a task of very large size (2099 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

23/02/05 17:07:12 WARN TaskSetManager: Stage 57 contains a task of very large size (2099 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

23/02/05 17:07:14 WARN TaskSetManager: Stage 59 contains a task of very large size (2099 KiB). The maximum recommended task size is 1000 KiB.
23/02/05 17:07:14 WARN TaskSetManager: Stage 61 contains a task of very large size (2099 KiB). The maximum recommended task size is 1000 KiB.
23/02/05 17:07:15 WARN TaskSetManager: Stage 63 contains a task of very large size (2099 KiB). The maximum recommended task size is 1000 KiB.
23/02/05 17:07:15 WARN TaskSetManager: Stage 65 contains a task of very large size (2099 KiB). The maximum recommended task size is 1000 KiB.
23/02/05 17:07:16 WARN TaskSetManager: Stage 67 contains a task of very large size (2099 KiB). The maximum recommended task size is 1000 KiB.
23/02/05 17:07:17 WARN TaskSetManager: Stage 69 contains a task of very large size (2099 KiB). The maximum recommended task size is 1000 KiB.
23/02/05 17:07:18 WARN DAGScheduler: Broadcasting large task binary with size 1349.6 KiB
23/02/05 17:07:18 WARN TaskSetManager: Stage 71 contains a 

                                                                                

In [67]:
pred_results_rf=rf_model.transform(testDF)



In [68]:
pred_results_rf.show()

23/02/05 17:07:20 WARN TaskSetManager: Stage 73 contains a task of very large size (2099 KiB). The maximum recommended task size is 1000 KiB.
+--------------------+---------+------------------+
|independent_features|SalePrice|        prediction|
+--------------------+---------+------------------+
|(76,[0,1,4,8,15,1...|    81000| 82932.83690149494|
|(76,[0,2,3,6,8,12...|   265979| 261388.4897233137|
|(76,[0,2,3,6,12,1...|    84967| 155745.3802926768|
|(76,[0,2,3,8,9,12...|    89500|132997.15042218514|
|(76,[0,2,3,8,11,1...|   196000|  212114.666218738|
|(76,[0,2,3,8,11,1...|   197000|241370.57865442554|
|(76,[0,2,3,8,11,1...|   135387|154052.70171450078|
|(76,[0,2,3,8,12,1...|    74512|63408.511391466214|
|(76,[0,2,4,7,8,11...|    93000| 93489.32287163101|
|(76,[0,2,4,8,9,11...|   200000|203143.33317987906|
|(76,[0,2,4,8,9,12...|   115000|131066.95204255279|
|(76,[0,2,6,8,9,12...|   168000| 202814.5511250876|
|(76,[0,2,6,8,9,12...|   124000|142585.76494619186|
|(76,[0,2,6,8,11,1...|   1

In [69]:
# Gradient Boost Regression
from pyspark.ml.regression import GBTRegressor
gbtr = GBTRegressor(featuresCol='independent_features', labelCol='SalePrice', maxIter=17)
gbr_model = gbtr.fit(trainDF)

23/02/05 17:07:21 WARN TaskSetManager: Stage 74 contains a task of very large size (2099 KiB). The maximum recommended task size is 1000 KiB.
23/02/05 17:07:21 WARN TaskSetManager: Stage 75 contains a task of very large size (2099 KiB). The maximum recommended task size is 1000 KiB.


[Stage 75:>                                                         (0 + 2) / 2]                                                                                

23/02/05 17:07:22 WARN TaskSetManager: Stage 76 contains a task of very large size (2099 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

23/02/05 17:07:23 WARN TaskSetManager: Stage 78 contains a task of very large size (2099 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

23/02/05 17:07:24 WARN TaskSetManager: Stage 80 contains a task of very large size (2099 KiB). The maximum recommended task size is 1000 KiB.
23/02/05 17:07:24 WARN TaskSetManager: Stage 82 contains a task of very large size (2099 KiB). The maximum recommended task size is 1000 KiB.
23/02/05 17:07:25 WARN TaskSetManager: Stage 84 contains a task of very large size (2099 KiB). The maximum recommended task size is 1000 KiB.
23/02/05 17:07:25 WARN TaskSetManager: Stage 86 contains a task of very large size (2099 KiB). The maximum recommended task size is 1000 KiB.
23/02/05 17:07:25 WARN TaskSetManager: Stage 88 contains a task of very large size (2099 KiB). The maximum recommended task size is 1000 KiB.
23/02/05 17:07:25 WARN TaskSetManager: Stage 90 contains a task of very large size (2099 KiB). The maximum recommended task size is 1000 KiB.
23/02/05 17:07:25 WARN TaskSetManager: Stage 92 contains a task of very large size (2099 KiB). The maximum recommended task size is 1000 KiB.
23/02/

23/02/05 17:07:31 WARN TaskSetManager: Stage 198 contains a task of very large size (2099 KiB). The maximum recommended task size is 1000 KiB.
23/02/05 17:07:32 WARN TaskSetManager: Stage 200 contains a task of very large size (2099 KiB). The maximum recommended task size is 1000 KiB.
23/02/05 17:07:32 WARN TaskSetManager: Stage 202 contains a task of very large size (2099 KiB). The maximum recommended task size is 1000 KiB.
23/02/05 17:07:32 WARN TaskSetManager: Stage 204 contains a task of very large size (2099 KiB). The maximum recommended task size is 1000 KiB.
23/02/05 17:07:32 WARN TaskSetManager: Stage 206 contains a task of very large size (2099 KiB). The maximum recommended task size is 1000 KiB.
23/02/05 17:07:32 WARN TaskSetManager: Stage 208 contains a task of very large size (2099 KiB). The maximum recommended task size is 1000 KiB.
23/02/05 17:07:32 WARN TaskSetManager: Stage 210 contains a task of very large size (2099 KiB). The maximum recommended task size is 1000 KiB.

In [70]:
pred_results_gb = gbr_model.transform(testDF)

In [71]:
pred_results_gb.show()

23/02/05 17:07:34 WARN TaskSetManager: Stage 248 contains a task of very large size (2099 KiB). The maximum recommended task size is 1000 KiB.
+--------------------+---------+------------------+
|independent_features|SalePrice|        prediction|
+--------------------+---------+------------------+
|(76,[0,1,4,8,15,1...|    81000|  55747.0691627625|
|(76,[0,2,3,6,8,12...|   265979| 228310.6221244872|
|(76,[0,2,3,6,12,1...|    84967| 138428.9994549804|
|(76,[0,2,3,8,9,12...|    89500|124022.85776153856|
|(76,[0,2,3,8,11,1...|   196000|200669.64673484946|
|(76,[0,2,3,8,11,1...|   197000| 254942.2024614788|
|(76,[0,2,3,8,11,1...|   135387|135015.13830454944|
|(76,[0,2,3,8,12,1...|    74512| 73157.38079330922|
|(76,[0,2,4,7,8,11...|    93000|  72507.0180294814|
|(76,[0,2,4,8,9,11...|   200000|250279.20543797468|
|(76,[0,2,4,8,9,12...|   115000|124563.77830430432|
|(76,[0,2,6,8,9,12...|   168000| 214854.2285138366|
|(76,[0,2,6,8,9,12...|   124000|117456.85325344128|
|(76,[0,2,6,8,11,1...|   

[Stage 248:>                                                        (0 + 1) / 1]                                                                                

In [72]:
# Evaluating metrics for RF and GB regression algorithms
from pyspark.ml.evaluation import RegressionEvaluator
def metrics(df, model_name):
  rmse=RegressionEvaluator(labelCol="SalePrice", predictionCol="prediction", metricName="rmse")
  rmse=rmse.evaluate(df) 
 
  mae=RegressionEvaluator(labelCol="SalePrice", predictionCol="prediction", metricName="mae")
  mae=mae.evaluate(df) 
 
  r2=RegressionEvaluator(labelCol="SalePrice", predictionCol="prediction", metricName="r2")
  r2=r2.evaluate(df)

  with open("Train_data_metrics5.csv",'a') as f:
    print("{0} - RMSE: {1}".format(model_name,rmse), file=f)
    print("{0} - MAE: {1}".format(model_name,mae), file=f)
    print("{0} - R-squared: {1}".format(model_name,r2), file=f)

metrics(pred_results_rf, "Random Forest Regression")
metrics(pred_results_gb, "Gradient Boost Regression")

23/02/05 17:07:35 WARN TaskSetManager: Stage 249 contains a task of very large size (2099 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

23/02/05 17:07:36 WARN TaskSetManager: Stage 250 contains a task of very large size (2099 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

23/02/05 17:07:37 WARN TaskSetManager: Stage 251 contains a task of very large size (2099 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

23/02/05 17:07:38 WARN TaskSetManager: Stage 252 contains a task of very large size (2099 KiB). The maximum recommended task size is 1000 KiB.




23/02/05 17:07:39 WARN TaskSetManager: Stage 253 contains a task of very large size (2099 KiB). The maximum recommended task size is 1000 KiB.


[Stage 253:>                                                        (0 + 2) / 2]                                                                                

23/02/05 17:07:40 WARN TaskSetManager: Stage 254 contains a task of very large size (2099 KiB). The maximum recommended task size is 1000 KiB.


[Stage 254:>                                                        (0 + 2) / 2]                                                                                

In [73]:
# Saving the Models
lr_model.write().overwrite().save("generated_models/linearmodel")
rf_model.write().overwrite().save("generated_models/rfmodel")
gbr_model.write().overwrite().save("generated_models/gbrmodel")

[Stage 258:>                                                        (0 + 1) / 1]                                                                                