In [None]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read the Ames housing training set from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='data/ames/train.csv')
df.head(n=5)

In [None]:
# Start with a visual look at the data: how is our target column, SalePrice, distributed?
salePrices = df['SalePrice']
salePrices.hist(bins=50)

In [None]:
# Does the usual intuition hold here, i.e. do more rooms and more living area go with a higher price?
# Two side-by-side scatter plots against SalePrice:
#   - `TotRmsAbvGrd`: total number of rooms above the ground floor, bathrooms excluded
#   - `GrLivArea`:    the living area
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))
for panel, feature in zip(ax, ('TotRmsAbvGrd', 'GrLivArea')):
    df.plot(kind='scatter', x=feature, y='SalePrice', ax=panel)
plt.tight_layout()

In [None]:
# Which columns are numerical and which are categorical? The documentation is the real
# answer, but a histogram of a column is a quick hack for getting a first impression.
# (It is only a heuristic and does not always work.)
fig, ax = plt.subplots(1, 2, figsize=(12, 4))
subclassAx, frontageAx = ax

df['MSSubClass'].hist(ax=subclassAx, bins=20)
subclassAx.set_title('Histogram of MSSubClass')

df['LotFrontage'].hist(ax=frontageAx, bins=50)
frontageAx.set_title('Histogram of street area around property')

In [None]:
# Show every column name in the frame; the feature lists in the next cell are picked from these.
df.columns

In [None]:
# Hand-curated feature lists. Sorting the columns into these two groups is slow,
# manual work, so be patient.
#
# categoricalCols: columns that are string-indexed and one-hot encoded later on.
categoricalCols = """
    MSSubClass MSZoning Street Alley LotShape LandContour Utilities
    LotConfig LandSlope Neighborhood Condition1 Condition2 BldgType
    HouseStyle OverallQual OverallCond RoofStyle RoofMatl Exterior1st
    Exterior2nd MasVnrType ExterQual ExterCond Foundation BsmtQual BsmtCond
    BsmtExposure BsmtFinType1 BsmtFinType2 Heating HeatingQC CentralAir
    Electrical KitchenQual Functional FireplaceQu GarageType GarageFinish
    GarageQual GarageCond PavedDrive PoolQC Fence MiscFeature SaleType
    SaleCondition
""".split()

# numericalCols: columns that are fed to the model as plain numbers.
numericalCols = """
    LotFrontage LotArea MasVnrArea BsmtFinSF1 1stFlrSF 2ndFlrSF LowQualFinSF
    GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr
    KitchenAbvGr TotRmsAbvGrd Fireplaces GarageCars GarageArea WoodDeckSF
    OpenPorchSF EnclosedPorch 3SsnPorch ScreenPorch PoolArea MiscVal
""".split()

In [None]:
# What if we train on the numerical columns only?
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler

In [None]:
# Data cleaning: categorical columns may contain mixed types (for example strings next to
# float NaN), so every one of them is coerced to plain strings before the frame goes to Spark.
#
# Missing values are filled *before* the cast. The earlier order (cast, then
# `df[c].fillna("", inplace=True)`) made the fill a no-op: `astype(str)` had already turned
# NaN into the text 'nan', and an in-place fill on a column selection is a chained assignment
# that pandas warns about and that does not write back to `df` under copy-on-write.
#
# The placeholder stays 'nan' so the category labels seen downstream are the same ones the
# old code effectively produced.
# NOTE(review): keep the placeholder non-empty. Spark ML reportedly rejects empty-string
# category names during one-hot encoding; confirm before switching to ''.
MISSING_LABEL = 'nan'

for c in categoricalCols:
    df[c] = df[c].fillna(MISSING_LABEL).astype(str)

# NOTE(review): `spark` is not defined in any visible cell; presumably the kernel provides a
# SparkSession under that name. Verify, or create one explicitly, before a fresh Run All.
sdf = spark.createDataFrame(df)

# Numeric-only view (features plus the SalePrice label) used for the baseline model.
numericalDataset = sdf.select(*numericalCols, 'SalePrice')

In [None]:
# Pack every numeric column except the label into a single 'features' vector column.
# handleInvalid='skip' is passed through unchanged from the original configuration.
featureCols = [name for name in numericalDataset.columns if name != 'SalePrice']
assembler = VectorAssembler(inputCols=featureCols, outputCol='features', handleInvalid='skip')
numDataset = assembler.transform(numericalDataset)

In [None]:
# The split below is random and unseeded, so the score changes from run to run.
# Re-run this cell a few times to see the spread - God doesn't play dice, but we have to.
trainData, testData = numDataset.randomSplit(weights=[0.7, 0.3])

# Plain linear regression on the numeric features only.
lr = LinearRegression(labelCol='SalePrice', featuresCol='features')
model = lr.fit(trainData)

# R2 on the held-out part of the split.
summary = model.evaluate(testData)
summary.r2

In [None]:
# Can we do better?

In [None]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder

In [None]:
# Encode every categorical column in two steps (string -> index -> one-hot vector),
# appending the new columns to the running dataset on each pass.
masterDataset = sdf

for colName in categoricalCols:
    indexCol = colName + 'Index'
    vectorCol = colName + 'CategoryVec'

    # Both fit() and transform() see the dataset as it was before this step.
    indexer = StringIndexer(inputCol=colName, outputCol=indexCol)
    masterDataset = indexer.fit(masterDataset).transform(masterDataset)

    ohe = OneHotEncoder(inputCol=indexCol, outputCol=vectorCol)
    masterDataset = ohe.fit(masterDataset).transform(masterDataset)

# Final feature vector: all one-hot vectors first, then the raw numeric columns.
outputCol = 'features'
inputCols = [name + 'CategoryVec' for name in categoricalCols]
inputCols.extend(numericalCols)

assembler = VectorAssembler(inputCols=inputCols, outputCol=outputCol, handleInvalid='skip')
finalDataset = assembler.transform(masterDataset)

In [None]:
# Same experiment as the numeric-only baseline, now on the full feature vector
# (one-hot categorical columns plus numeric columns). The split is random and unseeded.
trainData, testData = finalDataset.randomSplit(weights=[0.7, 0.3])

lr = LinearRegression(labelCol='SalePrice', featuresCol='features')
model = lr.fit(trainData)

# R2 on the held-out part of the split.
summary = model.evaluate(testData)
summary.r2

In [None]:
# Let's pick lambda for L2 regularization

In [None]:
import numpy as np

# Candidate regularization strengths: 20 values evenly spaced on a log scale from 1e-2 to 1e2.
valid_lambdas = np.logspace(start=-2, stop=2, num=20)

In [None]:
# Sweep the L2 regularization strength: fit one model per lambda on a single random
# train/test split and record R2 on both parts, so the two curves can be compared.
train_r2 = []
test_r2 = []
train, test = finalDataset.randomSplit([0.7, 0.3])

for lam in valid_lambdas:
    # elasticNetParam=0 selects the L2 penalty; regParam is its strength (lambda).
    lr = LinearRegression(featuresCol='features', labelCol='SalePrice', elasticNetParam=0, regParam=lam)
    model = lr.fit(train)

    # Evaluate on training data
    summary_train = model.evaluate(train)
    train_r2.append(summary_train.r2)

    # Evaluate on test data
    summary_test = model.evaluate(test)
    test_r2.append(summary_test.r2)


plt.plot(valid_lambdas, train_r2, 'ro-', label='train')
plt.plot(valid_lambdas, test_r2, 'go-', label='test')
# Raw string: '\l' is not a valid escape sequence, so the backslash must be kept literal for mathtext.
plt.xlabel(r'$\lambda$')
plt.ylabel('R2')
plt.xscale('log')
plt.legend()

In [None]:
# try to change lambda search and run this again

In [None]:
# Checking for L1

In [None]:
# Candidate L1 strengths: 20 evenly spaced values from 0 to 10, both endpoints included.
valid_lambdas = np.linspace(start=0, stop=10, num=20)

In [None]:
# Sweep the L1 regularization strength: fit one model per lambda on a single random
# train/test split and record R2 on both parts, so the two curves can be compared.
train_r2 = []
test_r2 = []
train, test = finalDataset.randomSplit([0.7, 0.3])

for lam in valid_lambdas:
    # elasticNetParam=1 selects the L1 penalty; regParam is its strength (lambda).
    lr = LinearRegression(featuresCol='features', labelCol='SalePrice', elasticNetParam=1, regParam=lam)
    model = lr.fit(train)

    # Evaluate on training data
    summary_train = model.evaluate(train)
    train_r2.append(summary_train.r2)

    # Evaluate on test data
    summary_test = model.evaluate(test)
    test_r2.append(summary_test.r2)


plt.plot(valid_lambdas, train_r2, 'ro-', label='train')
plt.plot(valid_lambdas, test_r2, 'go-', label='test')
# Raw string: '\l' is not a valid escape sequence, so the backslash must be kept literal for mathtext.
plt.xlabel(r'$\lambda$')
plt.ylabel('R2')
# A log axis cannot show lambda = 0 (the first point of a grid that starts at zero),
# so only use it when every lambda in the grid is strictly positive.
plt.xscale('log' if min(valid_lambdas) > 0 else 'linear')
plt.legend()

In [None]:
# Distribution of the fitted coefficients of whatever is currently bound to `model`
# (when the cells are run in order, that is the last fit of the preceding sweep).
coefficients = pd.Series(np.array(model.coefficients))
coefficients.hist(bins=100)