# **PySpark: Machine learning approach**

### Library and PySpark Session

In [1]:
## Libraries
import numpy as np
import pandas as pd 
import pyspark

In [2]:
from pyspark.sql import SparkSession

## Build the Spark session for this notebook under the application name 'ML_intr'
## (getOrCreate returns an existing session if there is one, otherwise a new one).
spark = (
    SparkSession.builder
    .appName('ML_intr')
    .getOrCreate()
)

## Bare last expression so the notebook renders the session object.
spark

### Load Dataset

In [6]:
## Load the housing CSV: the first row holds the column names, column types are
## inferred from the data, and multiLine=True lets a quoted field (the Address
## column contains embedded newlines) span several physical lines.
df = (
    spark.read.format("csv")
    .options(header=True, sep=',', inferSchema=True, multiLine=True)
    .load("data/housing.csv")
)

## Replace the ". " and space separators in the CSV headers with underscores.
for old_name, new_name in [
    ("Avg. Area Income", "Avg_Area_Income"),
    ("Avg. Area House Age", "Avg_Area_House_Age"),
    ("Avg. Area Number of Rooms", "Avg_Area_Number_of_Rooms"),
    ("Avg. Area Number of Bedrooms", "Avg_Area_Number_of_Bedrooms"),
    ("Area Population", "Area_Population"),
]:
    df = df.withColumnRenamed(old_name, new_name)

## Preview the first five rows.
df.show(5)

+---------------+------------------+------------------------+---------------------------+---------------+-----------+--------------------+
|Avg_Area_Income|Avg_Area_House_Age|Avg_Area_Number_of_Rooms|Avg_Area_Number_of_Bedrooms|Area_Population|      Price|             Address|
+---------------+------------------+------------------------+---------------------------+---------------+-----------+--------------------+
|    79545.45857|       5.682861322|             7.009188143|                       4.09|     23086.8005|1059033.558|208 Michael Ferry...|
|    79248.64245|       6.002899808|             6.730821019|                       3.09|    40173.07217|1505890.915|188 Johnson Views...|
|    61287.06718|        5.86588984|              8.51272743|                       5.13|     36882.1594|1058987.988|9127 Elizabeth St...|
|    63345.24005|       7.188236095|             5.586728665|                       3.26|    34310.24283|1260616.807|USS Barnett\nFPO ...|
|    59982.19723|       5.0

## ML approach: Vector Assembler and Linear Regression
`VectorAssembler` is a feature transformer that merges several columns into a single vector column.
https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.feature.VectorAssembler.html

In [9]:
## VectorAssembler comes from pyspark.ml.feature
from pyspark.ml.feature import VectorAssembler

## Price will be predicted from two features: the average number of rooms
## and the average house age of the area.
assembler = VectorAssembler(
    inputCols=['Avg_Area_Number_of_Rooms', 'Avg_Area_House_Age'],
    outputCol='features',
)

## Apply the assembler to df. Nothing is fitted here: transform() only adds a
## 'features' column holding the selected inputs combined into one vector.
output = assembler.transform(df)
output.show(5)

## Keep just the assembled feature vector and the target column for modelling.
final_data = output.select('features', 'Price')
final_data.show(5)


+---------------+------------------+------------------------+---------------------------+---------------+-----------+--------------------+--------------------+
|Avg_Area_Income|Avg_Area_House_Age|Avg_Area_Number_of_Rooms|Avg_Area_Number_of_Bedrooms|Area_Population|      Price|             Address|            features|
+---------------+------------------+------------------------+---------------------------+---------------+-----------+--------------------+--------------------+
|    79545.45857|       5.682861322|             7.009188143|                       4.09|     23086.8005|1059033.558|208 Michael Ferry...|[7.009188143,5.68...|
|    79248.64245|       6.002899808|             6.730821019|                       3.09|    40173.07217|1505890.915|188 Johnson Views...|[6.730821019,6.00...|
|    61287.06718|        5.86588984|              8.51272743|                       5.13|     36882.1594|1058987.988|9127 Elizabeth St...|[8.51272743,5.865...|
|    63345.24005|       7.188236095|    

In [14]:
## Split the data 70/30 into training and test sets.
## The seed is fixed so that re-running the notebook on the same input yields the
## same split, and therefore the same coefficients and metrics; without a seed
## every run draws a different random partition.
SPLIT_SEED = 42
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=SPLIT_SEED)

## Import Linear Regression
from pyspark.ml.regression import LinearRegression

## Configure the estimator: 'features' is the assembled vector column and
## 'Price' is the target to predict.
lr = LinearRegression(featuresCol='features', labelCol='Price')

## Fit on the training split only; the result is the trained model lrModel.
lrModel = lr.fit(train_data)

## Fitted coefficients (one per entry of the feature vector) and the intercept.
print("Coefficients: {} Intercept: {}".format(lrModel.coefficients, lrModel.intercept))

## Score the held-out test split; the summary carries predictions and metrics.
test_results = lrModel.evaluate(test_data)

## Actual Price next to the model's prediction for a few test rows.
test_results.predictions.show(5)

## Test-set metrics: root mean squared error, mean squared error and R^2.
print("RMSE: {}".format(test_results.rootMeanSquaredError))
print("MSE: {}".format(test_results.meanSquaredError))
print("R2: {}".format(test_results.r2))


Coefficients: [117568.92911530565,159632.21737298174] Intercept: -544509.5799187213
+--------------------+-----------+------------------+
|            features|      Price|        prediction|
+--------------------+-----------+------------------+
|[4.290193826,7.23...|934744.0335|1115600.9473878723|
|[4.29822055,7.275...|1457230.646| 1122260.008563473|
|[4.321938664,7.12...|944491.0396| 1100742.050573925|
|[4.347851966,5.70...|957806.0027| 876655.2572450972|
|[4.407346082,7.07...|1586889.995|1102405.6910740556|
+--------------------+-----------+------------------+
only showing top 5 rows

RMSE: 290112.72514620423
MSE: 84165393291.75703
R2: 0.335058472291455
