# PySpark 04 PySpark ML

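**Summary** — This notebook starts a Spark session, loads a small salary dataset from CSV, assembles `age` and `Experience` into a single feature vector with `VectorAssembler`, and then fits and evaluates a `LinearRegression` model that predicts `Salary`.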


### Setup

In [46]:
import os 
import pyspark
from pyspark.sql import SparkSession

# Create the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('pysparkML')
    .getOrCreate()
)
spark

### Load

In [47]:
# Read the dataset.
# ROOT defaults to the original local folder but can be overridden with the
# PYSPARK_DATA_DIR environment variable, so the notebook also runs on machines
# that do not have this exact directory.
ROOT = os.environ.get(
    "PYSPARK_DATA_DIR",
    "C:\\Users\\0woo0\\OneDrive\\바탕 화면\\PySpark\\",
)
FILE = "./train.csv"

# os.path.join inserts a separator only when ROOT lacks a trailing one, so an
# override without a trailing slash still resolves to a valid path.
train = spark.read.csv(os.path.join(ROOT, FILE), header=True, inferSchema=True)
train

DataFrame[Name: string, age: int, Experience: int, Salary: int]

In [50]:
# Print each column's name, inferred type and nullability.
train.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [51]:
# Print the loaded rows as a text table to sanity-check the data.
train.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



## 1. Linear Regression

**Syntax**  

- `from pyspark.ml.feature import VectorAssembler`
  - `va = VectorAssembler(inputCols=['col1', 'col2', ... ], outputCol='colx')`
  - `output = va.transform(train_df)`
  
- `from pyspark.ml.regression import LinearRegression`
  - `train, test = finalised_data.randomSplit([0.75, 0.25])`
  - `reg = LinearRegression(featuresCol='features', labelCol='Salary')`
  - `reg = reg.fit(train)`
  - `result = reg.evaluate(test)`
    - `result.predictions`
    - `result.meanAbsoluteError`
    - `result.meanSquaredError`

In [52]:
# List the available column names before choosing the feature columns.
train.columns

['Name', 'age', 'Experience', 'Salary']

In [54]:
from pyspark.ml.feature import VectorAssembler

# Pack the two numeric predictor columns into one vector column named
# 'features'; the original columns are kept alongside it.
va = VectorAssembler(
    inputCols=['age', 'Experience'],
    outputCol='features',
)
output = va.transform(train)
output.show()

+---------+---+----------+------+-----------+
|     Name|age|Experience|Salary|   features|
+---------+---+----------+------+-----------+
|    Krish| 31|        10| 30000|[31.0,10.0]|
|Sudhanshu| 30|         8| 25000| [30.0,8.0]|
|    Sunny| 29|         4| 20000| [29.0,4.0]|
|     Paul| 24|         3| 20000| [24.0,3.0]|
|   Harsha| 21|         1| 15000| [21.0,1.0]|
|  Shubham| 23|         2| 18000| [23.0,2.0]|
+---------+---+----------+------+-----------+



In [56]:
# Keep only the assembled feature vector and the target column (Salary).
finalised_data = output.select("features", 'Salary')
finalised_data.show()

+-----------+------+
|   features|Salary|
+-----------+------+
|[31.0,10.0]| 30000|
| [30.0,8.0]| 25000|
| [29.0,4.0]| 20000|
| [24.0,3.0]| 20000|
| [21.0,1.0]| 15000|
| [23.0,2.0]| 18000|
+-----------+------+



In [58]:
from pyspark.ml.regression import LinearRegression

# Split into roughly 75% training / 25% test rows. A fixed seed keeps the split,
# and therefore the fitted model and its metrics, the same across re-runs.
# NOTE: this rebinds `train`, which previously held the raw CSV DataFrame, so
# earlier cells that use `train` should not be re-run after this cell without
# reloading the data first.
train, test = finalised_data.randomSplit([0.75, 0.25], seed=42)

# Configure the estimator: 'features' is the input vector, 'Salary' is the label.
lr = LinearRegression(featuresCol='features', labelCol='Salary')

# Fitting returns a LinearRegressionModel; `reg` is the fitted model that the
# following cells use.
reg = lr.fit(train)
reg

LinearRegressionModel: uid=LinearRegression_8424dd89d858, numFeatures=2

In [68]:
# Print the training and test splits. show() prints its table and returns None,
# so calling the two as separate statements avoids the cell echoing a
# meaningless `(None, None)` tuple as its result.
train.show()
test.show()

+-----------+------+
|   features|Salary|
+-----------+------+
| [21.0,1.0]| 15000|
| [23.0,2.0]| 18000|
| [24.0,3.0]| 20000|
| [30.0,8.0]| 25000|
|[31.0,10.0]| 30000|
+-----------+------+

+----------+------+
|  features|Salary|
+----------+------+
|[29.0,4.0]| 20000|
+----------+------+



(None, None)

In [59]:
# Intercept term of the fitted linear regression model.
reg.intercept

19919.060052212404

In [61]:
# Evaluate the fitted model on the held-out test split; the returned summary
# object exposes the predictions and error metrics used in the cells below.
result = reg.evaluate(test)
result

<pyspark.ml.regression.LinearRegressionSummary at 0x14e38487370>

In [64]:
# Show the test rows with the model's prediction next to the actual Salary.
result.predictions.show()

+----------+------+-----------------+
|  features|Salary|       prediction|
+----------+------+-----------------+
|[29.0,4.0]| 20000|19342.03655352618|
+----------+------+-----------------+



In [70]:
# Error metrics on the test split, displayed as (mean absolute error, mean squared error).
result.meanAbsoluteError, result.meanSquaredError

(657.9634464738192, 432915.89689570636)