# PySpark MLlib

- Spark provides two machine learning APIs:
1. The RDD-based API (`pyspark.mllib`)
2. The DataFrame-based API (`pyspark.ml`), which is the more widely used one and the one used in this notebook

#### Set up the Spark session

In [3]:
import pyspark 
from pyspark.sql import SparkSession
# Reuse the active session if there is one; otherwise create one named 'test01'.
spark = (
    SparkSession.builder
    .appName('test01')
    .getOrCreate()
)

In [2]:
# Display the session object; `spark` must already exist, so run the session-setup cell first.
spark

#### Load the dataset

In [4]:
# Load the customer CSV: the first row is the header and column types are inferred.
# Some cells are missing -- they show up as NULL in the table printed below.

training = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('customer.csv')
)
training.show()

+-------+----+----------+------+----------------+
|   Name| Age|Experience|Salary|            Role|
+-------+----+----------+------+----------------+
|  Zikri|  24|         1|  4500|        Helpdesk|
|Zakhwan|  25|         1|  4700|      Accounting|
|   Amir|  27|         2|  5000|        Helpdesk|
|  Ammar|  30|         6|  8500|Customer Service|
|  Haziq|  24|         2|  4700|Customer Service|
|  Irfan|  25|         1|  6000|         Payroll|
|  Fahmi|  27|         5|  7000|         Payroll|
|  Majid|  27|      NULL|  5500|        Helpdesk|
|   NULL|  25|         2|  5000|        Engineer|
|    Ali|  25|         2|  NULL|        Engineer|
|    Abu|NULL|      NULL|  NULL|            NULL|
+-------+----+----------+------+----------------+



#### Drop rows with missing values

In [6]:
# Keep only the rows that have no NULL in any column, then show what is left.
training = training.dropna()
training.show()

+-------+---+----------+------+----------------+
|   Name|Age|Experience|Salary|            Role|
+-------+---+----------+------+----------------+
|  Zikri| 24|         1|  4500|        Helpdesk|
|Zakhwan| 25|         1|  4700|      Accounting|
|   Amir| 27|         2|  5000|        Helpdesk|
|  Ammar| 30|         6|  8500|Customer Service|
|  Haziq| 24|         2|  4700|Customer Service|
|  Irfan| 25|         1|  6000|         Payroll|
|  Fahmi| 27|         5|  7000|         Payroll|
+-------+---+----------+------+----------------+



#### PySpark ML

- Build a linear regression model in Spark
- Predict salary from age and experience

In [16]:
# Basic information about the DataFrame: rows, schema, and column names

In [7]:
# Preview the rows that remain after dropping NULLs.
training.show()

+-------+---+----------+------+----------------+
|   Name|Age|Experience|Salary|            Role|
+-------+---+----------+------+----------------+
|  Zikri| 24|         1|  4500|        Helpdesk|
|Zakhwan| 25|         1|  4700|      Accounting|
|   Amir| 27|         2|  5000|        Helpdesk|
|  Ammar| 30|         6|  8500|Customer Service|
|  Haziq| 24|         2|  4700|Customer Service|
|  Irfan| 25|         1|  6000|         Payroll|
|  Fahmi| 27|         5|  7000|         Payroll|
+-------+---+----------+------+----------------+



In [8]:
# Print each column's name and inferred type (Age, Experience and Salary come out as integers).
training.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)
 |-- Role: string (nullable = true)



In [9]:
# List the column names available for building features.
training.columns

['Name', 'Age', 'Experience', 'Salary', 'Role']

Separate the independent variables (features) from the dependent variable (label).\
Create a vector assembler to combine the feature columns into a single column.

[Age, Experience] ----> New feature ---> Independent feature

In [10]:
# VectorAssembler packs the chosen input columns into one vector column;
# that vector column is what the regression model later receives as its features.

from pyspark.ml.feature import VectorAssembler
featureassembler = VectorAssembler(
    inputCols=['Age', 'Experience'],
    outputCol='Independent Features',
)

In [12]:
# Apply the assembler to the cleaned training data; the result keeps the original
# columns and appends the 'Independent Features' vector column.

output = featureassembler.transform(training)

In [14]:
# Show the assembled data: each row's 'Independent Features' value is [Age, Experience] as floats.
output.show()

+-------+---+----------+------+----------------+--------------------+
|   Name|Age|Experience|Salary|            Role|Independent Features|
+-------+---+----------+------+----------------+--------------------+
|  Zikri| 24|         1|  4500|        Helpdesk|          [24.0,1.0]|
|Zakhwan| 25|         1|  4700|      Accounting|          [25.0,1.0]|
|   Amir| 27|         2|  5000|        Helpdesk|          [27.0,2.0]|
|  Ammar| 30|         6|  8500|Customer Service|          [30.0,6.0]|
|  Haziq| 24|         2|  4700|Customer Service|          [24.0,2.0]|
|  Irfan| 25|         1|  6000|         Payroll|          [25.0,1.0]|
|  Fahmi| 27|         5|  7000|         Payroll|          [27.0,5.0]|
+-------+---+----------+------+----------------+--------------------+



In [15]:
# Confirm that 'Independent Features' was appended to the original columns.
output.columns

['Name', 'Age', 'Experience', 'Salary', 'Role', 'Independent Features']

In [18]:
# Keep only what the model needs: the feature vector and the dependent
# (output) variable we want to predict.

finalized_data = output.select(
    "Independent Features",  # assembled [Age, Experience] vector
    "Salary",                # label
)

In [19]:
# This is the data the linear model is built from: the feature vector and the Salary label.

finalized_data.show()

+--------------------+------+
|Independent Features|Salary|
+--------------------+------+
|          [24.0,1.0]|  4500|
|          [25.0,1.0]|  4700|
|          [27.0,2.0]|  5000|
|          [30.0,6.0]|  8500|
|          [24.0,2.0]|  4700|
|          [25.0,1.0]|  6000|
|          [27.0,5.0]|  7000|
+--------------------+------+



In [22]:
# Train/test split (similar in spirit to sklearn's train_test_split).
# randomSplit assigns rows to the splits at random, so the 0.75 / 0.25 weights are
# only approximate -- with this few rows the actual split sizes can differ a lot.
# A fixed seed makes the split, and every result derived from it, reproducible
# across re-runs of the notebook.

from pyspark.ml.regression import LinearRegression
train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=42)

# Configure the estimator: the assembled [Age, Experience] vector column holds the
# features and 'Salary' is the label.
lr_estimator = LinearRegression(featuresCol = 'Independent Features', labelCol = 'Salary')

# fit() returns the trained model; `regressor` is that fitted model, which the
# following cells inspect (coefficients, intercept) and evaluate.
regressor = lr_estimator.fit(train_data)

In [23]:
# Coefficients of the fitted model, one per feature in the assembler's input order: [Age, Experience]
regressor.coefficients

DenseVector([260.4938, 387.6543])

In [24]:
# Intercept (constant term) of the fitted model
regressor.intercept

-1929.6296296294151

In [25]:
# Evaluate the fitted model on the test data; the returned summary exposes a
# `predictions` DataFrame with a `prediction` column next to the actual Salary.
# In the recorded run the predictions were within a few hundred of the actual salaries.

pred_results = regressor.evaluate(test_data)
pred_results.predictions.show()

+--------------------+------+------------------+
|Independent Features|Salary|        prediction|
+--------------------+------+------------------+
|          [24.0,1.0]|  4500| 4709.876543209884|
|          [25.0,1.0]|  4700|4970.3703703703695|
+--------------------+------+------------------+



In [32]:
# Evaluation metrics on the test data: (mean absolute error, mean squared error)

pred_results.meanAbsoluteError,pred_results.meanSquaredError

(240.12345679012697, 58574.15028197064)