## Pyspark MLlib
1. Read csv data
2. Handle categorical features with `StringIndexer` and `OneHotEncoder` in a `Pipeline`
3. Use `VectorAssembler` to create a new column that assembles the independent variables for modeling
4. Split into training and testing --> fit model
5. Get estimated parameters and predictions


In [29]:
import findspark
findspark.init()
from pyspark.sql import SparkSession

# Configure the application name first, then obtain the session from the builder.
session_builder = SparkSession.builder.appName('pyspark_practice_MLlib')
spark = session_builder.getOrCreate()
spark

In [28]:
#### 1. Read data

# Read the customer CSV: first row is the header, column types are inferred.
pyspark_df = (
    spark.read
    .option('header', 'true')
    .option('inferSchema', True)
    .csv('./Data/customer_for_ml.csv')
)
pyspark_df.show()
pyspark_df.columns

+------+---+---+------+----------+--------+
|  name|sex|age|salary|experience|location|
+------+---+---+------+----------+--------+
|   amy|  F| 22| 28000|         1|  taipei|
| jacky|  M| 20| 25000|         1|taichung|
| meggy|  F| 30| 35000|         8|  tainan|
|  adam|  M| 33| 34000|        10|  taipei|
|sophie|  F| 40| 60000|        20|  taipei|
|claire|  F| 25| 30000|         3|  tainan|
|sherry|  F| 27| 51000|         6|  tainan|
| phlip|  M| 31| 37000|         7|taichung|
| fanny|  F| 29| 49000|         5|taichung|
+------+---+---+------+----------+--------+



['name', 'sex', 'age', 'salary', 'experience', 'location']

In [17]:
#### 2. Handling categorical features by `StringIndexer`, `OneHotEncoder` with `Pipeline`
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer

#### alternative: a single StringIndexer handling several columns (no Pipeline)
# indexer = StringIndexer(inputCols=['sex','location'], outputCols=['indexed_sex','indexed_location'])
# pyspark_df = indexer.fit(pyspark_df).transform(pyspark_df)

#### multiple feature transformation with pipeline
# Stage 1: map each categorical string column to a numeric index column.
indexer = [StringIndexer(inputCol=c, outputCol=f'indexed_{c}') for c in ['sex','location']]
# Stage 2: one-hot encode the indexed location. In the output below the three
# locations become a 2-element vector, with index 2.0 shown as the empty vector.
encoder = [OneHotEncoder(inputCol=c, outputCol=f'onehot_{c}') for c in ['indexed_location']]
ppl = Pipeline(stages= indexer + encoder)

# This cell overwrites `pyspark_df` with its own output, so a second run would
# fail because the stages' output columns already exist. Dropping them first
# makes the cell safe to re-run; drop() ignores names that are not present.
stale_cols = [stage.getOutputCol() for stage in indexer + encoder]
pyspark_df = pyspark_df.drop(*stale_cols)
pyspark_df = ppl.fit(pyspark_df).transform(pyspark_df)
pyspark_df.show()

+------+---+---+------+----------+--------+-----------+----------------+-----------------------+
|  name|sex|age|salary|experience|location|indexed_sex|indexed_location|onehot_indexed_location|
+------+---+---+------+----------+--------+-----------+----------------+-----------------------+
|   amy|  F| 22| 28000|         1|  taipei|        0.0|             2.0|              (2,[],[])|
| jacky|  M| 20| 25000|         1|taichung|        1.0|             0.0|          (2,[0],[1.0])|
| meggy|  F| 30| 35000|         8|  tainan|        0.0|             1.0|          (2,[1],[1.0])|
|  adam|  M| 33| 34000|        10|  taipei|        1.0|             2.0|              (2,[],[])|
|sophie|  F| 40| 60000|        20|  taipei|        0.0|             2.0|              (2,[],[])|
|claire|  F| 25| 30000|         3|  tainan|        0.0|             1.0|          (2,[1],[1.0])|
|sherry|  F| 27| 51000|         6|  tainan|        0.0|             1.0|          (2,[1],[1.0])|
| phlip|  M| 31| 37000|       

In [18]:
#### 3. Use `VectorAssembler` to create a new column that assembles the independent variables for modeling
from pyspark.ml.feature import VectorAssembler

# Columns combined into the single 'features' vector used by the model.
feature_cols = ['indexed_sex','onehot_indexed_location','age','experience']
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

# This cell overwrites `pyspark_df`, so on a second run 'features' would already
# exist and the assembler would refuse to overwrite it. Drop it first so the
# cell can be re-run; drop() ignores the name when the column is absent.
pyspark_df = assembler.transform(pyspark_df.drop('features'))
pyspark_df.show()

+------+---+---+------+----------+--------+-----------+----------------+-----------------------+--------------------+
|  name|sex|age|salary|experience|location|indexed_sex|indexed_location|onehot_indexed_location|            features|
+------+---+---+------+----------+--------+-----------+----------------+-----------------------+--------------------+
|   amy|  F| 22| 28000|         1|  taipei|        0.0|             2.0|              (2,[],[])|(5,[3,4],[22.0,1.0])|
| jacky|  M| 20| 25000|         1|taichung|        1.0|             0.0|          (2,[0],[1.0])|[1.0,1.0,0.0,20.0...|
| meggy|  F| 30| 35000|         8|  tainan|        0.0|             1.0|          (2,[1],[1.0])|[0.0,0.0,1.0,30.0...|
|  adam|  M| 33| 34000|        10|  taipei|        1.0|             2.0|              (2,[],[])|[1.0,0.0,0.0,33.0...|
|sophie|  F| 40| 60000|        20|  taipei|        0.0|             2.0|              (2,[],[])|(5,[3,4],[40.0,20...|
|claire|  F| 25| 30000|         3|  tainan|        0.0| 

In [24]:
#### 4. Split into training and testing --> fit model
from pyspark.ml.regression import LinearRegression

# Keep only the assembled feature vector and the label column.
finalized_df = pyspark_df.select('features','salary')

# Fix the seed so the train/test split (and every metric derived from it) is
# the same on each run. The dataset has only nine rows, so results are very
# sensitive to which rows land in each split.
train_df, test_df = finalized_df.randomSplit([0.7,0.3], seed=42)

# Keep the unfitted estimator and the fitted model under separate names;
# `regressor` is the fitted model used by the following cell.
lr_estimator = LinearRegression(featuresCol='features', labelCol='salary')
regressor = lr_estimator.fit(train_df)

In [25]:
#### 5. Get estimated parameters and predictions
# Report the fitted parameters of the linear model.
for label, value in (
    ('intercept: ', regressor.intercept),
    ('coefficients: ', regressor.coefficients),
):
    print(label, value)

# Score the held-out rows, show the per-row predictions, then print
# R^2, mean absolute error and mean squared error in that order.
pred = regressor.evaluate(test_df)
pred.predictions.show()
metrics = (pred.r2, pred.meanAbsoluteError, pred.meanSquaredError)
print('evaluation:', *metrics)

intercept:  85810.15564896734
coefficients:  [-41810.119769775105,24000.000000000586,-6936.706589925221,-3037.9760460453467,9025.317364030418]
+--------------------+------+------------------+
|            features|salary|        prediction|
+--------------------+------+------------------+
|(5,[3,4],[40.0,20...| 60000| 144797.4610877618|
|[0.0,0.0,1.0,30.0...| 35000| 59936.70658992506|
|[0.0,1.0,0.0,29.0...| 49000| 66835.43713380495|
|[1.0,1.0,0.0,20.0...| 25000|16265.832322316302|
+--------------------+------+------------------+

evaluation: -10.546728449185073 34075.94312229387 2051709311.3145728
