# PySpark 05 Linear Regression with Tips.csv
 
**Summary** 
- tips.csv
- `StringIndexer`
  - Encodes each distinct string label as a numeric index (string → index; by default the most frequent label gets `0.0`)
  - `(inputCol='input_col', outputCol='output_col')`
  - `(inputCols=['col1', ... ], outputCols=['col', ...])`
  
- `from pyspark.ml.feature import StringIndexer`
- `from pyspark.ml.feature import VectorAssembler`
- `from pyspark.ml.regression import LinearRegression`

### Setup

In [1]:
import os
import pyspark
from pyspark.sql import SparkSession
# Reuse the active Spark session if one exists; otherwise start a new one.
# NOTE(review): the app name "filterOps" looks carried over from another notebook — confirm.
spark = (
    SparkSession.builder
    .appName("filterOps")
    .getOrCreate()
)
spark

### Load

In [2]:
# Read the tips dataset into a Spark DataFrame.
# The data directory defaults to the original Windows location but can be
# overridden with the TIPS_DATA_DIR environment variable, so the notebook is
# not tied to a single machine.
ROOT = os.environ.get("TIPS_DATA_DIR", "C:\\PySpark\\")
FILE = "./tips.csv"
# Join the parts instead of concatenating strings, so the path is still valid
# when ROOT is supplied without a trailing separator.
DATA_PATH = os.path.join(ROOT, FILE)
# header=True takes column names from the first row; inferSchema=True asks
# Spark to detect column types instead of reading everything as strings.
df = spark.read.csv(DATA_PATH, header=True, inferSchema=True)
df

DataFrame[total_bill: double, tip: double, sex: string, smoker: string, day: string, time: string, size: int]

In [4]:
# Print the schema to check the column types that inferSchema=True produced.
df.printSchema()

root
 |-- total_bill: double (nullable = true)
 |-- tip: double (nullable = true)
 |-- sex: string (nullable = true)
 |-- smoker: string (nullable = true)
 |-- day: string (nullable = true)
 |-- time: string (nullable = true)
 |-- size: integer (nullable = true)



In [6]:
# List the column names of the loaded DataFrame.
df.columns

['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size']

### Encode categorical columns

**Syntax**
- `from pyspark.ml.feature import StringIndexer`

In [7]:
from pyspark.ml.feature import StringIndexer

# Estimator that encodes the string column `sex` into the numeric column `sex_indexed`.
indexer = StringIndexer(
    inputCol='sex',
    outputCol='sex_indexed',
)
indexer

StringIndexer_0b78e0a0d33b

In [8]:
# fit() learns the label -> index mapping from `df`;
# transform() returns `df` with the index column appended.
sex_index_model = indexer.fit(df)
df1 = sex_index_model.transform(df)
df1.show()

+----------+----+------+------+---+------+----+-----------+
|total_bill| tip|   sex|smoker|day|  time|size|sex_indexed|
+----------+----+------+------+---+------+----+-----------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|        1.0|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|        0.0|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|        0.0|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|        0.0|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|        1.0|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|        0.0|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|        0.0|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|        0.0|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|        0.0|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|        0.0|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|        0.0|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|        1.0|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|        0.0|
|     18.43| 3.0|  Male|    No|Sun|Dinne

In [11]:
# Index several string columns in one pass; the two lists are matched by position.
categorical_cols = ['smoker', 'day', 'time']
indexed_cols = ['smoker_idx', 'day_idx', 'time_idx']
indexer = StringIndexer(inputCols=categorical_cols, outputCols=indexed_cols)

indexer_model = indexer.fit(df)
df_sm = indexer_model.transform(df)
df_sm.show()

+----------+----+------+------+---+------+----+----------+-------+--------+
|total_bill| tip|   sex|smoker|day|  time|size|smoker_idx|day_idx|time_idx|
+----------+----+------+------+---+------+----+----------+-------+--------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|       0.0|    1.0|     0.0|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|       0.0|    1.0|     0.0|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|       0.0|    1.0|     0.0|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|       0.0|    1.0|     0.0|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|       0.0|    1.0|     0.0|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|       0.0|    1.0|     0.0|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|       0.0|    1.0|     0.0|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|       0.0|    1.0|     0.0|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|       0.0|    1.0|     0.0|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|       0.0|    1.0|     0.0|
|     10.27|

In [13]:
from pyspark.ml.feature import VectorAssembler
# Pack the numeric predictors and the indexed categorical columns into a
# single vector column, the input format the ML estimators expect.
# NOTE(review): `day_idx` is an index of a nominal column and is fed to the
# model as a plain number — consider one-hot encoding; confirm intent.
feature_cols = ['tip', 'size', 'smoker_idx', 'day_idx', 'time_idx']
va = VectorAssembler(inputCols=feature_cols, outputCol='ind features')
output = va.transform(df_sm)

In [14]:
# Inspect the assembled vectors. Some rows print in sparse form,
# `(size,[indices],[values])`, instead of listing every entry.
output.select('ind features').show()

+--------------------+
|        ind features|
+--------------------+
|[1.01,2.0,0.0,1.0...|
|[1.66,3.0,0.0,1.0...|
|[3.5,3.0,0.0,1.0,...|
|[3.31,2.0,0.0,1.0...|
|[3.61,4.0,0.0,1.0...|
|[4.71,4.0,0.0,1.0...|
|[2.0,2.0,0.0,1.0,...|
|[3.12,4.0,0.0,1.0...|
|[1.96,2.0,0.0,1.0...|
|[3.23,2.0,0.0,1.0...|
|[1.71,2.0,0.0,1.0...|
|[5.0,4.0,0.0,1.0,...|
|[1.57,2.0,0.0,1.0...|
|[3.0,4.0,0.0,1.0,...|
|[3.02,2.0,0.0,1.0...|
|[3.92,2.0,0.0,1.0...|
|[1.67,3.0,0.0,1.0...|
|[3.71,3.0,0.0,1.0...|
|[3.5,3.0,0.0,1.0,...|
|(5,[0,1],[3.35,3.0])|
+--------------------+
only showing top 20 rows



In [18]:
# Keep only the assembled feature vector and the regression target.
model_cols = ['ind features', 'total_bill']
final_data = output.select(*model_cols)
final_data.show(n=5)

+--------------------+----------+
|        ind features|total_bill|
+--------------------+----------+
|[1.01,2.0,0.0,1.0...|     16.99|
|[1.66,3.0,0.0,1.0...|     10.34|
|[3.5,3.0,0.0,1.0,...|     21.01|
|[3.31,2.0,0.0,1.0...|     23.68|
|[3.61,4.0,0.0,1.0...|     24.59|
+--------------------+----------+
only showing top 5 rows



In [19]:
from pyspark.ml.regression import LinearRegression

# Fixed seed so the train/test split — and therefore the fitted coefficients
# and metrics in the following cells — is repeatable across
# "Restart & Run All" on the same input data.
SPLIT_SEED = 42
train, test = final_data.randomSplit([0.75, 0.25], seed=SPLIT_SEED)

# Keep the unfitted estimator under its own name; `regressor` is bound to the
# fitted model, which is what the following cells read from.
lr = LinearRegression(featuresCol='ind features', labelCol='total_bill')
regressor = lr.fit(train)

In [20]:
# Fitted weights of the linear model, one per entry of the 'ind features' vector.
regressor.coefficients

DenseVector([3.0988, 3.4418, 2.047, -0.4008, -0.5887])

In [21]:
# Fitted intercept (bias) term of the linear model.
regressor.intercept

1.1107412013182494

In [22]:
# Evaluate the fitted model on the held-out split. `pred` is the returned
# evaluation summary (it exposes `.predictions` and metrics such as `.r2`),
# not a bare predictions DataFrame.
pred = regressor.evaluate(test)

In [27]:
# Final comparison: actual `total_bill` next to the model's `prediction` for the test rows.
pred.predictions.show()

+--------------------+----------+------------------+
|        ind features|total_bill|        prediction|
+--------------------+----------+------------------+
|(5,[0,1],[1.47,2.0])|     10.77| 12.54961501617368|
| (5,[0,1],[1.5,2.0])|     26.41|12.642579285736087|
|(5,[0,1],[1.75,2.0])|     17.82|13.417281532089461|
|(5,[0,1],[1.97,2.0])|     12.02|14.099019508880431|
| (5,[0,1],[2.0,3.0])|     16.31|17.633796081591626|
|(5,[0,1],[2.23,2.0])|     15.77|14.904709845087941|
|(5,[0,1],[2.31,3.0])|     18.69|18.594426867069814|
| (5,[0,1],[3.0,2.0])|     15.06|17.290792763856334|
|(5,[0,1],[3.15,3.0])|     20.08| 21.19742641481715|
|(5,[0,1],[3.18,2.0])|     19.82|17.848578381230766|
|(5,[0,1],[3.27,2.0])|     17.78|18.127471189917976|
|(5,[0,1],[3.76,2.0])|     18.24|19.645887592770592|
|(5,[0,1],[4.67,3.0])|     35.83| 25.90761607264567|
|[1.0,2.0,1.0,0.0,...|      12.6|13.140142709099358|
|[1.01,2.0,0.0,1.0...|     16.99| 10.72340990342304|
|[1.1,2.0,1.0,0.0,...|      12.9| 13.450023607

In [25]:
# Test-set metrics as a tuple: R², mean absolute error, mean squared error.
pred.r2, pred.meanAbsoluteError, pred.meanSquaredError

(0.5174710648236165, 4.211872694982243, 35.900187891538586)