In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as fnc
from pyspark.ml.feature import VectorAssembler, MinMaxScaler, StandardScaler

In [2]:
# Start (or reuse) the Spark session used by every DataFrame in this notebook.
spark = (
    SparkSession.builder
    .appName('Tutor_6_Preprocessing')
    .getOrCreate()
)

In [3]:
# Load the tips dataset: the first row is the header and column types are inferred.
tip_reader = spark.read.options(header=True, inferSchema=True)
tip_df = tip_reader.csv('data/tips.csv')
tip_df.show(5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|   Yes|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



1. Explore data structure & check missing values

In [12]:
# Print the column names and the types inferred while reading the CSV.
tip_df.printSchema()

root
 |-- total_bill: double (nullable = true)
 |-- tip: double (nullable = true)
 |-- sex: string (nullable = true)
 |-- smoker: string (nullable = true)
 |-- day: string (nullable = true)
 |-- time: string (nullable = true)
 |-- size: integer (nullable = true)



In [4]:
# Keep the column names; the null check that follows builds one filter per column.
list_cols = tip_df.columns
list_cols

['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size']

In [5]:

# One SQL predicate string per column (e.g. "tip IS NULL") for DataFrame.filter().
list_filter_null = [f"{column_name} IS NULL" for column_name in list_cols]

In [6]:
# Count the nulls of every column in a single Spark job instead of running one
# full scan per column. count() only counts non-null values, and when() without
# otherwise() yields NULL for non-matching rows, so each aggregate is the number
# of rows where that column is NULL.
null_counts = tip_df.select(
    [
        fnc.count(fnc.when(fnc.col(name).isNull(), 1)).alias(name)
        for name in list_cols
    ]
).first().asDict()

# Report per column (keyed by the real column name rather than by parsing it
# back out of a SQL string) and keep the running total in counter_null.
counter_null = 0
for col_name in list_cols:
    counter = null_counts[col_name]
    counter_null += counter
    print(f"{col_name} has {counter} null values")

total_bill has 0 null values
tip has 0 null values
sex has 0 null values
smoker has 0 null values
day has 0 null values
time has 0 null values
size has 0 null values


2. Pre-processing by:
- MinMaxScaler
- StandardScaler

Q1: Describe the difference between total_bill and tip

Step 1: Select the specified columns and assemble them into a single vector column
Step 2: Use transform() to create a new DataFrame with the assembled column added

In [7]:
# Combine the two numeric columns into the single vector column that the scaler takes as input.
bill_tip = VectorAssembler(
    inputCols=['total_bill', 'tip'],
    outputCol='bill_tip_feature',
)

In [8]:
# Append the assembled 'bill_tip_feature' vector column to the original frame.
tip_transformed = bill_tip.transform(tip_df)

In [9]:
# Preview the first 5 rows, including the new vector column.
tip_transformed.show(5)

+----------+----+------+------+---+------+----+----------------+
|total_bill| tip|   sex|smoker|day|  time|size|bill_tip_feature|
+----------+----+------+------+---+------+----+----------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|    [16.99,1.01]|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|    [10.34,1.66]|
|     21.01| 3.5|  Male|   Yes|Sun|Dinner|   3|     [21.01,3.5]|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|    [23.68,3.31]|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|    [24.59,3.61]|
+----------+----+------+------+---+------+----+----------------+
only showing top 5 rows



Step 3. Use pyspark.ml.feature.MinMaxScaler
- Create an instance of MinMaxScaler()
- Use fit() to calculate the min/max values of each feature in 'bill_tip_feature' from 'tip_transformed'
- Apply transform() to compute the min-max formula for each value in 'bill_tip_feature'
- Store the transformed values in the 'min_max_scaled' column

Min-Max formula (applied to each feature separately):
$$x_{\text{scaled}} = \frac{x - x_{\min}}{x_{\max} - x_{\min}}$$

In [10]:
# Scaler that reads the assembled vector and writes the rescaled vector to 'min_max_scaled'.
minMaxScal = MinMaxScaler(
    inputCol='bill_tip_feature',
    outputCol='min_max_scaled',
)

In [11]:
# fit() learns the per-feature min/max from the data; transform() then appends
# the rescaled vector as the 'min_max_scaled' column.
min_max_model = minMaxScal.fit(tip_transformed)
output = min_max_model.transform(tip_transformed)

In [37]:
# Schema of the assembled frame that was fed to the scaler.
# NOTE(review): this inspects `tip_transformed`, not `output`, so the
# 'min_max_scaled' column added by the scaler is not listed — confirm intent.
tip_transformed.printSchema()

root
 |-- total_bill: double (nullable = true)
 |-- tip: double (nullable = true)
 |-- sex: string (nullable = true)
 |-- smoker: string (nullable = true)
 |-- day: string (nullable = true)
 |-- time: string (nullable = true)
 |-- size: integer (nullable = true)
 |-- bill_tip_feature: vector (nullable = true)



In [56]:
from pyspark.sql.types import ArrayType, DoubleType

def round_array(arr, digits=4):
    """Round every element of a numeric sequence and return a plain list of floats.

    Args:
        arr: Iterable of numbers (for example a Spark ML vector handed to a
            UDF), or None.
        digits: Number of decimal places to keep. Defaults to 4.

    Returns:
        A list of floats rounded to ``digits`` places, or None when ``arr``
        is None.
    """
    # A SQL NULL reaches a Python UDF as None; propagate it instead of raising
    # TypeError when trying to iterate over it.
    if arr is None:
        return None
    return [round(float(x), digits) for x in arr]

# Wrap round_array as a Spark UDF whose result is an array of doubles.
round_udf = fnc.udf(round_array, returnType=ArrayType(DoubleType()))

In [61]:
# Keep the raw feature vector next to its scaled values after rounding with round_udf.
rounded_scaler_col = round_udf(fnc.col('min_max_scaled')).alias('rounded_scaler')
output_rounded = output.select(fnc.col('bill_tip_feature'), rounded_scaler_col)

In [62]:
# Show the first 5 rows without truncating the vector/array cell contents.
output_rounded.show(n=5, truncate=False)

+----------------+----------------+
|bill_tip_feature|rounded_scaler  |
+----------------+----------------+
|[16.99,1.01]    |[0.2916, 0.0011]|
|[10.34,1.66]    |[0.1523, 0.0733]|
|[21.01,3.5]     |[0.3758, 0.2778]|
|[23.68,3.31]    |[0.4317, 0.2567]|
|[24.59,3.61]    |[0.4508, 0.29]  |
+----------------+----------------+
only showing top 5 rows

