# **1. Install and import necessary libraries** #

In [1]:
# Install pyspark
!pip install pyspark



In [2]:
# Import Spark Session
from pyspark.sql import SparkSession

In [3]:
# Start the Spark session for this notebook (an already-running session is reused)
session_builder = SparkSession.builder.appName("Airfoil_Noise_Predictio_Spark")
spark = session_builder.getOrCreate()

# Display the Spark version in use
spark.version

'3.5.0'

# **2. Load, Explore and Transform dataset** #

In [4]:
# Mount Google Drive so the dataset and output folders are reachable from Colab
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# Load the raw CSV: the first line is treated as the header and column types are inferred
raw_csv_path = "/content/drive/MyDrive/Machine_Learning_wtih_Spark/dataset/NASA_airfoil_noise_raw.csv"
df_airfoil_noise = spark.read.csv(raw_csv_path, header=True, inferSchema=True)

In [6]:
# Preview the first 5 rows of the raw dataframe
df_airfoil_noise.show(n=5)

+---------+-------------+-----------+------------------+-----------------------+----------+
|Frequency|AngleOfAttack|ChordLength|FreeStreamVelocity|SuctionSideDisplacement|SoundLevel|
+---------+-------------+-----------+------------------+-----------------------+----------+
|      800|          0.0|     0.3048|              71.3|             0.00266337|   126.201|
|     1000|          0.0|     0.3048|              71.3|             0.00266337|   125.201|
|     1250|          0.0|     0.3048|              71.3|             0.00266337|   125.951|
|     1600|          0.0|     0.3048|              71.3|             0.00266337|   127.591|
|     2000|          0.0|     0.3048|              71.3|             0.00266337|   127.461|
+---------+-------------+-----------+------------------+-----------------------+----------+
only showing top 5 rows



In [7]:
# Record the row count of the raw dataframe before any cleaning,
# so it can be compared with the counts taken after each cleaning step
initial_total_rows = df_airfoil_noise.count()

In [8]:
# Remove rows that are exact duplicates of another row
df_airfoil_noise = df_airfoil_noise.dropDuplicates()

# Record how many rows remain once duplicates are gone
count_after_drop_duplicates = df_airfoil_noise.count()

In [9]:
# Remove every row that contains a null value
df_airfoil_noise = df_airfoil_noise.na.drop()

# Record how many rows remain once rows with nulls are gone
count_after_drop_null = df_airfoil_noise.count()

In [10]:
# Give the target column a name that carries its unit: "SoundLevel" -> "SoundLevelDecibels"
df_airfoil_noise = df_airfoil_noise.withColumnRenamed(
    existing="SoundLevel",
    new="SoundLevelDecibels",
)

In [11]:
# Persist the cleaned dataframe as parquet, replacing any output left by a previous run
df_airfoil_noise.write.parquet(
    "/content/drive/MyDrive/Machine_Learning_wtih_Spark/parquet_file/Airfoil_Noise_Data.parquet",
    mode="overwrite",
)

## 2.1. Show the summary after Transformation ##

In [12]:
import os

# Row counts captured before cleaning and after each cleaning step
print("Total number of rows: ", initial_total_rows)
print("Total number of rows after drop duplicates: ", count_after_drop_duplicates)
print("Total number of rows after drop null values: ", count_after_drop_null)

# The renamed target column is the last column of the dataframe
print("Print new column name = ", df_airfoil_noise.columns[-1])

# Report whether the parquet output exists (True/False).
# The label is derived from the path that is actually checked, so the message can
# never name a different file than the one being tested. isdir is used because the
# saved parquet output is expected to be a directory.
parquet_path = "/content/drive/MyDrive/Machine_Learning_wtih_Spark/parquet_file/Airfoil_Noise_Data.parquet"
print(os.path.basename(parquet_path), "exists :", os.path.isdir(parquet_path))

Total number of rows:  1522
Total number of rows after drop duplicates:  1503
Total number of rows after drop null values:  1499
Print new column name =  SoundLevelDecibels
NASA_airfoil_noise_cleaned.parquet exists : True


## 2.2. Create Machine Learning Pipeline ##

In [13]:
# Import functions/Classes for sparkml

from pyspark.ml.regression import LinearRegression
from pyspark.ml.classification import LogisticRegression

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import StringIndexer

# Import functions/Classes for pipeline creation
from pyspark.ml import Pipeline

# Import functions/Classes for metrics (model evaluation)
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import numpy as np

In [14]:
# Read the cleaned dataset back from the parquet output
cleaned_parquet_path = "/content/drive/MyDrive/Machine_Learning_wtih_Spark/parquet_file/Airfoil_Noise_Data.parquet"
df_airfoil_noise_parquet = spark.read.parquet(cleaned_parquet_path)

# Preview the first 5 rows of the reloaded dataframe
df_airfoil_noise_parquet.show(n=5)

+---------+-------------+-----------+------------------+-----------------------+------------------+
|Frequency|AngleOfAttack|ChordLength|FreeStreamVelocity|SuctionSideDisplacement|SoundLevelDecibels|
+---------+-------------+-----------+------------------+-----------------------+------------------+
|     4000|          3.0|     0.3048|              31.7|             0.00529514|           115.608|
|     3150|          2.0|     0.2286|              31.7|             0.00372371|           121.527|
|     2000|          7.3|     0.2286|              31.7|              0.0132672|           115.309|
|     2000|          5.4|     0.1524|              71.3|             0.00401199|           131.111|
|      500|          9.9|     0.1524|              71.3|              0.0193001|           131.279|
+---------+-------------+-----------+------------------+-----------------------+------------------+
only showing top 5 rows



In [15]:
# Print the total number of rows in the dataframe loaded back from the parquet file.
# Counting the reloaded frame (rather than the in-memory frame it was written from)
# is what actually verifies that the parquet round trip kept every row.
print("Total number of rows: ", df_airfoil_noise_parquet.count())

Total number of rows:  1499


In [16]:
# Stage 1: VectorAssembler that packs every input column (all columns except the
# label SoundLevelDecibels) into a single "features" vector column
feature_columns = [
    "Frequency",
    "AngleOfAttack",
    "ChordLength",
    "FreeStreamVelocity",
    "SuctionSideDisplacement",
]
vectorAssember = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Stage 2: StandardScaler that reads "features" and writes "scaledFeatures"
scale = StandardScaler(inputCol="features", outputCol="scaledFeatures")

# Stage 3: linear regression estimator fed with the scaled features,
# using SoundLevelDecibels as the label
lr = LinearRegression(featuresCol="scaledFeatures", labelCol="SoundLevelDecibels")

In [17]:
# Chain the three stages: assemble features -> scale -> linear regression
pipeline_stages = [vectorAssember, scale, lr]
pipeline = Pipeline(stages=pipeline_stages)

# 70/30 train/test split with a fixed seed
training_data, testing_data = df_airfoil_noise_parquet.randomSplit(weights=[0.7, 0.3], seed=42)

# Fit every stage of the pipeline on the training portion only
model = pipeline.fit(training_data)

In [18]:
# Summarise the pipeline: the text before the first "_" in each stage's string form
# is its class name (see the recorded output below)
ps = []
for stage in pipeline.getStages():
    ps.append(str(stage).split("_")[0])

stage_labels = ("Pipeline Stage 1 = ", "Pipeline Stage 2 = ", "Pipeline Stage 3 = ")
for idx, label in enumerate(stage_labels):
    print(label, ps[idx])

# Column the regression stage is configured to predict
print("Label column = ", lr.getLabelCol())

Pipeline Stage 1 =  VectorAssembler
Pipeline Stage 2 =  StandardScaler
Pipeline Stage 3 =  LinearRegression
Label column =  SoundLevelDecibels


In [20]:
# The fitted regression model is the last stage of the fitted pipeline
lrModel = model.stages[-1]

# The regression was trained on the "scaledFeatures" column, so these coefficients
# apply to the scaled features rather than the raw input columns
coefficients, intercept = lrModel.coefficients, lrModel.intercept

print("Coefficients: ", np.round(coefficients, decimals=2))
print("Intercept: ", np.round(intercept, decimals=2))

Coefficients:  [-3.99 -2.29 -3.33  1.48 -2.06]
Intercept:  132.88


# **3. Evaluate model** #

In [21]:
# Run the fitted pipeline on the held-out test split
predictions = model.transform(testing_data)

# Compare actual and predicted sound levels for the first 5 rows
actual_vs_predicted = predictions.select("SoundLevelDecibels", "prediction")
actual_vs_predicted.show(n=5)

+------------------+------------------+
|SoundLevelDecibels|        prediction|
+------------------+------------------+
|           128.679|122.59722914376775|
|            133.42|127.37968204568844|
|           119.146| 130.3407742507451|
|           116.074|131.11016975113546|
|           134.319|127.12627360125104|
+------------------+------------------+
only showing top 5 rows



In [22]:
# All three evaluators compare the same label and prediction columns;
# only the metric differs
shared_columns = dict(labelCol="SoundLevelDecibels", predictionCol="prediction")

# Mean squared error
evaluator_mse = RegressionEvaluator(metricName="mse", **shared_columns)
mse = evaluator_mse.evaluate(predictions)

# Mean absolute error
evaluator_mae = RegressionEvaluator(metricName="mae", **shared_columns)
mae = evaluator_mae.evaluate(predictions)

# Coefficient of determination (R squared)
evaluator_r2 = RegressionEvaluator(metricName="r2", **shared_columns)
r2 = evaluator_r2.evaluate(predictions)

In [23]:
# Report each evaluation metric rounded to two decimals
metric_report = (
    ("Mean Squared Error =", mse),
    ("Mean Absolute Error =", mae),
    ("R Squared =", r2),
)
for label, value in metric_report:
    print(label, np.round(value, decimals=2))

Mean Squared Error = 25.0
Mean Absolute Error = 3.91
R Squared = 0.5


# **4. Save model** #

In [24]:
# Persist the fitted pipeline model to the model_airfoil_noise folder,
# replacing any model saved there by a previous run
model_output_path = "/content/drive/MyDrive/Machine_Learning_wtih_Spark/model_airfoil_noise"
model_writer = model.write().overwrite()
model_writer.save(model_output_path)

# **5. Stop Spark Session** #

In [25]:
# Stop the Spark session now that the model has been saved;
# cells that use `spark` cannot be re-run after this without recreating the session
spark.stop()