# Models

In [7]:
from pyspark.sql.functions import *
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import numpy as np

In [8]:
#spark ML imports
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer, StringIndexer, CountVectorizer, IDF
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

In [24]:
import pyspark.ml.feature
from pyspark.sql.functions import when
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder
spark.conf.set("spark.sql.repl.eagerEval.enabled",True)
from pyspark.ml.feature import StandardScaler

In [16]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.linalg import Vectors
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, CrossValidatorModel
import tempfile
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import LinearRegression, RandomForestRegressor
from pyspark.mllib.util import MLUtils
from pyspark.ml.feature import StandardScaler
from pyspark.sql.functions import col, explode, array, lit
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import RandomForestClassifier

In [17]:
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

In [18]:
spark = SparkSession.builder.appName('airline').getOrCreate()
spark.sparkContext.version

'3.1.2'

# Airline Delay Time Prediction

## Data Processing & Feature Engineering

In [9]:
!hdfs dfs -ls  /user/nhu2/airline/

Found 3 items
-rw-r--r--   3 nhu2 nhu2 17953941457 2022-03-01 21:23 /user/nhu2/airline/Data_flights_clean.csv
-rw-r--r--   3 nhu2 nhu2  2002936814 2022-03-02 17:54 /user/nhu2/airline/Data_logistics_clean.csv
-rw-r--r--   3 nhu2 nhu2  2302318602 2022-03-08 14:11 /user/nhu2/airline/Data_regression_clean.csv


In [7]:
#Load data into PySpark
df = spark.read.csv("/user/nhu2/airline/Data_regression_clean.csv", inferSchema=True, header=True)
#df = spark.read.csv("gs://airline_bigdata/Data/regression_clean/part-00000-b781c44b-d818-4efc-99a2-80256e923831-c000.csv", inferSchema=True, header=True)

In [19]:
# check data type
df.dtypes

[('OP_CARRIER', 'string'),
 ('ORIGIN', 'string'),
 ('CRS_DEP_TIME', 'double'),
 ('TAXI_OUT', 'double'),
 ('TAXI_IN', 'double'),
 ('CRS_ELAPSED_TIME', 'double'),
 ('SEASON', 'string'),
 ('label', 'double')]

In [8]:
# dimension of dataset
print((df.count(), len(df.columns)))



(60431020, 8)


                                                                                

## Building Pipeline

In [85]:
## Rename label column to avoid illegal argument error

#df = df.withColumnRenamed("label","AVG_DELAY")
#df.dtypes

[('OP_CARRIER', 'string'),
 ('ORIGIN', 'string'),
 ('CRS_DEP_TIME', 'double'),
 ('TAXI_OUT', 'double'),
 ('TAXI_IN', 'double'),
 ('CRS_ELAPSED_TIME', 'double'),
 ('SEASON', 'string'),
 ('label', 'double')]

### Transformer

In [20]:
categoricalColumns = ["OP_CARRIER",
                     "ORIGIN",
                     "SEASON"]

numericalColumns = ["CRS_DEP_TIME",
                    "TAXI_OUT",
                   "TAXI_IN",
                   "CRS_ELAPSED_TIME"]

categoricalColumnsclassVec = [c + "classVec" for c in categoricalColumns]

In [25]:
stages = []
for categoricalColumn in categoricalColumns:
    print(categoricalColumn)
  ## Category Indexing with StringIndexer
    stringIndexer = StringIndexer(inputCol=categoricalColumn, outputCol = categoricalColumn+"Index").setHandleInvalid("skip")
  ## Use OneHotEncoder to convert categorical variables into binary SparseVectors
    encoder = OneHotEncoder(inputCol=categoricalColumn+"Index", outputCol=categoricalColumn+"classVec")
  ## Add stages
    stages += [stringIndexer, encoder]

OP_CARRIER
ORIGIN
SEASON


In [26]:

catassembler = VectorAssembler(inputCols = categoricalColumnsclassVec,
                            outputCol = "catfeatures")
stages += [catassembler]

In [27]:
#Only scale the numeric columns 
numassembler = VectorAssembler(inputCols = numericalColumns,
                            outputCol = "numfeatures")
stages += [numassembler]

In [28]:
scaler = StandardScaler(inputCol = "numfeatures",
                        outputCol = "scaledFeatures",
                        withStd = True,
                        withMean = True)
stages += [scaler]

In [29]:
# Combine both 
totalassembler = VectorAssembler(inputCols = ['catfeatures','scaledFeatures'],
                            outputCol = "features")
stages += [totalassembler]

In [30]:
pipeline = Pipeline(stages=stages)

In [31]:
## Suspect that the error comes from transforming separately?
# Yep 
# Although this introduces data leakage, it seems like splitting before transforming will cause illegal argument error
df = pipeline.fit(df).transform(df)

                                                                                

In [32]:
train_df, test_df = df.select(['features', 'label']).randomSplit([.8,.2],seed=1234)

In [24]:
#train_df.show(3,truncate=False)

+------------------------------------------------------------------------------------------------------------------------------+-----+
|features                                                                                                                      |label|
+------------------------------------------------------------------------------------------------------------------------------+-----+
|(406,[0,25,399,402,403,404,405],[1.0,1.0,1.0,-1.5301571430162986,-0.8586483440519849,-0.3997735200289499,0.31077443596073556])|-8.0 |
|(406,[0,25,399,402,403,404,405],[1.0,1.0,1.0,-1.5301571430162986,-0.7511117112189889,-0.5888811502527409,-0.7098328064051183])|-8.0 |
|(406,[0,25,399,402,403,404,405],[1.0,1.0,1.0,-1.5301571430162986,-0.7511117112189889,-0.5888811502527409,-0.7098328064051183])|-4.0 |
+------------------------------------------------------------------------------------------------------------------------------+-----+
only showing top 3 rows



In [22]:
#test_df.show(3)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(406,[0,25,399,40...| -7.5|
|(406,[0,25,399,40...| -7.5|
|(406,[0,25,399,40...| -7.0|
+--------------------+-----+
only showing top 3 rows



### Multiple Linear Regression

In [19]:
lr = LinearRegression(featuresCol = 'features', labelCol='label')
m1 = lr.fit(train_df)

In [20]:
reg_pred = m1.transform(test_df)

RMSE on test data = 36.6294


In [23]:
evalu = RegressionEvaluator(predictionCol="prediction", labelCol="label")

rmse = evalu.evaluate(reg_pred, {evalu.metricName: "rmse"})
print("RMSE: %.3f" % rmse)
mae = evalu.evaluate(reg_pred, {evalu.metricName: "mae"})
print("MAE: %.3f" % mae)

RMSE: 36.585
MAE: 17.941


### Penalized Linear Regression with Cross Validation

In [24]:
lr = LinearRegression(featuresCol = 'features', labelCol='label')
lrevaluator = RegressionEvaluator(predictionCol="prediction", labelCol="label", metricName="rmse")
#Define parameter grid
lrparamGrid = (ParamGridBuilder()
             .addGrid(lr.maxIter, [200, 500, 1000])
             .addGrid(lr.regParam, [0.01, 0.1, 1, 10, 20])
             .addGrid(lr.elasticNetParam, [0, 0.1, 0.25, 0.5, 0.75, 1])
             .build())

In [25]:
lrcv = CrossValidator(estimator = lr,
                    estimatorParamMaps = lrparamGrid,
                    evaluator = lrevaluator,
                    numFolds = 3,
                    parallelism=2)

In [None]:
cv_reg = lrcv.fit(train_df)

In [None]:
cv_reg_pred = cv_reg.bestModel.transform(test_df)

In [32]:
cv_reg.bestModel._java_obj.getRegParam()

0.01

In [33]:
cv_reg.bestModel._java_obj.getMaxIter()

1

In [34]:
cv_reg.bestModel._java_obj.getElasticNetParam()

0.0

In [28]:
# Root Mean Square Error
rmse = evalu.evaluate(cv_reg_pred, {evalu.metricName: "rmse"})
print("RMSE: %.3f" % rmse)

# Mean Absolute Error
mae = evalu.evaluate(cv_reg_pred, {evalu.metricName: "mae"})
print("MAE: %.3f" % mae)

RMSE: 36.585
MAE: 17.949


### Random Forest

In [11]:
#Load data into PySpark
df = spark.read.csv("gs://dataproc-staging-us-central1-1082458152958-9ewhbauy/Cleaned_Data/Data_regression_clean_part-00000-b781c44b-d818-4efc-99a2-80256e923831-c000.csv", inferSchema=True, header=True)


                                                                                

In [35]:
rf = RandomForestRegressor(labelCol="label", featuresCol="features")

In [39]:
model = rf.fit(train_df)

                                                                                

In [40]:
pred_rf = model.transform(test_df)

In [41]:
evalu = RegressionEvaluator(predictionCol="prediction", labelCol="label")
# Root Mean Square Error
rmse = evalu.evaluate(pred_rf, {evalu.metricName: "rmse"})
print("RMSE: %.3f" % rmse)

# Mean Absolute Error
mae = evalu.evaluate(pred_rf, {evalu.metricName: "mae"})
print("MAE: %.3f" % mae)

                                                                                

RMSE: 36.900




MAE: 18.232


                                                                                