In [10]:
# Import the SparkSession class
from pyspark.sql import SparkSession

# Create SparkSession object
spark = SparkSession.builder.master("local[*]").appName("test").getOrCreate()

In [11]:
# Read data from CSV file
flights = spark.read.csv(
    "data/flights.csv", sep=",", header=True, inferSchema=True, nullValue="NA"
)

In [12]:
from pyspark.ml.feature import StringIndexer
flights = StringIndexer(inputCol='org', outputCol='org_idx').fit(flights).transform(flights)
flights.show(5)

+---+---+---+-------+------+---+----+------+--------+-----+-------+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|org_idx|
+---+---+---+-------+------+---+----+------+--------+-----+-------+
| 11| 20|  6|     US|    19|JFK|2153|  9.48|     351| NULL|    2.0|
|  0| 22|  2|     UA|  1107|ORD| 316| 16.33|      82|   30|    0.0|
|  2| 20|  4|     UA|   226|SFO| 337|  6.17|      82|   -8|    1.0|
|  9| 13|  1|     AA|   419|ORD|1236| 10.33|     195|   -5|    0.0|
|  4|  2|  5|     AA|   325|ORD| 258|  8.92|      65| NULL|    0.0|
+---+---+---+-------+------+---+----+------+--------+-----+-------+
only showing top 5 rows



In [13]:
# Import the one hot encoder class
from pyspark.ml.feature import OneHotEncoder

# Create an instance of the one hot encoder
onehot = OneHotEncoder(inputCols=['org_idx'], outputCols=['org_dummy'])

# Apply the one hot encoder to the flights data
onehot = onehot.fit(flights)
flights_onehot = onehot.transform(flights)

# Check the results
flights_onehot.select('org', 'org_idx', 'org_dummy').distinct().sort('org_idx').show()

+---+-------+-------------+
|org|org_idx|    org_dummy|
+---+-------+-------------+
|ORD|    0.0|(7,[0],[1.0])|
|SFO|    1.0|(7,[1],[1.0])|
|JFK|    2.0|(7,[2],[1.0])|
|LGA|    3.0|(7,[3],[1.0])|
|SJC|    4.0|(7,[4],[1.0])|
|SMF|    5.0|(7,[5],[1.0])|
|TUS|    6.0|(7,[6],[1.0])|
|OGG|    7.0|    (7,[],[])|
+---+-------+-------------+



In [14]:
# Import the necessary class
from pyspark.ml.feature import VectorAssembler

# Create an assembler object
assembler = VectorAssembler(inputCols=[
    'mon', 'dom', 'dow', 
    # 'carrier_idx', 
    'org_idx', 
    # 'km',
    'depart', 'duration'
], outputCol='features')

# Consolidate predictor columns
flights = assembler.transform(flights_onehot)

# Check the resulting column
flights.select('features', 'delay').show(5, truncate=False)

+------------------------------+-----+
|features                      |delay|
+------------------------------+-----+
|[11.0,20.0,6.0,2.0,9.48,351.0]|NULL |
|[0.0,22.0,2.0,0.0,16.33,82.0] |30   |
|[2.0,20.0,4.0,1.0,6.17,82.0]  |-8   |
|[9.0,13.0,1.0,0.0,10.33,195.0]|-5   |
|[4.0,2.0,5.0,0.0,8.92,65.0]   |NULL |
+------------------------------+-----+
only showing top 5 rows



In [17]:
# Split into training and testing sets in a 80:20 ratio
flights_train, flights_test = flights.randomSplit([0.8, 0.2], seed=43)

# Check that training set has around 80% of records
training_ratio = flights_train.count() / flights.count()
print(training_ratio)

0.80224


In [19]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Create a regression object and train on training data
regression = LinearRegression(labelCol='duration').fit(flights_train)

# Create predictions for the testing data and take a look at the predictions
predictions = regression.transform(flights_test)
predictions.select('duration', 'prediction').show(5, False)

# Calculate the RMSE
RegressionEvaluator(labelCol='duration').evaluate(predictions)

+--------+------------------+
|duration|prediction        |
+--------+------------------+
|370     |370.0             |
|379     |379.0             |
|310     |310.0             |
|135     |134.99999999999997|
|80      |79.99999999999997 |
+--------+------------------+
only showing top 5 rows



2.038988885911021e-14

In [20]:
# Intercept (average minutes on ground)
inter = regression.intercept
print(inter)

# Coefficients
coefs = regression.coefficients
print(coefs)

# Average minutes per km
minutes_per_km = regression.coefficients[0]
print(minutes_per_km)

# Average speed in km per hour
avg_speed = 60 / minutes_per_km
print(avg_speed)

-5.5529772905668064e-14
[4.3340831942090705e-16,3.2006956885441626e-16,1.1283273924159106e-15,8.045997111959331e-16,7.274626562625612e-16,1.0000000000000002]
4.3340831942090705e-16
1.384375825553331e+17


In [21]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Create a regression object and train on training data
regression = LinearRegression(labelCol='duration').fit(flights_train)

# Create predictions for the testing data
predictions = regression.transform(flights_test)

# Calculate the RMSE on testing data
RegressionEvaluator(labelCol='duration').evaluate(predictions)

2.038988885911021e-14

In [22]:
# Average speed in km per hour
avg_speed_hour = 60 / regression.coefficients[0]
print(avg_speed_hour)

# Average minutes on ground at OGG
inter = regression.intercept
print(inter)

# Average minutes on ground at JFK
avg_ground_jfk = inter + regression.coefficients[3]
print(avg_ground_jfk)

# Average minutes on ground at LGA
avg_ground_lga = inter + regression.coefficients[4]
print(avg_ground_lga)

1.384375825553331e+17
-5.5529772905668064e-14
-5.472517319447213e-14
-5.4802310249405505e-14


In [24]:
flights.show(5)

+---+---+---+-------+------+---+----+------+--------+-----+-------+-------------+--------------------+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|org_idx|    org_dummy|            features|
+---+---+---+-------+------+---+----+------+--------+-----+-------+-------------+--------------------+
| 11| 20|  6|     US|    19|JFK|2153|  9.48|     351| NULL|    2.0|(7,[2],[1.0])|[11.0,20.0,6.0,2....|
|  0| 22|  2|     UA|  1107|ORD| 316| 16.33|      82|   30|    0.0|(7,[0],[1.0])|[0.0,22.0,2.0,0.0...|
|  2| 20|  4|     UA|   226|SFO| 337|  6.17|      82|   -8|    1.0|(7,[1],[1.0])|[2.0,20.0,4.0,1.0...|
|  9| 13|  1|     AA|   419|ORD|1236| 10.33|     195|   -5|    0.0|(7,[0],[1.0])|[9.0,13.0,1.0,0.0...|
|  4|  2|  5|     AA|   325|ORD| 258|  8.92|      65| NULL|    0.0|(7,[0],[1.0])|[4.0,2.0,5.0,0.0,...|
+---+---+---+-------+------+---+----+------+--------+-----+-------+-------------+--------------------+
only showing top 5 rows



In [23]:
from pyspark.ml.feature import Bucketizer, OneHotEncoder

# Create buckets at 3 hour intervals through the day
buckets = Bucketizer(splits=[0, 3, 6, 9, 12, 15, 18, 21, 24], inputCol='depart', outputCol='depart_bucket')

# Bucket the departure times
bucketed = buckets.transform(flights)
bucketed.select('depart', 'depart_bucket').show(5)

# Create a one-hot encoder
onehot = OneHotEncoder(inputCols=['depart_bucket'], outputCols=['depart_dummy'])

# One-hot encode the bucketed departure times
flights_onehot = onehot.fit(bucketed).transform(bucketed)
flights_onehot.select('depart', 'depart_bucket', 'depart_dummy').show(5)

+------+-------------+
|depart|depart_bucket|
+------+-------------+
|  9.48|          3.0|
| 16.33|          5.0|
|  6.17|          2.0|
| 10.33|          3.0|
|  8.92|          2.0|
+------+-------------+
only showing top 5 rows

+------+-------------+-------------+
|depart|depart_bucket| depart_dummy|
+------+-------------+-------------+
|  9.48|          3.0|(7,[3],[1.0])|
| 16.33|          5.0|(7,[5],[1.0])|
|  6.17|          2.0|(7,[2],[1.0])|
| 10.33|          3.0|(7,[3],[1.0])|
|  8.92|          2.0|(7,[2],[1.0])|
+------+-------------+-------------+
only showing top 5 rows



In [28]:
# # Find the RMSE on testing data
# from pyspark.ml.evaluation import RegressionEvaluator
# rmse = RegressionEvaluator(labelCol='duration').evaluate(predictions)
# print("The test RMSE is", rmse)

# # Average minutes on ground at OGG for flights departing between 21:00 and 24:00
# avg_eve_ogg = regression.intercept
# print(avg_eve_ogg)

# # Average minutes on ground at OGG for flights departing between 03:00 and 06:00
# avg_night_ogg = regression.intercept + regression.coefficients[9]
# print(avg_night_ogg)

# # Average minutes on ground at JFK for flights departing between 03:00 and 06:00
# avg_night_jfk = regression.intercept + regression.coefficients[9] + regression.coefficients[3]
# print(avg_night_jfk)

In [29]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Fit linear regression model to training data
regression = LinearRegression(labelCol='duration').fit(flights_train)

# Make predictions on testing data
predictions = regression.transform(flights_test)

# Calculate the RMSE on testing data
rmse = RegressionEvaluator(labelCol='duration').evaluate(predictions)
print("The test RMSE is", rmse)

# Look at the model coefficients
coeffs = regression.coefficients
print(coeffs)

The test RMSE is 2.038988885911021e-14
[4.3340831942090705e-16,3.2006956885441626e-16,1.1283273924159106e-15,8.045997111959331e-16,7.274626562625612e-16,1.0000000000000002]


In [30]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Fit Lasso model (λ = 1, α = 1) to training data
regression = LinearRegression(labelCol='duration', regParam=1, elasticNetParam=1).fit(flights_train)

# Calculate the RMSE on testing data
rmse = RegressionEvaluator(labelCol='duration').evaluate(regression.transform(flights_test))
print("The test RMSE is", rmse)

# Look at the model coefficients
coeffs = regression.coefficients
print(coeffs)

# Number of zero coefficients
zero_coeff = sum([beta == 0 for beta in regression.coefficients])
print("Number of coefficients equal to 0:", zero_coeff)

The test RMSE is 0.9974271621644267
[0.0,0.0,0.0,0.0,0.0,0.9885175118869292]
Number of coefficients equal to 0: 5
