In [33]:
# Import the SparkSession class
from pyspark.sql import SparkSession

# Create SparkSession object
spark = SparkSession.builder \
                    .master('local[*]') \
                    .appName('test') \
                    .getOrCreate()

# Read data from CSV file
flights = spark.read.csv("dataset/flights.csv",
                         sep=',',
                         header=True,
                         inferSchema=True,
                         nullValue='NA')


# View the first five records
flights.show(5)

# Check column data types
print(flights.dtypes)

+---+---+---+-------+------+---+----+------+--------+-----+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|
+---+---+---+-------+------+---+----+------+--------+-----+
| 11| 20|  6|     US|    19|JFK|2153|  9.48|     351| NULL|
|  0| 22|  2|     UA|  1107|ORD| 316| 16.33|      82|   30|
|  2| 20|  4|     UA|   226|SFO| 337|  6.17|      82|   -8|
|  9| 13|  1|     AA|   419|ORD|1236| 10.33|     195|   -5|
|  4|  2|  5|     AA|   325|ORD| 258|  8.92|      65| NULL|
+---+---+---+-------+------+---+----+------+--------+-----+
only showing top 5 rows

[('mon', 'int'), ('dom', 'int'), ('dow', 'int'), ('carrier', 'string'), ('flight', 'int'), ('org', 'string'), ('mile', 'int'), ('depart', 'double'), ('duration', 'int'), ('delay', 'int')]


# Encoding flight origin

The org column in the flights data is a categorical variable giving the airport from which a flight departs.
since this is a categorical variable, it needs to be one-hot encoded before it can be used in a regression model.

In [34]:
from pyspark.ml.feature import StringIndexer
indexer = StringIndexer(inputCol="org", outputCol="org_idx")
flights = indexer.fit(flights).transform(flights)
flights.show(3)

+---+---+---+-------+------+---+----+------+--------+-----+-------+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|org_idx|
+---+---+---+-------+------+---+----+------+--------+-----+-------+
| 11| 20|  6|     US|    19|JFK|2153|  9.48|     351| NULL|    2.0|
|  0| 22|  2|     UA|  1107|ORD| 316| 16.33|      82|   30|    0.0|
|  2| 20|  4|     UA|   226|SFO| 337|  6.17|      82|   -8|    1.0|
+---+---+---+-------+------+---+----+------+--------+-----+-------+
only showing top 3 rows



In [35]:
# Import the one hot encoder class
from pyspark.ml.feature import OneHotEncoder

# Create an instance of the one hot encoder
onehot = OneHotEncoder(inputCols=["org_idx"], outputCols=["org_dummy"])

# Apply the one hot encoder to the flights data
onehot = onehot.fit(flights)
flights_onehot = onehot.transform(flights)

# Check the results
flights_onehot.select('org', 'org_idx', 'org_dummy').distinct().orderBy('org_idx').show()

Exception ignored in: <function JavaWrapper.__del__ at 0x0000022762F7D430>
Traceback (most recent call last):
  File "c:\Users\88016\anaconda3\envs\env_py\lib\site-packages\pyspark\ml\wrapper.py", line 53, in __del__
    if SparkContext._active_spark_context and self._java_obj is not None:
AttributeError: 'LinearRegression' object has no attribute '_java_obj'


+---+-------+-------------+
|org|org_idx|    org_dummy|
+---+-------+-------------+
|ORD|    0.0|(7,[0],[1.0])|
|SFO|    1.0|(7,[1],[1.0])|
|JFK|    2.0|(7,[2],[1.0])|
|LGA|    3.0|(7,[3],[1.0])|
|SJC|    4.0|(7,[4],[1.0])|
|SMF|    5.0|(7,[5],[1.0])|
|TUS|    6.0|(7,[6],[1.0])|
|OGG|    7.0|    (7,[],[])|
+---+-------+-------------+



# Encoding shirt sizes

You have data for a consignment of t-shirts. The data includes the size of the shirt, which is given as either S, M, L or XL.

Here are the counts for the different sizes:
```
+----+-----+
|size|count|
+----+-----+
|   S|    8|
|   M|   15|
|   L|   20|
|  XL|    7|
+----+-----+
```
The sizes are first converted to an index using StringIndexer and then one-hot encoded using OneHotEncoder.

Which of the following is true:

- S shirts get index 2.0 and are one-hot encoded as `(3,[2],[1.0])`
- M shirts get index 1.0 and are one-hot encoded as `(3,[1],[1.0])`
- L shirts get index 0.0 and are one-hot encoded as `(3,[0],[1.0])`
- XL shirts get index 3.0 and are one-hot encoded as `(3,[3],[1.0])` (FALSE, it should be `(3,[],[])` for being least frequent)

# Flight duration model: Just distance

In this exercise you'll build a regression model to predict flight duration (the duration column).

For the moment you'll keep the model simple, including only the distance of the flight (the km column) as a predictor.

In [36]:
from pyspark.sql.functions import round
flights = flights.withColumn('km', round(flights.mile * 1.60934, 0)).drop('mile')
flights.show(3)

+---+---+---+-------+------+---+------+--------+-----+-------+------+
|mon|dom|dow|carrier|flight|org|depart|duration|delay|org_idx|    km|
+---+---+---+-------+------+---+------+--------+-----+-------+------+
| 11| 20|  6|     US|    19|JFK|  9.48|     351| NULL|    2.0|3465.0|
|  0| 22|  2|     UA|  1107|ORD| 16.33|      82|   30|    0.0| 509.0|
|  2| 20|  4|     UA|   226|SFO|  6.17|      82|   -8|    1.0| 542.0|
+---+---+---+-------+------+---+------+--------+-----+-------+------+
only showing top 3 rows



In [37]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler
vector_assembler = VectorAssembler(inputCols=['km'], outputCol='feature')

# Apply the VectorAssembler to the DataFrame
flights = vector_assembler.transform(flights)
flights_train , flights_test = flights.randomSplit([0.8, 0.2], seed=42)

# Create a regression object and train on training data
regression = LinearRegression(featuresCol= "feature", labelCol="duration").fit(flights_train)

# Create predictions for the testing data and take a look at the predictions
predictions = regression.transform(flights_test)
predictions.select('duration', 'prediction').show(5, False)

# Calculate the RMSE
RegressionEvaluator(labelCol="duration").evaluate(predictions)

+--------+------------------+
|duration|prediction        |
+--------+------------------+
|560     |560.7038535078656 |
|310     |346.93268176430854|
|90      |85.06866770419609 |
|130     |133.6152160803417 |
|251     |245.45375976931263|
+--------+------------------+
only showing top 5 rows



17.096730471353727

# Interpreting the coefficients

The linear regression model for flight duration as a function of distance takes the form of which you need to find co-efficients of the features

In [38]:
# Intercept (average minutes on ground)
inter = regression.intercept
print(inter)

# Coefficients
coefs = regression.coefficients
print(coefs)

# Average minutes per km
minutes_per_km = regression.coefficients[0]
print(minutes_per_km)

# Average speed in km per hour
avg_speed = 60 / minutes_per_km
print(avg_speed)

44.38635769427969
[0.07561767659835762]
0.07561767659835762
793.465267634303


# Flight duration model: Adding origin airport

Some airports are busier than others. Some airports are bigger than others too. Flights departing from large or busy airports are likely to spend more time taxiing or waiting for their takeoff slot. So it stands to reason that the duration of a flight might depend not only on the distance being covered but also the airport from which the flight departs.

You are going to make the regression model a little more sophisticated by including the departure airport as a predictor.

In [39]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Create a regression object and train on training data
regression = LinearRegression(featuresCol="feature", labelCol="duration").fit(flights_train)

# Create predictions for the testing data
predictions = regression.transform(flights_test)

# Calculate the RMSE on testing data
RegressionEvaluator(metricName="rmse",predictionCol="prediction", labelCol="duration").evaluate(predictions)

17.096730471353727

# Interpreting coefficients

In this exercise you'll be using the intercept and coefficients attributes to interpret the model.

In [40]:
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import VectorAssembler
onehot = OneHotEncoder(inputCol="org_idx", outputCol="onehot_features")
flights = flights.drop("feature")
flights = onehot.fit(flights).transform(flights)
assembler = VectorAssembler(inputCols=["onehot_features", "km"], outputCol="features")
flights = assembler.transform(flights)
flights_train , flights_test = flights.randomSplit([0.8, 0.2], seed=42)
regression = LinearRegression(featuresCol= "features", labelCol="duration").fit(flights_train)


In [44]:
predictions = regression.transform(flights_test)
predictions.show(3)

+---+---+---+-------+------+---+------+--------+-----+-------+------+---------------+--------------------+-----------------+
|mon|dom|dow|carrier|flight|org|depart|duration|delay|org_idx|    km|onehot_features|            features|       prediction|
+---+---+---+-------+------+---+------+--------+-----+-------+------+---------------+--------------------+-----------------+
|  0|  1|  2|     AA|    73|ORD|  9.08|     560|   39|    0.0|6828.0|  (7,[0],[1.0])|(8,[0,7],[1.0,682...|551.5973962731919|
|  0|  1|  2|     AA|   254|OGG| 15.33|     310|  173|    7.0|4001.0|      (7,[],[])|    (8,[7],[4001.0])|313.1084698157231|
|  0|  1|  2|     AA|   321|ORD| 14.17|      90| NULL|    0.0| 538.0|  (7,[0],[1.0])|(8,[0,7],[1.0,538...|84.30558533338989|
+---+---+---+-------+------+---+------+--------+-----+-------+------+---------------+--------------------+-----------------+
only showing top 3 rows



In [45]:
# Average speed in km per hour
avg_speed_hour = 60/regression.coefficients[0]
print(avg_speed_hour)

# Average minutes on ground at OGG
inter = regression.intercept
print(inter)

# Average minutes on ground at JFK
avg_ground_jfk = inter + regression.coefficients[3]
print(avg_ground_jfk)

# Average minutes on ground at LGA
avg_ground_lga = inter + regression.coefficients[4]
print(avg_ground_lga)

2.1076569514466468
15.86927497150248
62.631386351468024
34.156885367050286


# Bucketing departure time

Time of day data are a challenge with regression models. They are also a great candidate for bucketing.

In this lesson you will convert the flight departure times from numeric values between 0 (corresponding to 00:00) and 24 (corresponding to 24:00) to binned values. You'll then take those binned values and one-hot encode them.

In [46]:
from pyspark.ml.feature import Bucketizer, OneHotEncoder

# Create buckets at 3 hour intervals through the day
buckets = Bucketizer(splits=[0, 3, 6, 9, 12, 15, 18, 21, 24], inputCol="depart", outputCol="depart_bucket")

# Bucket the departure times
bucketed = buckets.transform(flights)
bucketed.select("depart", "depart_bucket").show(5)

# Create a one-hot encoder
onehot = OneHotEncoder(inputCols=["depart_bucket"], outputCols=["depart_dummy"])

# One-hot encode the bucketed departure times
flights_onehot = onehot.fit(bucketed).transform(bucketed)
flights_onehot.select("depart", "depart_bucket" , "depart_dummy").show(5)

+------+-------------+
|depart|depart_bucket|
+------+-------------+
|  9.48|          3.0|
| 16.33|          5.0|
|  6.17|          2.0|
| 10.33|          3.0|
|  8.92|          2.0|
+------+-------------+
only showing top 5 rows

+------+-------------+-------------+
|depart|depart_bucket| depart_dummy|
+------+-------------+-------------+
|  9.48|          3.0|(7,[3],[1.0])|
| 16.33|          5.0|(7,[5],[1.0])|
|  6.17|          2.0|(7,[2],[1.0])|
| 10.33|          3.0|(7,[3],[1.0])|
|  8.92|          2.0|(7,[2],[1.0])|
+------+-------------+-------------+
only showing top 5 rows



# Flight duration model: Adding departure time

In the previous exercise the departure time was bucketed and converted to dummy variables. Now you're going to include those dummy variables in a regression model for flight duration

In [49]:
flights_onehot.show(3)

+---+---+---+-------+------+---+------+--------+-----+-------+------+---------------+--------------------+-------------+-------------+
|mon|dom|dow|carrier|flight|org|depart|duration|delay|org_idx|    km|onehot_features|            features|depart_bucket| depart_dummy|
+---+---+---+-------+------+---+------+--------+-----+-------+------+---------------+--------------------+-------------+-------------+
| 11| 20|  6|     US|    19|JFK|  9.48|     351| NULL|    2.0|3465.0|  (7,[2],[1.0])|(8,[2,7],[1.0,346...|          3.0|(7,[3],[1.0])|
|  0| 22|  2|     UA|  1107|ORD| 16.33|      82|   30|    0.0| 509.0|  (7,[0],[1.0])|(8,[0,7],[1.0,509...|          5.0|(7,[5],[1.0])|
|  2| 20|  4|     UA|   226|SFO|  6.17|      82|   -8|    1.0| 542.0|  (7,[1],[1.0])|(8,[1,7],[1.0,542...|          2.0|(7,[2],[1.0])|
+---+---+---+-------+------+---+------+--------+-----+-------+------+---------------+--------------------+-------------+-------------+
only showing top 3 rows



In [50]:
flights = flights_onehot.drop("features","depart_bucket")
assembler = VectorAssembler(inputCols=["onehot_features", "km","depart_dummy"], outputCol="features")
flights = assembler.transform(flights)
flights_train , flights_test = flights.randomSplit([0.8, 0.2], seed=42)
regression = LinearRegression(featuresCol= "features", labelCol="duration").fit(flights_train)
flights.show(3)

+---+---+---+-------+------+---+------+--------+-----+-------+------+---------------+-------------+--------------------+
|mon|dom|dow|carrier|flight|org|depart|duration|delay|org_idx|    km|onehot_features| depart_dummy|            features|
+---+---+---+-------+------+---+------+--------+-----+-------+------+---------------+-------------+--------------------+
| 11| 20|  6|     US|    19|JFK|  9.48|     351| NULL|    2.0|3465.0|  (7,[2],[1.0])|(7,[3],[1.0])|(15,[2,7,11],[1.0...|
|  0| 22|  2|     UA|  1107|ORD| 16.33|      82|   30|    0.0| 509.0|  (7,[0],[1.0])|(7,[5],[1.0])|(15,[0,7,13],[1.0...|
|  2| 20|  4|     UA|   226|SFO|  6.17|      82|   -8|    1.0| 542.0|  (7,[1],[1.0])|(7,[2],[1.0])|(15,[1,7,10],[1.0...|
+---+---+---+-------+------+---+------+--------+-----+-------+------+---------------+-------------+--------------------+
only showing top 3 rows



In [51]:
# Find the RMSE on testing data
from pyspark.ml.evaluation import RegressionEvaluator
rmse = RegressionEvaluator(labelCol='duration').evaluate(predictions)
print("The test RMSE is", rmse)

# Average minutes on ground at OGG for flights departing between 21:00 and 24:00
avg_eve_ogg = regression.intercept + regression.coefficients[0]
print(avg_eve_ogg)

# Average minutes on ground at OGG for flights departing between 03:00 and 06:00
avg_night_ogg = regression.intercept + regression.coefficients[9]
print(avg_night_ogg)

# Average minutes on ground at JFK for flights departing between 03:00 and 06:00
avg_night_jfk = regression.intercept + regression.coefficients[3] + regression.coefficients[9]
print(avg_night_jfk)

The test RMSE is 11.06612411700972
37.600937704813745
11.108296473968375
56.847536232973404


# Flight duration model: More features!

Let's add more features to our model. This will not necessarily result in a better model. Adding some features might improve the model. Adding other features might make it worse.

More features will always make the model more complicated and difficult to interpret.

In [52]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Fit linear regression model to training data
regression = LinearRegression(labelCol="duration").fit(flights_train)

# Make predictions on testing data
predictions = regression.transform(flights_test)

# Calculate the RMSE on testing data
rmse = RegressionEvaluator(labelCol="duration").evaluate(predictions)
print("The test RMSE is", rmse)

# Look at the model coefficients
coeffs = regression.coefficients
print(coeffs)

The test RMSE is 10.739666414749705
[27.21148698285345,20.120371742311743,51.660771743136635,45.73923975900503,17.54570737854217,14.961066263965431,17.2773184530285,0.07437193971557562,-14.687939367843523,0.718845752008081,4.190872842376593,6.96789335124698,4.704065371399039,8.882119147405417,8.783608300881815]


# Flight duration model: Regularization!

In the previous exercise you added more predictors to the flight duration model. The model performed well on testing data, but with so many coefficients it was difficult to interpret.

In this exercise you'll use Lasso regression (regularized with a L1 penalty) to create a more parsimonious model. Many of the coefficients in the resulting model will be set to zero. This means that only a subset of the predictors actually contribute to the model. Despite the simpler model, it still produces a good RMSE on the testing data.

In [53]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Fit Lasso model (λ = 1, α = 1) to training data
regression = LinearRegression(labelCol="duration", regParam=1, elasticNetParam=1).fit(flights_train)

# Calculate the RMSE on testing data
rmse = RegressionEvaluator(labelCol="duration").evaluate(regression.transform(flights_test))
print("The test RMSE is", rmse)

# Look at the model coefficients
coeffs = regression.coefficients
print(coeffs)

# Number of zero coefficients
zero_coeff = sum([beta==0 for beta in regression.coefficients])
print("Number of coefficients equal to 0:", zero_coeff)

The test RMSE is 11.633714942221816
[5.50105829928655,0.0,28.8707664794491,22.01523873805985,0.0,-2.3916616818485816,0.0,0.07351140786201599,0.0,0.0,0.0,0.0,0.0,1.0280434522136144,1.1412353904957169]
Number of coefficients equal to 0: 8
