In [1]:
# Initialise findspark first, before any pyspark module is imported.
import findspark
findspark.init()

# New Section

In [2]:
from pyspark import SparkConf, SparkContext

# Reuse the active SparkContext when this cell is re-run. Calling the
# SparkContext constructor a second time in the same kernel raises
# "Cannot run multiple SparkContexts at once", so getOrCreate keeps the cell
# idempotent while still requesting the same master and application name.
sc = SparkContext.getOrCreate(
    conf=SparkConf().setMaster("local").setAppName("New Spark Context")
)

In [3]:
# import libraries
from pyspark import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from datetime import datetime
from pyspark.sql.functions import mean, stddev, col, log
from pyspark.sql.functions import to_date, dayofweek, to_timestamp
from pyspark.sql import types 
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DateType
from pyspark.sql.functions import year, month
from pyspark.sql.functions import dayofmonth, weekofyear
from pyspark.sql.functions import split, explode
from pyspark.sql.functions import coalesce, first, lit
from pyspark.ml.feature import Binarizer
from pyspark.ml.feature import Bucketizer
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.sql.functions import regexp_extract, col
from pyspark.sql.functions import datediff
from pyspark.sql.functions import when

from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import LinearRegressionModel

In [4]:
# Obtain the session through the builder: it attaches to the SparkContext that
# is already running and returns the existing session on re-runs instead of
# constructing a fresh SparkSession object every time the cell executes.
spark = SparkSession.builder.getOrCreate()

In [5]:
# Load the flights CSV. The first row supplies the column names and Spark
# infers each column's type from the file contents.
# NOTE(review): the schema printed below reports `delay` as string, presumably
# because missing delays are written as "NA" -- confirm before using `delay`
# as a numeric column.
data = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("flights.csv")
)

In [6]:
# Print the inferred schema (column name, type and nullability) of the DataFrame.
data.printSchema()

root
 |-- mon: integer (nullable = true)
 |-- dom: integer (nullable = true)
 |-- dow: integer (nullable = true)
 |-- carrier: string (nullable = true)
 |-- flight: integer (nullable = true)
 |-- org: string (nullable = true)
 |-- mile: integer (nullable = true)
 |-- depart: double (nullable = true)
 |-- duration: integer (nullable = true)
 |-- delay: string (nullable = true)



In [7]:
# Report the DataFrame's shape as a (row count, column count) tuple.
n_rows = data.count()
n_cols = len(data.columns)
print((n_rows, n_cols))

(50000, 10)


In [12]:
# Preview the first five rows of the raw DataFrame.
data.show(5)

+---+---+---+-------+------+---+----+------+--------+-----+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|
+---+---+---+-------+------+---+----+------+--------+-----+
| 11| 20|  6|     US|    19|JFK|2153|  9.48|     351|   NA|
|  0| 22|  2|     UA|  1107|ORD| 316| 16.33|      82|   30|
|  2| 20|  4|     UA|   226|SFO| 337|  6.17|      82|   -8|
|  9| 13|  1|     AA|   419|ORD|1236| 10.33|     195|   -5|
|  4|  2|  5|     AA|   325|ORD| 258|  8.92|      65|   NA|
+---+---+---+-------+------+---+----+------+--------+-----+
only showing top 5 rows



In [10]:
# Print every field of the first row, one value per line.
first_row = data.head()
print(*first_row, sep="\n")

11
20
6
US
19
JFK
2153
9.48
351
NA


In [11]:
# List the column names in DataFrame order.
data.columns

['mon',
 'dom',
 'dow',
 'carrier',
 'flight',
 'org',
 'mile',
 'depart',
 'duration',
 'delay']

In [16]:
# Pack the single predictor column, 'mile', into the vector column 'features'
# that the regression stage consumes.
predictor_columns = ['mile']
assembler = VectorAssembler(outputCol='features', inputCols=predictor_columns)

In [17]:
# Apply the assembler: every row gains a 'features' vector column.
data_pre = assembler.transform(data)

In [19]:
# Show two rows without truncating cell contents so the vectors are fully visible.
data_pre.show(n=2, truncate=False)

+---+---+---+-------+------+---+----+------+--------+-----+--------+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|features|
+---+---+---+-------+------+---+----+------+--------+-----+--------+
|11 |20 |6  |US     |19    |JFK|2153|9.48  |351     |NA   |[2153.0]|
|0  |22 |2  |UA     |1107  |ORD|316 |16.33 |82      |30   |[316.0] |
+---+---+---+-------+------+---+----+------+--------+-----+--------+
only showing top 2 rows



In [20]:
# Keep only what the regression needs: the feature vector and the 'duration' label.
final_data = data_pre.select('features','duration')

In [21]:
# Spot-check the first two (features, duration) rows.
final_data.show(2)

+--------+--------+
|features|duration|
+--------+--------+
|[2153.0]|     351|
| [316.0]|      82|
+--------+--------+
only showing top 2 rows



In [22]:
# NA / NULL check: record the row count before null rows are dropped.
final_data.count()

50000

In [23]:
# Remove every row containing a null, then recount; an unchanged count means
# no rows were dropped.
final_data = final_data.dropna()
final_data.count()

50000

In [24]:
# Preview the cleaned modelling table.
final_data.show(5)

+--------+--------+
|features|duration|
+--------+--------+
|[2153.0]|     351|
| [316.0]|      82|
| [337.0]|      82|
|[1236.0]|     195|
| [258.0]|      65|
+--------+--------+
only showing top 5 rows



In [25]:
# Split roughly 70% / 30% into training and test sets. The seed is fixed so the
# split -- and every coefficient and metric derived from it -- can be
# reproduced after a kernel restart instead of changing on each run.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [26]:
# Summary statistics (count, mean, stddev, min, max) for the training split.
train_data.describe().show()

+-------+------------------+
|summary|          duration|
+-------+------------------+
|  count|             34935|
|   mean|151.90880206097037|
| stddev|  87.2657970175993|
|    min|                30|
|    max|               560|
+-------+------------------+



In [27]:
# Same summary statistics for the test split, to compare against the training split.
test_data.describe().show()

+-------+------------------+
|summary|          duration|
+-------+------------------+
|  count|             15065|
|   mean|151.43425157650182|
| stddev| 86.53303542122217|
|    min|                30|
|    max|               560|
+-------+------------------+



## Build Model

In [28]:
# Configure (without fitting yet) a linear regression estimator: it reads the
# 'features' vector, learns the 'duration' label, and writes its predictions to
# a column named 'Predict_duration'.
lr = LinearRegression(
    labelCol='duration',
    featuresCol='features',
    predictionCol='Predict_duration',
)

In [29]:
# Fit the estimator on the training split only; the fitted model is lrModel.
lrModel = lr.fit(train_data)

In [30]:
# Print the fitted coefficients and intercept of the linear regression.
fitted_coefficients = lrModel.coefficients
fitted_intercept = lrModel.intercept
print("Coefficients: {} Intercept: {}".format(fitted_coefficients, fitted_intercept))

Coefficients: [0.12168167545683359] Intercept: 44.32673256380748


In [31]:
# Evaluate the fitted model on the held-out test split; the returned summary
# exposes the residuals and error metrics used in the following cells.
test_results = lrModel.evaluate(test_data)

In [32]:
# Residuals (actual duration minus predicted duration) for the first five test rows.
test_results.residuals.show(5)

+------------------+
|         residuals|
+------------------+
|-9.479404819415336|
|-9.479404819415336|
|-8.479404819415336|
|-6.479404819415336|
|-6.479404819415336|
+------------------+
only showing top 5 rows



In [33]:
# Report the test-set error metrics taken from the evaluation summary.
rmse = test_results.rootMeanSquaredError
mse = test_results.meanSquaredError
r2 = test_results.r2
print("RMSE: {} ".format(rmse))
print("MSE: {}".format(mse))
print("r2: {}".format(r2))


RMSE: 16.97452254898054 
MSE: 288.1344157658488
r2: 0.961517782708794


In [47]:
# Score the test split: transform() appends the 'Predict_duration' column.
test_model = lrModel.transform(test_data)

In [48]:
#Inspect results

In [49]:
from pyspark.ml.evaluation import RegressionEvaluator

In [53]:
# Cross-check the RMSE with a standalone evaluator run on the scored test set.
# 'rmse' is the evaluator's default metric; it is spelled out here for clarity.
rmse_evaluator = RegressionEvaluator(
    labelCol='duration',
    predictionCol='Predict_duration',
    metricName='rmse',
)
print("RMSE: ", rmse_evaluator.evaluate(test_model))

RMSE:  16.97452254898054


In [37]:
# Compare predicted and actual duration side by side for the first five test rows.
test_model.select("Predict_duration", "duration").show(5)

+------------------+--------+
|  Predict_duration|duration|
+------------------+--------+
|52.479404819415336|      43|
|52.479404819415336|      43|
|52.479404819415336|      44|
|52.479404819415336|      46|
|52.479404819415336|      46|
+------------------+--------+
only showing top 5 rows



In [54]:
# Save the fitted model. A plain save() raises once the target path exists,
# which breaks the notebook on its second run; going through the writer with
# overwrite() replaces any previously saved copy so the cell stays re-runnable.
lrModel.write().overwrite().save('lrModel_Flights_50k')

In [55]:
# Load the persisted model back from the path it was saved to.
lrModel2 = LinearRegressionModel.load('lrModel_Flights_50k')

In [56]:
# Simulate new, unlabeled input by keeping only the 'features' column of the test split.
unlabeled_data = test_data.select('features')

In [57]:
# Score the unlabeled rows with the reloaded model.
predictions = lrModel2.transform(unlabeled_data)

In [58]:
# Preview the predictions produced by the reloaded model.
predictions.show(5)

+--------+------------------+
|features|  Predict_duration|
+--------+------------------+
|  [67.0]|52.479404819415336|
|  [67.0]|52.479404819415336|
|  [67.0]|52.479404819415336|
|  [67.0]|52.479404819415336|
|  [67.0]|52.479404819415336|
+--------+------------------+
only showing top 5 rows



In [59]:
# Show the labelled test rows for comparison with the predicted durations.
test_data.show(5)

+--------+--------+
|features|duration|
+--------+--------+
|  [67.0]|      43|
|  [67.0]|      43|
|  [67.0]|      44|
|  [67.0]|      46|
|  [67.0]|      46|
+--------+--------+
only showing top 5 rows

