Derive the name of the Google Cloud Storage bucket where your raw files are hosted:

In [1]:
PROJECT=!gcloud config get-value project
PROJECT=PROJECT[0]
BUCKET = PROJECT + '-dsongcp'
import os
os.environ['BUCKET'] = PROJECT + '-dsongcp'

Create a spark session using the following code block:

In [2]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
# Start a SparkContext with master 'local' and app name 'logistic', then get
# (or create) the SparkSession used for DataFrame / SQL work.
sc = SparkContext('local', 'logistic')
spark = (
    SparkSession.builder
    .appName("Logistic regression w/ Spark ML")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/04 05:22:47 INFO org.apache.spark.SparkEnv: Registering MapOutputTracker
24/04/04 05:22:47 INFO org.apache.spark.SparkEnv: Registering BlockManagerMaster
24/04/04 05:22:47 INFO org.apache.spark.SparkEnv: Registering BlockManagerMasterHeartbeat
24/04/04 05:22:47 INFO org.apache.spark.SparkEnv: Registering OutputCommitCoordinator


### Create a Spark DataFrame

In [3]:
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.regression import LabeledPoint

Read the dataset

In [4]:
# trainday.csv has a header row; each row flags whether a flight date belongs
# to the training set.
traindays_path = 'gs://{}/flights/trainday.csv'.format(BUCKET)
traindays = spark.read.option("header", "true").csv(traindays_path)
traindays.createOrReplaceTempView('traindays')

                                                                                

Create a SparkSQL view

In [5]:
# Register the DataFrame as temp view 'traindays' so it can be queried with spark.sql().
# (The read cell above makes the same call, so this one simply replaces the view.)
traindays.createOrReplaceTempView('traindays')

In [6]:
# Show five rows of the view to confirm the header row was used for column names.
spark.sql("SELECT * from traindays LIMIT 5").show()

+----------+------------+
|   FL_DATE|is_train_day|
+----------+------------+
|2015-01-01|        True|
|2015-01-02|       False|
|2015-01-03|       False|
|2015-01-04|        True|
|2015-01-05|        True|
+----------+------------+



In [7]:
# Wildcard path matching every all_flights-* file under flights/tzcorr/ in the bucket.
inputs = 'gs://{}/flights/tzcorr/all_flights-*'.format(BUCKET)

Read the data into Spark SQL from the input file you created:

In [8]:
# Load the flight records as JSON and expose them to Spark SQL as view 'flights'.
flights = spark.read.json(inputs)
flights.createOrReplaceTempView('flights')

                                                                                

In [9]:
# Pull the columns used for the label (ARR_DELAY) and the features
# (DEP_DELAY, TAXI_OUT, DISTANCE) for flights whose date is a training day.
# is_train_day is compared against the string 'True'.
trainquery = """
SELECT
  DEP_DELAY, TAXI_OUT, ARR_DELAY, DISTANCE
FROM flights f
JOIN traindays t
ON f.FL_DATE == t.FL_DATE
WHERE
  t.is_train_day == 'True'
"""
traindata = spark.sql(trainquery)

In [10]:
# Inspect two rows. In the output below, DISTANCE is a string ('425.00')
# while the delay and taxi-out columns are floats.
print(traindata.head(2))

[Row(DEP_DELAY=-6.0, TAXI_OUT=18.0, ARR_DELAY=-11.0, DISTANCE='425.00'), Row(DEP_DELAY=-1.0, TAXI_OUT=14.0, ARR_DELAY=-8.0, DISTANCE='425.00')]


In [None]:
# Summary statistics for each selected column of the unfiltered training data.
traindata.describe().show()



In [None]:
# Same selection as the first training query, now also excluding rows whose
# dep_delay or arr_delay is NULL, then re-check the summary statistics.
trainquery = """
SELECT
  DEP_DELAY, TAXI_OUT, ARR_DELAY, DISTANCE
FROM flights f
JOIN traindays t
ON f.FL_DATE == t.FL_DATE
WHERE
  t.is_train_day == 'True' AND
  f.dep_delay IS NOT NULL AND 
  f.arr_delay IS NOT NULL
"""
traindata = spark.sql(trainquery)
traindata.describe().show()

In [None]:
# NOTE(review): this cell is identical to the preceding one (same query text,
# same describe() call). Re-running it rebuilds the same traindata; if a
# different filter was intended here, it is missing -- confirm.
trainquery = """
SELECT
  DEP_DELAY, TAXI_OUT, ARR_DELAY, DISTANCE
FROM flights f
JOIN traindays t
ON f.FL_DATE == t.FL_DATE
WHERE
  t.is_train_day == 'True' AND
  f.dep_delay IS NOT NULL AND 
  f.arr_delay IS NOT NULL
"""
traindata = spark.sql(trainquery)
traindata.describe().show()

In [None]:
def to_example(fields):
    """Turn one query row into a LabeledPoint for logistic regression.

    Args:
        fields: a row supporting lookup by column name; must provide
            ARR_DELAY, DEP_DELAY, TAXI_OUT and DISTANCE.

    Returns:
        LabeledPoint whose label is 1.0 when ARR_DELAY < 15 (on time) and 0.0
        otherwise, with features in the order [DEP_DELAY, TAXI_OUT, DISTANCE].
    """
    on_time = float(fields['ARR_DELAY'] < 15)
    # Values are passed through as-is. DISTANCE shows up as a string in the
    # head() output earlier -- presumably coerced to float when the feature
    # vector is built; TODO confirm.
    features = [
        fields['DEP_DELAY'],
        fields['TAXI_OUT'],
        fields['DISTANCE'],
    ]
    return LabeledPoint(on_time, features)

In [None]:
# Convert every training row into a LabeledPoint via to_example.
examples = traindata.rdd.map(to_example)

In [None]:
# Fit the logistic regression on the training examples, including an intercept term.
lrmodel = LogisticRegressionWithLBFGS.train(examples, intercept=True)

In [None]:
# One weight per feature, in to_example's order: DEP_DELAY, TAXI_OUT, DISTANCE.
print(lrmodel.weights,lrmodel.intercept)

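The next cell prints the model's prediction for a single flight. A result of 1 means the flight is predicted to arrive on time (arrival delay under 15 minutes).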

In [None]:
# Feature order is [DEP_DELAY, TAXI_OUT, DISTANCE]: 6 min departure delay here.
print(lrmodel.predict([6.0,12.0,594.0]))

In [None]:
# Same flight with a 36 min departure delay ([DEP_DELAY, TAXI_OUT, DISTANCE]).
print(lrmodel.predict([36.0,12.0,594.0]))

In [None]:
# Clear the decision threshold, then score the same two flights as above
# (features are [DEP_DELAY, TAXI_OUT, DISTANCE]).
lrmodel.clearThreshold()
for features in ([6.0, 12.0, 594.0], [36.0, 12.0, 594.0]):
    print(lrmodel.predict(features))

In [None]:
# Set a 0.7 decision threshold and score the same two flights again
# (features are [DEP_DELAY, TAXI_OUT, DISTANCE]).
lrmodel.setThreshold(0.7)
for features in ([6.0, 12.0, 594.0], [36.0, 12.0, 594.0]):
    print(lrmodel.predict(features))

In [None]:
# Destination for the saved model. Anything already stored at that path is
# removed first; the command's exit status is the cell's output and is not checked.
MODEL_FILE = 'gs://{}/flights/sparkmloutput/model'.format(BUCKET)
os.system('gsutil -m rm -r {}'.format(MODEL_FILE))

In [None]:
# Persist the trained model to MODEL_FILE using the SparkContext.
lrmodel.save(sc, MODEL_FILE)
print('{} saved'.format(MODEL_FILE))

In [None]:
# Replace the in-memory model with a placeholder value -- presumably so the
# following cell visibly restores it from storage.
lrmodel = 0
print(lrmodel)

In [None]:
from pyspark.mllib.classification import LogisticRegressionModel
# Load the model back from MODEL_FILE and set the 0.7 decision threshold on it.
lrmodel = LogisticRegressionModel.load(sc, MODEL_FILE)
lrmodel.setThreshold(0.7)

In [None]:
# Reloaded model, threshold 0.7: [DEP_DELAY=36, TAXI_OUT=12, DISTANCE=594].
print(lrmodel.predict([36.0,12.0,594.0]))

In [None]:
# Reloaded model, threshold 0.7: [DEP_DELAY=8, TAXI_OUT=4, DISTANCE=594].
print(lrmodel.predict([8.0,4.0,594.0]))

In [None]:
# The plotting cells further down rely on the threshold staying cleared.
lrmodel.clearThreshold() # to make the model produce probabilities
# Features are [DEP_DELAY, TAXI_OUT, DISTANCE].
print(lrmodel.predict([20, 10, 500]))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# Sweep distance while holding DEP_DELAY=20 and TAXI_OUT=10 fixed
# (feature order is [DEP_DELAY, TAXI_OUT, DISTANCE]). Expects the model's
# threshold to be cleared so predict() yields a probability.
dist = np.arange(10, 2000, 10)
prob = [lrmodel.predict([20, 10, d]) for d in dist]

sns.set_style("whitegrid")
# Explicit Axes interface: plt.plot() returns a list of Line2D objects, so the
# previous `ax = plt.plot(...)` never held an Axes.
fig, ax = plt.subplots()
ax.plot(dist, prob)
ax.set_xlabel('distance (miles)')
ax.set_ylabel('probability of ontime arrival');

In [None]:
# Sweep departure delay while holding TAXI_OUT=10 and DISTANCE=500 fixed
# (feature order is [DEP_DELAY, TAXI_OUT, DISTANCE]). Expects the model's
# threshold to be cleared so predict() yields a probability.
delay = np.arange(-20, 60, 1)
prob = [lrmodel.predict([d, 10, 500]) for d in delay]

# Explicit Axes interface: plt.plot() returns a list of Line2D objects, so the
# previous `ax = plt.plot(...)` never held an Axes.
fig, ax = plt.subplots()
ax.plot(delay, prob)
ax.set_xlabel('departure delay (minutes)')
ax.set_ylabel('probability of ontime arrival');

In [None]:
# Re-read the flights files (same path pattern as the training section) and
# re-register the 'flights' view.
inputs = 'gs://{}/flights/tzcorr/all_flights-*'.format(BUCKET)
flights = spark.read.json(inputs)
flights.createOrReplaceTempView('flights')

# Build the held-out query by flipping the training-day filter in trainquery.
# str.replace() returns its input unchanged when the pattern is absent, which
# would silently evaluate the model on the training days -- so check first.
train_filter = "t.is_train_day == 'True'"
assert train_filter in trainquery, 'trainquery does not contain the training-day filter'
testquery = trainquery.replace(train_filter, "t.is_train_day == 'False'")

In [None]:
# Run the held-out query and convert its rows to LabeledPoints.
# Note: this rebinds `examples`, which previously held the training examples.
testdata = spark.sql(testquery)
examples = testdata.rdd.map(to_example)

In [None]:
# Summary statistics for the held-out rows.
testdata.describe().show()

In [None]:
def eval(labelpred, threshold=0.7):
    """Summarize how well thresholded predictions agree with the labels.

    NOTE: this name shadows Python's built-in ``eval``; it is kept because
    later cells call ``eval(labelpred)``.

    Args:
        labelpred: RDD-like collection of ``(label, pred)`` pairs, where
            ``data[0]`` is the label and ``data[1]`` is the prediction.
            Only ``.filter()`` and ``.count()`` are used.
        threshold: rows with ``pred < threshold`` form the "cancel" group and
            rows with ``pred >= threshold`` form the "no cancel" group.
            Defaults to 0.7.

    Returns:
        dict with:
            total_cancel / total_noncancel: number of rows in each group.
            correct_cancel / correct_noncancel: fraction of each group whose
                label matches the thresholded prediction (0 for "cancel",
                1 for "no cancel"); 0.0 when the group is empty.
    """
    cancel = labelpred.filter(lambda data: data[1] < threshold)
    nocancel = labelpred.filter(lambda data: data[1] >= threshold)

    # Every row in `cancel` has a thresholded prediction of 0 and every row
    # in `nocancel` has 1, so correctness reduces to a label comparison.
    corr_cancel = cancel.filter(lambda data: data[0] == 0).count()
    corr_nocancel = nocancel.filter(lambda data: data[0] == 1).count()

    # Count each group once; every count() is a separate pass over the data.
    total_cancel = cancel.count()
    total_nocancel = nocancel.count()

    # max(..., 1) avoids dividing by zero for an empty group.
    return {
        'total_cancel': total_cancel,
        'correct_cancel': float(corr_cancel) / max(total_cancel, 1),
        'total_noncancel': total_nocancel,
        'correct_noncancel': float(corr_nocancel) / max(total_nocancel, 1),
    }

In [None]:
# Score every held-out example, pairing its true label with the model output.
lrmodel.clearThreshold() # so it returns probabilities
labelpred = examples.map(lambda p: (p.label, lrmodel.predict(p.features)))
print('All flights:')
# `eval` here is the notebook's own function defined above, not the Python builtin.
print(eval(labelpred))

In [None]:
# Keep only predictions strictly between 0.65 and 0.75 (i.e. around the 0.7
# cut-off) and evaluate that subset. Note that labelpred is rebound to the subset.
print('Flights near decision threshold:')
labelpred = labelpred.filter(lambda data: 0.65 < data[1] < 0.75)
print(eval(labelpred))