In [1]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Build the SparkSession used by every later cell; getOrCreate() reuses an
# already-running session instead of starting a second one.
# NOTE(review): "spark.some.config.option" looks like a placeholder setting --
# confirm whether it is actually needed.
spark = (
    SparkSession.builder
    .appName("Airline Flight Delay Classification using Logistic Regression ")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [2]:
# Load the preprocessed airline dataset from its Parquet file into a DataFrame.
scaledData = (
    spark.read
    .parquet("airline_dataset_processed.parquet")
)


In [3]:
# Preview the first 10 rows of the feature-vector column.
(
    scaledData
    .select("scaledFeatures")
    .show(10)
)

+--------------------+
|      scaledFeatures|
+--------------------+
|[0.0,0.0,0.0,0.0,...|
|[0.0,0.0,0.0,0.0,...|
|[0.0,0.0,0.0,0.0,...|
|[0.0,0.0,0.0,0.0,...|
|[0.0,0.0,0.0,0.0,...|
|[0.0,0.0,0.0,0.0,...|
|[0.0,0.0,0.0,0.0,...|
|[0.0,0.0,0.0,0.0,...|
|[0.0,0.0,0.0,0.0,...|
|[0.0,0.0,0.0,0.0,...|
+--------------------+
only showing top 10 rows



In [4]:
# Inspect the last 10 rows of the feature-vector column; the list of Row
# objects is the cell's final expression, so the notebook displays it.
(
    scaledData
    .select("scaledFeatures")
    .tail(10)
)

[Row(scaledFeatures=DenseVector([0.1371, 0.0214, 0.2, 1.0, 0.8182])),
 Row(scaledFeatures=DenseVector([0.0334, 0.0385, 0.2, 0.3333, 0.0])),
 Row(scaledFeatures=DenseVector([0.6828, 0.0769, 0.2, 0.6667, 0.6364])),
 Row(scaledFeatures=DenseVector([0.1579, 0.0, 0.0, 0.1667, 0.2727])),
 Row(scaledFeatures=DenseVector([0.7578, 0.2179, 0.2, 0.1667, 0.0909])),
 Row(scaledFeatures=DenseVector([0.6793, 0.1154, 0.6, 0.0, 0.9091])),
 Row(scaledFeatures=DenseVector([0.1404, 0.1068, 0.8, 0.0, 0.4545])),
 Row(scaledFeatures=DenseVector([0.2054, 0.1111, 0.6, 1.0, 0.5455])),
 Row(scaledFeatures=DenseVector([0.5497, 0.3291, 0.8, 0.8333, 0.4545])),
 Row(scaledFeatures=DenseVector([0.7508, 0.0299, 0.6, 1.0, 0.8182]))]

In [5]:

# Split the data 70/30 into training and test sets. The seed is fixed so the
# split -- and therefore the metrics reported below -- is repeatable when the
# notebook is re-run from a fresh kernel on the same input.
SPLIT_SEED = 42
(trainingData, testData) = scaledData.randomSplit([0.7, 0.3], seed=SPLIT_SEED)

# Logistic regression reading features from "scaledFeatures" and labels from
# "Flight Status".
lg = LogisticRegression(labelCol="Flight Status", featuresCol="scaledFeatures")

# Single-stage pipeline: the classifier is the only stage (no indexers or
# other transformers are chained in this notebook).
pipeline = Pipeline(stages=[lg])

# Fit the pipeline on the training split.
model = pipeline.fit(trainingData)

# Apply the fitted model to the held-out test split; the evaluation cell reads
# the "prediction" and "Flight Status" columns from this result.
predictions = model.transform(testData)

In [6]:
# Compare the "prediction" column against the "Flight Status" label and
# report accuracy on the test predictions.
evaluator = MulticlassClassificationEvaluator(
    labelCol="Flight Status",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)

# Test error is the complement of accuracy.
print("Test Error = %g " % (1.0 - accuracy))

Test Error = 0.33034 


In [7]:
# Report the accuracy computed by the evaluator in the previous cell.
print("Model Accuracy is : ", accuracy)

Model Accuracy is :  0.66966014555652
