In [18]:
# CELL 1: SETUP
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date, dayofweek

spark = SparkSession.builder.appName("FlightPulse-ML").getOrCreate()

df = spark.read.csv("flights.csv", header=True, inferSchema=True)

df_transformed = df.withColumn(
    "flight_date_formatted", 
    to_date(col("FLIGHT_DATE"), "dd-MM-yyyy")
).withColumn(
    "day_of_week",
    dayofweek(col("flight_date_formatted"))
)
print("Setup Complete: 'df_transformed' is ready.")

Setup Complete: 'df_transformed' is ready.


In [19]:
# CELL 2 (FINAL CORRECTED Version): FEATURE ENGINEERING
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler

df_with_label = df_transformed.withColumn("label", (col("HAUL") == "LONG").cast("integer"))

# We are removing ALL the perfect predictors to create a real challenge.
categorical_cols = ["TIME_OF_DAY"] 
numerical_cols = ["FIRST_CLASS_SEATS", "BUSINESS_CLASS_SEATS", "ECONOMY_SEATS", "day_of_week"]

# --- THIS IS THE CORRECTED PART ---
# The indexers stage remains the same
indexers = [StringIndexer(inputCol=c, outputCol=f"{c}_index", handleInvalid="keep") for c in categorical_cols]

# This line is now corrected to properly loop through the categorical_cols list.
assembler_inputs = [f"{c}_index" for c in categorical_cols] + numerical_cols
assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="features")

pipeline = Pipeline(stages=indexers + [assembler])
ml_ready_data = pipeline.fit(df_with_label).transform(df_with_label)

print("Feature Engineering Complete: 'ml_ready_data' is ready for training.")
ml_ready_data.select("HAUL", "label", "features").show(5, truncate=False)

Feature Engineering Complete: 'ml_ready_data' is ready for training.
+-----+-----+------------------------+
|HAUL |label|features                |
+-----+-----+------------------------+
|LONG |1    |[2.0,8.0,49.0,178.0,3.0]|
|LONG |1    |[0.0,8.0,49.0,178.0,3.0]|
|SHORT|0    |[2.0,0.0,17.0,163.0,2.0]|
|SHORT|0    |[1.0,0.0,8.0,172.0,1.0] |
|SHORT|0    |[1.0,0.0,13.0,167.0,2.0]|
+-----+-----+------------------------+
only showing top 5 rows



In [20]:
# Import the Logistic Regression model from Spark's ML library
from pyspark.ml.classification import LogisticRegression

# --- 1. Split the Data ---
# It's a standard and crucial practice in machine learning to split your data into two sets:
# - A 'training' set (which we use to teach the model)
# - A 'testing' set (which we use to evaluate how well the model learned, on data it has never seen before)
# We'll use an 80/20 split.
(training_data, testing_data) = ml_ready_data.randomSplit([0.8, 0.2], seed=42)

print("Data has been split into training and testing sets.")
print(f"Training set count: {training_data.count()}")
print(f"Testing set count: {testing_data.count()}")


# --- 2. Create and Train the Model ---
# Create an instance of the Logistic Regression algorithm
lr = LogisticRegression(featuresCol='features', labelCol='label')

# Train the model by fitting it to our training data
print("\nTraining the Logistic Regression model...")
lr_model = lr.fit(training_data)
print("Model training complete!")


# --- 3. Make Predictions ---
# Now, let's see how our trained model performs on the 'testing' data it has never seen
print("\nMaking predictions on the testing data...")
predictions = lr_model.transform(testing_data)

# Show a few key columns from the predictions DataFrame
# - 'label' is the actual, true answer.
# - 'prediction' is what our model guessed.
predictions.select("label", "prediction", "probability").show(10)

Data has been split into training and testing sets.
Training set count: 8079
Testing set count: 1921

Training the Logistic Regression model...
Model training complete!

Making predictions on the testing data...
+-----+----------+--------------------+
|label|prediction|         probability|
+-----+----------+--------------------+
|    0|       0.0|           [1.0,0.0]|
|    1|       1.0|[5.88202104098990...|
|    0|       0.0|[0.99999999994785...|
|    0|       0.0|[0.99999995503898...|
|    1|       1.0|[5.88202104098990...|
|    1|       1.0|[6.93720081605577...|
|    1|       1.0|[6.93720081605577...|
|    0|       0.0|           [1.0,0.0]|
|    0|       0.0|           [1.0,0.0]|
|    1|       1.0|[4.89317936678709...|
+-----+----------+--------------------+
only showing top 10 rows



In [21]:
# --- 1. A Deeper Look: The Confusion Matrix ---
# A confusion matrix is a table that gives us a more detailed breakdown of our model's performance.
print("Confusion Matrix:")
confusion_matrix = predictions.groupBy("label", "prediction").count()
confusion_matrix.show()


# --- 2. Calculate Accuracy Manually ---
# We can calculate accuracy from the confusion matrix.
# It's (Correct Predictions) / (Total Predictions)

# Correct predictions are where label == prediction
correct_predictions = predictions.filter("label == prediction").count()

# Total predictions is just the total number of rows in our testing set
total_predictions = predictions.count()

# Calculate accuracy
accuracy = correct_predictions / total_predictions

print("\n--- Logistic Regression Model Performance ---")
print(f"Total Predictions: {total_predictions}")
print(f"Correct Predictions: {correct_predictions}")
print(f"Accuracy: {accuracy * 100:.2f}%")

Confusion Matrix:
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    0|       0.0| 1163|
|    1|       1.0|  758|
+-----+----------+-----+


--- Logistic Regression Model Performance ---
Total Predictions: 1921
Correct Predictions: 1921
Accuracy: 100.00%
