In [2]:
# ===================================================================
# === FINAL DEFINITIVE SCRIPT (VERSION 3 - CORRECTED) =============
# ===================================================================

# === Imports ===
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date, dayofweek
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression

# === Step 1: Create Spark Session ===
spark = SparkSession.builder.appName("FlightPulse-Final-ML-V3").getOrCreate()
print("--- SparkSession Created ---")


# === Step 2: Load and Prepare Base DataFrame ===
df = spark.read.csv("flights.csv", header=True, inferSchema=True)
df_transformed = df.withColumn(
    "flight_date_formatted", 
    to_date(col("FLIGHT_DATE"), "dd-MM-yyyy")
).withColumn(
    "day_of_week",
    dayofweek(col("flight_date_formatted"))
)
df_transformed.unpersist()
print("--- Base DataFrame Prepared and Cache Cleared ---")


# === Step 3: Final Feature Engineering (DEFINITIVELY NO LEAKS) ===
df_with_label = df_transformed.withColumn("label", (col("HAUL") == "LONG").cast("integer"))

categorical_cols = ["TIME_OF_DAY"] 
numerical_cols = ["day_of_week"]

# --- THIS IS THE CORRECTED PART ---
indexers = [StringIndexer(inputCol=c, outputCol=f"{c}_index", handleInvalid="keep") for c in categorical_cols]
assembler_inputs = [f"{c}_index" for c in categorical_cols] + numerical_cols
assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="features")

pipeline = Pipeline(stages=indexers + [assembler])
ml_ready_data = pipeline.fit(df_with_label).transform(df_with_label)
print("--- Feature Engineering Complete ---")


# === Step 4: Split, Train, and Predict ===
(training_data, testing_data) = ml_ready_data.randomSplit([0.8, 0.2], seed=42)

lr = LogisticRegression(featuresCol='features', labelCol='label')
lr_model = lr.fit(training_data)
predictions = lr_model.transform(testing_data)
print("--- Model Trained and Predictions Made ---")


# === Step 5: Final Evaluation ===
correct_predictions = predictions.filter("label == prediction").count()
total_predictions = predictions.count()

if total_predictions > 0:
    accuracy = correct_predictions / total_predictions
else:
    accuracy = 0

print("\n--- Final, Realistic Model Performance ---")
print("Confusion Matrix:")
predictions.groupBy("label", "prediction").count().show()
print(f"Accuracy: {accuracy * 100:.2f}%")

--- SparkSession Created ---
--- Base DataFrame Prepared and Cache Cleared ---
--- Feature Engineering Complete ---
--- Model Trained and Predictions Made ---

--- Final, Realistic Model Performance ---
Confusion Matrix:
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0|  758|
|    0|       0.0| 1163|
+-----+----------+-----+

Accuracy: 60.54%


In [3]:
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier

# We will reuse our 'training_data' and 'testing_data' from before.
total_predictions = testing_data.count()

# --- Train a Decision Tree Model ---
print("--- Training Decision Tree Model ---")
dt = DecisionTreeClassifier(featuresCol='features', labelCol='label')
dt_model = dt.fit(training_data)
dt_predictions = dt_model.transform(testing_data)

dt_correct_predictions = dt_predictions.filter("label == prediction").count()
dt_accuracy = dt_correct_predictions / total_predictions if total_predictions > 0 else 0

print(f"Decision Tree Accuracy: {dt_accuracy * 100:.2f}%")
print("Decision Tree Confusion Matrix:")
dt_predictions.groupBy("label", "prediction").count().show()


# --- Train a Random Forest Model ---
print("\n--- Training Random Forest Model ---")
rf = RandomForestClassifier(featuresCol='features', labelCol='label')
rf_model = rf.fit(training_data)
rf_predictions = rf_model.transform(testing_data)

rf_correct_predictions = rf_predictions.filter("label == prediction").count()
rf_accuracy = rf_correct_predictions / total_predictions if total_predictions > 0 else 0

print(f"Random Forest Accuracy: {rf_accuracy * 100:.2f}%")
print("Random Forest Confusion Matrix:")
rf_predictions.groupBy("label", "prediction").count().show()

--- Training Decision Tree Model ---
Decision Tree Accuracy: 60.54%
Decision Tree Confusion Matrix:
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0|  758|
|    0|       0.0| 1163|
+-----+----------+-----+


--- Training Random Forest Model ---
Random Forest Accuracy: 60.54%
Random Forest Confusion Matrix:
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0|  758|
|    0|       0.0| 1163|
+-----+----------+-----+



In [7]:
from pyspark.sql.functions import avg, concat_ws, col

# --- 1. Engineer a New Feature: Average Seats per Route ---

# First, create a 'route' and 'total_seats' column on our main df_transformed DataFrame
df_with_seats = df_transformed.withColumn(
    "route", 
    concat_ws("-", col("DEPARTURE_STATION_CD"), col("ARRIVAL_STATION_CD"))
).withColumn(
    "total_seats", 
    col("FIRST_CLASS_SEATS") + col("BUSINESS_CLASS_SEATS") + col("ECONOMY_SEATS")
)

# Now, calculate the average seats for each route from this new DataFrame
avg_seats_per_route = df_with_seats.groupBy("route").agg(avg("total_seats").alias("avg_seats_for_route"))

# --- THIS IS THE CORRECTED PART ---
# Perform a simpler, more robust join using the 'route' column which now exists in both DataFrames
df_with_new_feature = df_with_seats.join(avg_seats_per_route, on="route")

# Create the label for prediction
df_with_label_final = df_with_new_feature.withColumn("label", (col("HAUL") == "LONG").cast("integer"))


# --- 2. Retrain the Models with the New Feature ---
categorical_cols = ["TIME_OF_DAY"] 
numerical_cols = ["day_of_week", "avg_seats_for_route"] # <-- NEW FEATURE ADDED

indexers = [StringIndexer(inputCol=c, outputCol=f"{c}_index", handleInvalid="keep") for c in categorical_cols]
assembler_inputs = [f"{c}_index" for c in categorical_cols] + numerical_cols
assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="features")

pipeline = Pipeline(stages=indexers + [assembler])
ml_ready_data_new = pipeline.fit(df_with_label_final).transform(df_with_label_final)

# Split the new data and retrain our Random Forest model
(training_data_new, testing_data_new) = ml_ready_data_new.randomSplit([0.8, 0.2], seed=42)
total_predictions_new = testing_data_new.count()

rf_new = RandomForestClassifier(featuresCol='features', labelCol='label')
rf_model_new = rf_new.fit(training_data_new)
rf_predictions_new = rf_model_new.transform(testing_data_new)

rf_correct_new = rf_predictions_new.filter("label == prediction").count()
rf_accuracy_new = rf_correct_new / total_predictions_new if total_predictions_new > 0 else 0

print("--- Random Forest Performance with NEW Feature ---")
print(f"New Accuracy: {rf_accuracy_new * 100:.2f}%")
print("New Confusion Matrix:")
rf_predictions_new.groupBy("label", "prediction").count().show()

--- Random Forest Performance with NEW Feature ---
New Accuracy: 100.00%
New Confusion Matrix:
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    0|       0.0| 1116|
|    1|       1.0|  805|
+-----+----------+-----+

