In [22]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

# Import the dataset
data = pd.read_csv("flight_delay.csv")

# Feature selection
X = data[['ORIGIN_AIRPORT_ID', 'ORIGIN', 'DEST', 'DEP_TIME']]
y = data['DEP_DEL15']

# Encode categorical features
encoder_origin = LabelEncoder()
encoder_dest = LabelEncoder()
X['ORIGIN'] = encoder_origin.fit_transform(X['ORIGIN'])
X['DEST'] = encoder_dest.fit_transform(X['DEST'])

# Handle missing values
X['DEP_TIME'].fillna(X['DEP_TIME'].mean(), inplace=True)
y.fillna(0, inplace=True)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_y_pred = rf_model.predict(X_test)

# Evaluate Random Forest
print("=== Random Forest Results ===")
print("Accuracy with Random Forest:", accuracy_score(y_test, rf_y_pred))
print("Confusion Matrix (RF):\n", confusion_matrix(y_test, rf_y_pred))
print("Classification Report (RF):\n", classification_report(y_test, rf_y_pred))

# XGBoost Classifier
xgb_model = XGBClassifier(eval_metric='logloss', random_state=42)
xgb_model.fit(X_train, y_train)
xgb_y_pred = xgb_model.predict(X_test)

# Evaluate XGBoost
print("\n=== XGBoost Results ===")
print("Accuracy with XGBoost:", accuracy_score(y_test, xgb_y_pred))
print("Confusion Matrix (XGB):\n", confusion_matrix(y_test, xgb_y_pred))
print("Classification Report (XGB):\n", classification_report(y_test, xgb_y_pred))

# Function to add unseen categories to the encoder
def add_unseen_category(encoder, value):
    if value not in encoder.classes_:
        encoder.classes_ = np.append(encoder.classes_, value)

# Function to predict using both models and compare
def predict_flight_delay():
    # User inputs
    print("\nEnter Flight Details for Prediction:")
    origin = input("Enter Origin Airport Code (e.g., SFO): ").strip()
    dest = input("Enter Destination Airport Code (e.g., LAX): ").strip()
    dep_time = float(input("Enter Departure Time (in minutes, e.g., 1530): "))
    origin_airport_id = int(input("Enter Origin Airport ID (e.g., 12345): "))
    
    # Add unseen values to the encoder
    add_unseen_category(encoder_origin, origin)
    add_unseen_category(encoder_dest, dest)
    
    # Encode input features
    origin_encoded = encoder_origin.transform([origin])[0]
    dest_encoded = encoder_dest.transform([dest])[0]
    dep_time_filled = dep_time if dep_time is not None else X['DEP_TIME'].mean()
    
    # Create input data
    input_data = pd.DataFrame({
        'ORIGIN_AIRPORT_ID': [origin_airport_id],
        'ORIGIN': [origin_encoded],
        'DEST': [dest_encoded],
        'DEP_TIME': [dep_time_filled]
    })
    
    # Predict with Random Forest
    rf_prediction = rf_model.predict(input_data)[0]
    rf_result = "Delayed" if rf_prediction == 1 else "Not Delayed"
    
    # Predict with XGBoost
    xgb_prediction = xgb_model.predict(input_data)[0]
    xgb_result = "Delayed" if xgb_prediction == 1 else "Not Delayed"
    
    # Print predictions
    print("\n=== Prediction Results ===")
    print(f"Random Forest Prediction: {rf_result}")
    print(f"XGBoost Prediction: {xgb_result}")

# Run the prediction function
predict_flight_delay()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['ORIGIN'] = encoder_origin.fit_transform(X['ORIGIN'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['DEST'] = encoder_dest.fit_transform(X['DEST'])
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the o

=== Random Forest Results ===
Accuracy with Random Forest: 0.8928482752125483
Confusion Matrix (RF):
 [[93346  3642]
 [ 8873 10936]]
Classification Report (RF):
               precision    recall  f1-score   support

         0.0       0.91      0.96      0.94     96988
         1.0       0.75      0.55      0.64     19809

    accuracy                           0.89    116797
   macro avg       0.83      0.76      0.79    116797
weighted avg       0.89      0.89      0.89    116797


=== XGBoost Results ===
Accuracy with XGBoost: 0.8416397681447297
Confusion Matrix (XGB):
 [[96254   734]
 [17762  2047]]
Classification Report (XGB):
               precision    recall  f1-score   support

         0.0       0.84      0.99      0.91     96988
         1.0       0.74      0.10      0.18     19809

    accuracy                           0.84    116797
   macro avg       0.79      0.55      0.55    116797
weighted avg       0.83      0.84      0.79    116797


Enter Flight Details for Predi

Enter Origin Airport Code (e.g., SFO):  LRW
Enter Destination Airport Code (e.g., LAX):  MLP
Enter Departure Time (in minutes, e.g., 1530):  2503
Enter Origin Airport ID (e.g., 12345):  12654



=== Prediction Results ===
Random Forest Prediction: Delayed
XGBoost Prediction: Delayed
