# Importing Necessary packages

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
import time

# Loading the dataset

In [2]:
df = pd.read_csv("C:/Users/vamsi/Documents/Datasets/flights.csv", low_memory=False)

# Removing duplicate values and dropping irrelavent attributes and columns

In [3]:
df = df.drop_duplicates()
df = df.drop(['YEAR', 'DAY', 'CANCELLATION_REASON', 'AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY', 'TAIL_NUMBER'], axis=1)

In [4]:
df['DELAYED'] = (df['ARRIVAL_DELAY'] > 10).astype(int)

# Defining label encoder for encoding categorical variables

In [5]:
label_encoder = LabelEncoder()
df['AIRLINE'] = label_encoder.fit_transform(df['AIRLINE'])
df['ORIGIN_AIRPORT'] = label_encoder.fit_transform(df['ORIGIN_AIRPORT'])
df['DESTINATION_AIRPORT'] = label_encoder.fit_transform(df['DESTINATION_AIRPORT'])

In [6]:
df = df.fillna(df.mean())

# Taking fraction of dataset and drop further attributes which aren't required for classification

In [7]:
df = df.sample(frac=0.1, random_state=42)

In [8]:
X = df.drop(['ARRIVAL_DELAY', 'DELAYED'], axis=1)
y = df['DELAYED']

# Defining Random Forest Classifier with the help of GridSearchCV and choosing best estimators

In [9]:
param_grid = {'n_estimators': [50, 100, 200], 'max_depth': [5, 10, 20]}
classifier = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(classifier, param_grid=param_grid, cv=3, n_jobs=-1)
start_time = time.time()
grid_search.fit(X, y)
print(f"Training time: {time.time() - start_time:.2f}s")

Training time: 379.51s


# Training the model

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
classifier = grid_search.best_estimator_
classifier.fit(X_train, y_train)

# Evaluating the model

In [12]:
accuracy = classifier.score(X_test, y_test)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.95


# Exporting the values to a file dataset named 'test_results.csv' and saving the model for future use

In [13]:
y_pred = classifier.predict(X_test)
results_df = pd.DataFrame({"y_test": y_test, "y_pred": y_pred})
results_df.to_csv("test_results.csv", index=False)

In [14]:
import joblib

joblib.dump(classifier, "random_forest_model.pkl")

['random_forest_model.pkl']