<a href="https://colab.research.google.com/github/yash056-tech2004/yash09/blob/main/Airline_delay.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 💼 Business Context
Flight delays can significantly affect time-sensitive operations, especially for businesses dependent on air logistics. With thousands of records containing delay types (carrier, weather, NAS, etc.) and cancellation/diversion information, there is a strong opportunity to:

Identify key drivers of delays,

Predict expected delay durations (regression),

Classify flights as "delay-prone" or "on-time" (classification),

Optimize scheduling and contingency planning for logistics operations.



# 🧩 Problem Statement
Flight delays disrupt airline operations, affect passenger satisfaction, and cause significant financial losses. This project aims to analyze historical airline delay data to identify key factors contributing to delays, such as weather, carrier issues, and airspace congestion.

The goal is to build machine learning models that:

Predict total delay duration for a flight (regression), and

Classify whether a flight will be delayed (classification).

These insights will help airlines and partners improve scheduling, reduce disruptions, and enhance operational efficiency.

# Observation
* 1 Importing useful libraries.


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the airline dataset from a path relative to the working directory.
df=pd.read_csv("Airlines.csv")
# Preview the first rows of the loaded frame.
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'Airlines.csv'

# Observations:
* The data set has 9 columns.
* The data set has 62949 rows.

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Column dtypes and non-null counts.
df.info()

# Observation
* 1 Rows: 62,949 total, 1 missing in most columns.
* 2 Id: Just a unique identifier.
* 3 Flight: Many unique flights (1–7813), high variance.
* 4 DayOfWeek: Values 3–6 only (mid-week data).
* 5 Time: Spread across the day, median around 12:30 PM.
* 6 Length: Flights last 24–655 mins, average ~130 mins.

In [None]:
# Summary statistics of the frame's columns.
df.describe()

# Observations:
* Columns id, Airline, Flight, AirportFrom have no null values.
* Columns AirportTo, DayOfWeek, Time, Length, Delay each have 1 null value.

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Column labels of the frame.
df.columns

In [None]:
# Count of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

# Observations:
* WN airline has the highest number of flights.
* ATL airport has the max number of flights leaving.
* ATL is also the destination for max number of flights.

In [None]:
# Print the frequency table of every object-typed column, separated by a dotted line.
categorical_column = df.select_dtypes("object").columns
for col_name in categorical_column:
    frequencies = df[col_name].value_counts()
    print(frequencies)
    print(".............................")

# Observations:
* Each flight has its own unique id.
* Flight 16 has the highest value count.
* The maximum number of flights fly on the 5th day of the week.
* Flights with a length of 80 are the most numerous.
* The flights having no delay are MORE than the flights having delay.

In [None]:
# Print the frequency table of every int/float column, separated by a dotted line.
numerical_column = df.select_dtypes(["int", "float"]).columns
for col_name in numerical_column:
    frequencies = df[col_name].value_counts()
    print(frequencies)
    print(".............................")

In [None]:
# Draw one box plot per int/float column, each on its own 5x4 figure.
numerical_column = df.select_dtypes(["int", "float"]).columns
for col_name in numerical_column:
    plt.figure(figsize=(5, 4))
    column_values = df[col_name]
    sns.boxplot(column_values)
    plt.show()

In [None]:
# Report the 1.5*IQR lower/upper fences for every numeric column.
for col_name in numerical_column:
    q1, q3 = df[col_name].quantile(0.25), df[col_name].quantile(0.75)
    iqr = q3 - q1
    lower_limit = q1 - 1.5 * iqr
    upper_limit = q3 + 1.5 * iqr
    print("LowerLimit:", lower_limit, "  UpperLimit:", upper_limit)
    print("...............................................")

In [None]:
# Count plot of every object-typed column; x labels are rotated to stay readable.
categorical_column = df.select_dtypes("object").columns
for col_name in categorical_column:
    plt.figure(figsize=(4, 4))
    sns.countplot(x=col_name, data=df)
    plt.xticks(rotation=90)
    plt.show()

# Observations:
* The following bar graphs show that WN has the highest number of flights (over 10k).
* Most number of flights are leaving from ATL.
* Maximum number of flights have ATL as the destination.
* No further insights can be drawn from the AirportFrom and AirportTo columns.

In [None]:
# Labels of the first ten entries of the AirportFrom frequency table
# (value_counts orders by descending count by default).
df['AirportFrom'].value_counts().keys()[0:10]

In [None]:
# Frequency of each departure airport, then the ten most frequent ones.
airport_from_counts = df['AirportFrom'].value_counts()
top_10_airports = airport_from_counts.iloc[:10]
print("Top 10 AirportFrom counts:", top_10_airports, sep="\n")

In [None]:
# Bar chart of the ten most frequent departure airports; the frequency table is computed once.
origin_counts = df['AirportFrom'].value_counts()
plt.bar(origin_counts.keys()[0:10], origin_counts.values[0:10])

In [None]:
# Scatter plot of every numeric column against Time, one figure per column.
for col_name in numerical_column:
    plt.figure(figsize=(5, 3))
    sns.scatterplot(data=df, x=col_name, y="Time")
    # Rotate the tick labels BEFORE showing: after plt.show() the figure is
    # closed, so a later plt.xticks() would act on a new, empty figure and a
    # second plt.show() would render that blank figure.
    plt.xticks(rotation=60)
    plt.show()

In [None]:
# Pairwise correlation matrix of the numeric columns (pandas default method).
correlation=df[numerical_column].corr()

# Observations:
* Flight and Time have the lowest positive correlation.
* Flight and Length have lowest correlation among all.
* DayOfWeek and id have the highest correlation.

In [None]:
# Annotated heat map of the correlation matrix on an 8x8 figure.
plt.figure(figsize=(8, 8))
sns.heatmap(data=correlation, annot=True)

In [None]:
# Bar plot of Time aggregated per category (seaborn's default estimator),
# one figure per categorical column, drawn without error bars.
for col_name in categorical_column:
    plt.figure(figsize=(5, 3))
    sns.barplot(x=col_name, y='Time', data=df, errorbar=None)
    plt.xticks(rotation=60)
    plt.show()

# Observations:
* Except for WN every Airline has more non-delayed flights than delayed.
* DL has the highest number of non-delayed flights.
* WN has the highest number of delayed flights.

In [None]:
# Count of flights per category, split into delayed vs non-delayed.
# The hue is the Delay column: the adjacent observations compare delayed and
# non-delayed counts per airline. Using 'Time' (a near-continuous column) as
# the hue would create one bar colour per distinct departure time, which is
# unreadable and very slow to draw.
for col_name in categorical_column:
    plt.figure(figsize=(10, 6))
    sns.countplot(data=df, x=col_name, hue='Delay')
    plt.xticks(rotation=45)
    plt.show()

In [None]:
# Work on a copy so the originally loaded frame stays untouched,
# then split the copy into its object-typed and int/float-typed columns.
df_copy = df.copy()
cat_variables = df_copy.select_dtypes(include='object')
num_variables = df_copy.select_dtypes(include=['int', 'float'])

In [None]:
# Labels of the object-typed columns.
cat_variables.columns

In [None]:
# Labels of the int/float-typed columns.
num_variables.columns

In [None]:
# Remove exact duplicate rows from the working copy and report the row count before and after.
rows_before = len(df_copy)
print(" No. of rows before dropping duplicates :", rows_before)
df_copy.drop_duplicates(inplace=True)
rows_after = len(df_copy)
print(" No. of rows after dropping duplicates :", rows_after)

In [None]:
# NOTE(review): this cell repeats the previous cell verbatim. Duplicates were
# already removed there, so both counts printed here are expected to be equal.
print(" No. of rows before dropping duplicates :", df_copy.shape[0])
df_copy.drop_duplicates(inplace=True)
print(" No. of rows after dropping duplicates :", df_copy.shape[0])

In [None]:
# Missing values per column in the de-duplicated working copy. The check is
# on df_copy (the frame that is saved and modelled next), not the untouched
# original df.
df_copy.isnull().sum()

In [None]:
# Persist the de-duplicated working copy, without the index column.
df_copy.to_csv('df_copy1.csv', index=False)

In [None]:
# Horizontal box plot for each int/float column of the working copy.
numeric_labels = df_copy.select_dtypes(['int', 'float']).columns
for col_name in numeric_labels:
    plt.figure(figsize=(5, 2))
    sns.boxplot(x=col_name, data=df_copy);

In [None]:
# Outlier fences for Length using the interquartile-range rule.
length_quartiles = df_copy['Length'].quantile([0.25, 0.75])
Q1 = length_quartiles.loc[0.25]
Q3 = length_quartiles.loc[0.75]
IQR = Q3 - Q1
# Anything at or beyond 1.5 * IQR from the quartiles is treated as an outlier below.
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR
(lower, upper)

In [None]:
# Display the rows whose Length lies on or outside the IQR fences.
# Nothing is removed here; the selection is only shown.
df_copy[(df_copy['Length'] <= lower) | (df_copy['Length'] >= upper)]

In [None]:
# 99th percentile of Length.
df_copy['Length'].quantile(0.99)

In [None]:
# Summary statistics of Length before the outlier replacement.
df_copy['Length'].describe()

In [None]:
# Replace Length values on or below the lower fence with the current median.
length_median = df_copy['Length'].median()
df_copy['Length'] = np.where(df_copy['Length'] <= lower, length_median, df_copy['Length'])
# The median is recomputed on the updated column before treating the upper side.
length_median = df_copy['Length'].median()
df_copy['Length'] = np.where(df_copy['Length'] >= upper, length_median, df_copy['Length'])

In [None]:
 # Re-run the same selection to see whether any rows still lie on or outside the fences.
 df_copy[(df_copy['Length'] <= lower) | (df_copy['Length'] >= upper)]

In [None]:
# Replace every Length above the hard-coded threshold of 300 with the column median.
length_values = df_copy['Length']
df_copy['Length'] = np.where(length_values > 300, length_values.median(), length_values)

In [None]:
# Box plot of Length in the working copy at this point in the notebook.
sns.boxplot(data=df_copy, x='Length')
# plt.xticks(np.arange(20,120,50));

#### Separating Features and Target

In [None]:
# 'Time' is the target; every other column of the working copy is a candidate feature.
y = df_copy['Time']
X = df_copy.drop(columns='Time')

In [None]:
# Preview of the feature frame.
X.head()

In [None]:
# Display the target series.
y

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for validation; the fixed random_state makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(X, y,test_size=0.2,random_state=20)

In [None]:
# Preview of the one-hot encoding of the three categorical columns as 0/1 integers.
# The result is only displayed, not stored.
pd.get_dummies(cat_variables[['Airline', 'AirportFrom', 'AirportTo']],dtype='int')

In [None]:
# Re-derive the int/float subset from the current working copy, so it reflects
# the changes made to df_copy since num_variables was first created.
num_variables=df_copy.select_dtypes(['int','float'])

In [None]:
# for i in ['Airline', 'AirportFrom', 'AirportTo']:
    # print(pd.get_dummies(cat_variables[i],dtype='int'))

In [None]:
# First two rows of the categorical subset.
cat_variables.head(2)

In [None]:
# One-hot encode the three categorical columns into 0/1 integer indicator columns.
columns_to_encode = ['Airline', 'AirportFrom', 'AirportTo']
cat_variables_encoded = pd.get_dummies(cat_variables, columns=columns_to_encode, dtype=int)

In [None]:
# Preview of the one-hot encoded categorical columns.
cat_variables_encoded.head()

In [None]:
# Display the numeric subset.
num_variables


In [None]:
# Combine the encoded categorical columns with the numeric columns, column-wise.
# cat_variables (and therefore cat_variables_encoded) was taken before
# duplicates were dropped from df_copy, while num_variables was re-derived
# afterwards. Reindexing to num_variables.index keeps exactly the rows that
# still exist in df_copy, in the same order, so no all-NaN numeric rows are
# introduced. When no duplicates were dropped this is a no-op.
df_encoded = pd.concat(
    [cat_variables_encoded.reindex(num_variables.index), num_variables],
    axis=1,
)

In [None]:
# Persist the encoded frame (file name has no extension), without the index column.
df_encoded.to_csv('df_encoded', index=False)

In [None]:
# Categorical feature columns (one-hot encoded later).
cat_cols = ['Airline', 'AirportFrom', 'AirportTo']
# Numeric feature columns to scale. 'Time' is deliberately NOT listed: it is
# the target and was dropped from X, so selecting it from X_train / X_val
# raises a KeyError.
num_cols = ['Flight', 'DayOfWeek', 'Length']

In [None]:
# 1. Min-Max Scaling
from sklearn.preprocessing import MinMaxScaler
MMscaler = MinMaxScaler()
# Learn the per-column min/max from the training split only, then apply the
# same transformation to both splits.
MMscaler.fit(X_train[num_cols])
Xtrain_scaled_MinMax = MMscaler.transform(X_train[num_cols])
Xval_scaled_MinMax = MMscaler.transform(X_val[num_cols])

In [None]:
# Wrap the scaled arrays back into DataFrames labelled with the numeric column names.
Xtrain_scaled_MinMax = pd.DataFrame(data=Xtrain_scaled_MinMax, columns=X_train[num_cols].columns)
Xval_scaled_MinMax = pd.DataFrame(data=Xval_scaled_MinMax, columns=X_val[num_cols].columns)

In [None]:
# First two rows of the min-max scaled training features.
Xtrain_scaled_MinMax.head(2)

In [None]:
# First two rows of the unscaled training features, for comparison.
X_train.head(2)

In [None]:
# (30-X_val['Flight'].min())/(X_val['Flight'].max()-X_val['Flight'].min())

In [None]:
# Summary statistics of the min-max scaled training features.
Xtrain_scaled_MinMax.describe()

In [None]:
# 2. z-score Scaling (standardization)
from sklearn.preprocessing import StandardScaler
SSscaler = StandardScaler()
# Learn the per-column mean/std from the training split only, then apply the
# same transformation to both splits.
SSscaler.fit(X_train[num_cols])
Xtrain_scaled_StandardScalar = SSscaler.transform(X_train[num_cols])
Xval_scaled_StandardScalar = SSscaler.transform(X_val[num_cols])

In [None]:
# Wrap the standardized arrays back into DataFrames labelled with the numeric column names.
Xtrain_scaled_StandardScalar = pd.DataFrame(data=Xtrain_scaled_StandardScalar, columns=X_train[num_cols].columns)
Xval_scaled_StandardScalar = pd.DataFrame(data=Xval_scaled_StandardScalar, columns=X_val[num_cols].columns)

In [None]:
# Summary statistics of the standardized training features, rounded to two decimals.
Xtrain_scaled_StandardScalar.describe().round(2)

In [None]:
# First two rows of the unscaled numeric training features.
X_train[num_cols].head(2)

In [None]:
# First two rows of the standardized training features.
Xtrain_scaled_StandardScalar.head(2)

In [None]:
# Manual z-score of the Flight value 45, to compare against the scaler output.
# ddof=0 is required: StandardScaler divides by the population standard
# deviation, whereas pandas' .std() defaults to the sample standard deviation
# (ddof=1) and would give a slightly different number.
(45 - X_train['Flight'].mean()) / X_train['Flight'].std(ddof=0)

In [None]:
# Shape of the standardized training features.
Xtrain_scaled_StandardScalar.shape

In [None]:
# Shape of the categorical training columns; the row count should match the scaled features.
X_train[cat_cols].shape

In [None]:
# Categorical training columns with a fresh 0..n-1 index (old index discarded).
# The result is only displayed, not stored.
X_train[cat_cols].reset_index(drop=True)

In [None]:
# The scaled frames carry a fresh 0..n-1 index, so the categorical columns get
# the same index before the two are joined side by side.
train_categoricals = X_train[cat_cols].reset_index(drop=True)
val_categoricals = X_val[cat_cols].reset_index(drop=True)
scaled_train = pd.concat([Xtrain_scaled_StandardScalar, train_categoricals], axis=1)
scaled_test = pd.concat([Xval_scaled_StandardScalar, val_categoricals], axis=1)

In [None]:
# One-hot encode the categorical columns of each split independently,
# dropping the first level of every encoded column.
dummy_options = dict(columns=cat_cols, drop_first=True)
scaled_train = pd.get_dummies(scaled_train, **dummy_options)
scaled_test = pd.get_dummies(scaled_test, **dummy_options)

# Write both processed frames to disk without the index column.
scaled_train.to_csv('Scaled_data_train', index=False)
scaled_test.to_csv('Scaled_data_val', index=False)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
# pip install xgboost
from xgboost import XGBRegressor

In [None]:
# Reload the encoded frame that was written to disk earlier.
df_TrainTestSplit = pd.read_csv('df_encoded')

# Separate the target column ('Time') from the remaining columns.
y = df_TrainTestSplit['Time']
X = df_TrainTestSplit.drop(columns='Time')

# Train/validation split with the same test_size and random_state as the
# earlier split. NOTE(review): y_train / y_val are later paired with the
# scaled feature files by row position, which presumably relies on df_encoded
# having the same rows in the same order as the frame split earlier - confirm.
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=20, test_size=0.2)

# Load the scaled and encoded feature sets.
x_trainFinal = pd.read_csv('Scaled_data_train')
x_valFinal = pd.read_csv('Scaled_data_val')

# The two splits were one-hot encoded independently, so a category present in
# only one of them yields a column the other lacks. Align validation to train.
train_cols = x_trainFinal.columns
test_cols = x_valFinal.columns

# Columns known to train but absent from validation are added as all zeros.
missing_in_test = set(train_cols).difference(test_cols)
for column_name in missing_in_test:
    x_valFinal[column_name] = 0
# Keep exactly the training columns, in training order.
x_valFinal = x_valFinal[train_cols]

In [None]:
import numpy as np

# Impute missing values with means learned from the TRAINING split only.
# Filling the validation set with its own means would leak validation
# statistics into the evaluation and treat the two splits inconsistently.
train_feature_means = x_trainFinal.mean()
train_target_mean = y_train.mean()
x_trainFinal = x_trainFinal.fillna(train_feature_means)
x_valFinal = x_valFinal.fillna(train_feature_means)
y_train = y_train.fillna(train_target_mean)
y_val = y_val.fillna(train_target_mean)

# Building a Linear regression model
LR = LinearRegression()
LR.fit(x_trainFinal, y_train)

# Predictions on both splits.
y_train_pred = LR.predict(x_trainFinal)
y_val_pred = LR.predict(x_valFinal)

# RMSE (in the units of the target) and R^2 for each split.
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
r2_train = r2_score(y_train, y_train_pred)
rmse_val = np.sqrt(mean_squared_error(y_val, y_val_pred))
r2_val = r2_score(y_val, y_val_pred)

# The second line reports the held-out validation split.
print("Train RMSE",rmse_train,"| Train R2",r2_train)
print('Test RMSE',rmse_val,'| Test R2',r2_val)