In [1]:
# Importing the required libraries.
import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.models import load_model

In [2]:
# Reading the cleaned and processed data file.
df = pd.read_csv("./cleaned_data.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,date,airline,flight_code,source_city,time_taken,stop,destinate_city,price,Class,dep_time_category,arr_time_category
0,0,2022-02-11,SpiceJet,SG-8709,Delhi,130,non-stop,Mumbai,5953,Economy,Evening,Night
1,1,2022-02-11,SpiceJet,SG-8157,Delhi,140,non-stop,Mumbai,5953,Economy,Early Morning,Morning
2,2,2022-02-11,AirAsia,I5-764,Delhi,130,non-stop,Mumbai,5956,Economy,Early Morning,Early Morning
3,3,2022-02-11,Vistara,UK-995,Delhi,135,non-stop,Mumbai,5955,Economy,Morning,Afternoon
4,4,2022-02-11,Vistara,UK-963,Delhi,140,non-stop,Mumbai,5955,Economy,Morning,Morning


In [3]:
# Removing the first column as this column is not useful for us in further processing.
df = df.drop(columns=df.columns[0], axis=1)
df.head()

Unnamed: 0,date,airline,flight_code,source_city,time_taken,stop,destinate_city,price,Class,dep_time_category,arr_time_category
0,2022-02-11,SpiceJet,SG-8709,Delhi,130,non-stop,Mumbai,5953,Economy,Evening,Night
1,2022-02-11,SpiceJet,SG-8157,Delhi,140,non-stop,Mumbai,5953,Economy,Early Morning,Morning
2,2022-02-11,AirAsia,I5-764,Delhi,130,non-stop,Mumbai,5956,Economy,Early Morning,Early Morning
3,2022-02-11,Vistara,UK-995,Delhi,135,non-stop,Mumbai,5955,Economy,Morning,Afternoon
4,2022-02-11,Vistara,UK-963,Delhi,140,non-stop,Mumbai,5955,Economy,Morning,Morning


### LSTM (Long-Short Time Memory)

The reason we chose LSTM is because the LSTM network is designed to capture and remember long-term dependencies in sequential data (https://colah.github.io/posts/2015-08-Understanding-LSTMs/).

Our project aims to predict future airfare, meaning that LSTM networks can learn complex patterns and relationships in historical price data. Airfare in ticket markets exhibits sequential dependencies, where the value at any given time is influenced by previous values. LSTM networks are inherently suited to handle sequential data due to their recurrent nature, allowing them to process data points in sequence while retaining memory of past observations.

Since LSTM networks expect input data to be in the form of sequences, each feature would be represented as a sequence of values over time. In this case, we will choose only the date as input.

In [4]:
# Preprocessing the dataset to make it compatible with LSTM input requirements.
df.set_index('date', inplace=True)
df.sort_index(inplace=True)

new_df = pd.DataFrame({'date': df.index, 'price': df['price']}).reset_index(drop=True)

In [5]:
# Normalizing the dataset.
database = new_df.values

# Dividing the dataset into Training and Testing data.
train_data = database[0:240000, :]
valid_data = database[240000:, :]

new_df.index = new_df.date
new_df.drop('date', axis=1, inplace=True)

# Using the MinMaxScaler to scale and normalise the values.
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data = scaler.fit_transform(database[:, 1:])

x_train_data, y_train_data = [], []

# Can change it to 60.
for i in range(100, len(train_data)):
    x_train_data.append(scaled_data[i-100:i, 0])
    y_train_data.append(scaled_data[i, 0])

x_train_data, y_train_data = np.array(x_train_data), np.array(y_train_data)
x_train_data=np.reshape(x_train_data, (x_train_data.shape[0], x_train_data.shape[1], 1))
y_train_data = np.array(y_train_data)

In [6]:
# Building the LSTM model
LSTM_model = Sequential()
LSTM_model.add(LSTM(units=50, return_sequences=True, input_shape=(x_train_data.shape[1],1)))
LSTM_model.add(LSTM(units=50))
LSTM_model.add(Dense(1))

input_data = new_df[len(new_df) - len(valid_data)-100:].values
input_data = scaler.transform(input_data)

LSTM_model.compile(loss = 'mean_squared_error', optimizer='adam')
LSTM_model.fit(x_train_data, y_train_data, epochs=10, batch_size=64, verbose=1)

Epoch 1/10


  super().__init__(**kwargs)


[1m3749/3749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m123s[0m 33ms/step - loss: 0.0136
Epoch 2/10
[1m3749/3749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m269s[0m 72ms/step - loss: 0.0110
Epoch 3/10
[1m3749/3749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m126s[0m 34ms/step - loss: 0.0112
Epoch 4/10
[1m3749/3749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m128s[0m 34ms/step - loss: 0.0111
Epoch 5/10
[1m3749/3749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m126s[0m 34ms/step - loss: 0.0109
Epoch 6/10
[1m3749/3749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m126s[0m 34ms/step - loss: 0.0109
Epoch 7/10
[1m3749/3749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 34ms/step - loss: 0.0109
Epoch 8/10
[1m3749/3749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m128s[0m 34ms/step - loss: 0.0108
Epoch 9/10
[1m3749/3749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m128s[0m 34ms/step - loss: 0.0107
Epoch 10/10
[1m3749/3749[0m [32m━━━━━━━━━━━━━━━━━━━━[

<keras.src.callbacks.history.History at 0x288533b10>

In [7]:
# Testing the model
x_test = []
y_test = new_df['price'][240000:]
for i in range(100, input_data.shape[0]):
    x_test.append(input_data[i-100:i, 0])

x_test = np.array(x_test)

x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))

y_pred = LSTM_model.predict(x_test)
y_pred = scaler.inverse_transform(y_pred)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Evaluating our model.
print('MAE:', mean_absolute_error(y_test, y_pred))
print('MSE:', mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(mean_squared_error(y_test, y_pred)))
print('R2:', r2_score(y_test,y_pred))

[1m1842/1842[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step
MAE: 4404.384601680672
MSE: 117265391.00660688
RMSE: 10828.914581185267
R2: 0.7736096132887911
