In [1]:
# Supress warnings:
import warnings
warnings.filterwarnings("ignore")

In [2]:
#loading data
import pandas as pd
from pathlib import Path

#import NumPy library
import numpy as np

file_path = Path("airfareData/Data_Train.xlsx")
airfare = pd.read_excel(file_path)

airfare.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302


In [3]:
#The dependent variable for this dataset is the airfare price, which is the variable we want to predict.

# Is 'Route' information is already captured by the 'Total_Stops' variable??

columns_to_drop = ['Source', 'Destination','Additional_Info','Route']

airfare.drop(columns=columns_to_drop, axis=1, inplace=True)
airfare.head()

Unnamed: 0,Airline,Date_of_Journey,Dep_Time,Arrival_Time,Duration,Total_Stops,Price
0,IndiGo,24/03/2019,22:20,01:10 22 Mar,2h 50m,non-stop,3897
1,Air India,1/05/2019,05:50,13:15,7h 25m,2 stops,7662
2,Jet Airways,9/06/2019,09:25,04:25 10 Jun,19h,2 stops,13882
3,IndiGo,12/05/2019,18:05,23:30,5h 25m,1 stop,6218
4,IndiGo,01/03/2019,16:50,21:35,4h 45m,1 stop,13302


In [4]:
#replacing categories with numerical labels for algorithms
from sklearn.preprocessing import LabelEncoder


le = LabelEncoder()
airfare = airfare.apply(le.fit_transform)


#step 7: checking for null values
airfare.isnull().sum()

Airline            0
Date_of_Journey    0
Dep_Time           0
Arrival_Time       0
Duration           0
Total_Stops        0
Price              0
dtype: int64

In [5]:
#data partition by PRICE (Dependent variable)
X = airfare.drop(['Price'], axis=1)
Y = airfare['Price']

# Taking 80% of House data as training set, and remaining 20% as test set.
X_train = np.array(X[0:int(0.80*len(X))])
Y_train = np.array(Y[0:int(0.80*len(Y))])
X_test = np.array(X[int(0.80*len(X)):])
Y_test = np.array(Y[int(0.80*len(Y)):])
len(X_train), len(Y_train), len(X_test), len(Y_test)

print("The size of training input is", X_train.shape)
print("The size of training output is", Y_train.shape)
print(50 *'*')
print("The size of testing input is", X_test.shape)
print("The size of testing output is", Y_test.shape)

The size of training input is (8546, 6)
The size of training output is (8546,)
**************************************************
The size of testing input is (2137, 6)
The size of testing output is (2137,)


In [6]:
#measuring initial execution time
import time
start_time = time.time()


#importing algorithms
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

#initializing variables
LSVM = LinearSVC()
NLSVM = SVC(kernel='rbf')
RF = RandomForestClassifier()

# Training data on Data Set
LSVM_fit = LSVM.fit(X_train, Y_train)
NLSVM_fit = NLSVM.fit(X_train, Y_train)
RF_fit = RF.fit(X_train, Y_train)


# Predicting on test data set
LSVM_pred = LSVM_fit.predict(X_test)
NLSVM_pred = NLSVM_fit.predict(X_test)
RF_pred = RF_fit.predict(X_test)

#printing accuracy of the prediction
from sklearn.metrics import accuracy_score

print("Linear SVMs is %f percent accurate" % (accuracy_score(LSVM_pred, Y_test)*100))
print("Non Linear SVMs is %f percent accurate" % (accuracy_score(NLSVM_pred, Y_test)*100))
print("Random Forests is %f percent accurate" % (accuracy_score(RF_pred, Y_test)*100))

#measuring final execution time
end_time = time.time()

elapsed_time = end_time - start_time
elapsed_minutes = elapsed_time / 60

print("Elapsed time: %.2f minutes" % elapsed_minutes)

Linear SVMs is 0.046795 percent accurate
Non Linear SVMs is 3.977539 percent accurate
Random Forests is 25.409453 percent accurate
Elapsed time: 3.53 minutes


In [7]:
#measuring initial execution time
import time
start_time = time.time()


# CROSS VALIDATION K FOLD LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
seed=0
cv = KFold(n_splits=5,random_state=2, shuffle=True)

def return_score(model,X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    return score

scores=[]

model = LinearSVC()
for train_index, test_index in cv.split(X,Y):
    X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size=0.2)
    score = return_score(model,X_train, X_test, y_train, y_test)
    scores.append(score)
print("Accuracy score in each iteration: {}".format(scores))
print("K-Fold Score: {}".format(np.mean(scores)))


# CROSS VALIDATION StratifiedKFold LinearSVC()
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
cv = StratifiedKFold(n_splits=5,random_state=2, shuffle=True)

def return_score(model,X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    return score

scores=[]

#printingStratefiedKFold
model = LinearSVC()
for train_index, test_index in cv.split(X,Y):
    X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size=0.2)
    score = return_score(model,X_train, X_test, y_train, y_test)
    scores.append(score)
print("Accuracy score in each iteration: {}".format(scores))
print("Stratified K-Fold Score: {}".format(np.mean(scores)))

#measuring final execution time
end_time = time.time()

elapsed_time = end_time - start_time
elapsed_minutes = elapsed_time / 60

print("Elapsed time: %.2f minutes" % elapsed_minutes)

Accuracy score in each iteration: [0.0009358914365933552, 0.0028076743097800653, 0.0009358914365933552, 0.0014038371548900327, 0.0009358914365933552]
K-Fold Score: 0.0014038371548900329
Accuracy score in each iteration: [0.0004679457182966776, 0.0004679457182966776, 0.0014038371548900327, 0.0004679457182966776, 0.0018717828731867104]
Stratified K-Fold Score: 0.0009358914365933552
Elapsed time: 5.61 minutes


In [8]:
#measuring initial execution time
import time
start_time = time.time()


# CROSS VALIDATION K FOLD SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
seed=0
cv = KFold(n_splits=5,random_state=2, shuffle=True)

def return_score(model,X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    return score

scores=[]

model = SVC(kernel='rbf')
for train_index, test_index in cv.split(X,Y):
    X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size=0.2)
    score = return_score(model,X_train, X_test, y_train, y_test)
    scores.append(score)
print("Accuracy score in each iteration: {}".format(scores))
print("K-Fold Score: {}".format(np.mean(scores)))


# CROSS VALIDATION StratifiedKFold SVC(kernel='rbf')
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
cv = StratifiedKFold(n_splits=5,random_state=2, shuffle=True)

def return_score(model,X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    return score

scores=[]

#printingStratefiedKFold
model = SVC(kernel='rbf')
for train_index, test_index in cv.split(X,Y):
    X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size=0.2)
    score = return_score(model,X_train, X_test, y_train, y_test)
    scores.append(score)
print("Accuracy score in each iteration: {}".format(scores))
print("Stratified K-Fold Score: {}".format(np.mean(scores)))

#measuring final execution time
end_time = time.time()

elapsed_time = end_time - start_time
elapsed_minutes = elapsed_time / 60

print("Elapsed time: %.2f minutes" % elapsed_minutes)

Accuracy score in each iteration: [0.035563874590547495, 0.04211511464670098, 0.039307440336920914, 0.04211511464670098, 0.039307440336920914]
K-Fold Score: 0.03968179691155825
Accuracy score in each iteration: [0.034160037435657466, 0.03883949461862424, 0.03509592887225082, 0.035563874590547495, 0.031820308844174076]
Stratified K-Fold Score: 0.03509592887225082
Elapsed time: 19.63 minutes


In [9]:
#measuring initial execution time
import time
start_time = time.time()

# CROSS VALIDATION K FOLD RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
seed=0
cv = KFold(n_splits=5,random_state=2, shuffle=True)

def return_score(model,X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    return score

scores=[]

#printing Kfold
model = RandomForestClassifier()
for train_index, test_index in cv.split(X,Y):
    X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size=0.2)
    score = return_score(model,X_train, X_test, y_train, y_test)
    scores.append(score)
print("Accuracy score in each iteration: {}".format(scores))
print("K-Fold Score: {}".format(np.mean(scores)))


# CROSS VALIDATION StratifiedKFold RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
cv = StratifiedKFold(n_splits=5,random_state=2, shuffle=True)

def return_score(model,X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    return score

scores=[]

#printingStratefiedKFold
model = RandomForestClassifier()
for train_index, test_index in cv.split(X,Y):
    X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size=0.2)
    score = return_score(model,X_train, X_test, y_train, y_test)
    scores.append(score)
print("Accuracy score in each iteration: {}".format(scores))
print("Stratified K-Fold Score: {}".format(np.mean(scores)))

#measuring final execution time
end_time = time.time()

elapsed_time = end_time - start_time
elapsed_minutes = elapsed_time / 60

print("Elapsed time: %.2f minutes" % elapsed_minutes)

Accuracy score in each iteration: [0.2419279363593823, 0.2274216190921853, 0.24145999064108564, 0.24520355638745905, 0.24941506785212916]
K-Fold Score: 0.2410856340664483
Accuracy score in each iteration: [0.23958820776789894, 0.2526906878802059, 0.24520355638745905, 0.2545624707533926, 0.24941506785212916]
Stratified K-Fold Score: 0.24829199812821715
Elapsed time: 3.76 minutes


In [10]:
#REFLECTION

# The dataset contained information about airline ticket prices along with various attributes such as the date of journey, source, destination, and other features that could potentially influence the price of the ticket.
#
# Here are some insights and observations from this dataset:
#
# The dataset contains 10683 rows and 11 columns.

# The 'Airline' column has the highest number of unique values (12), followed by 'Source' and 'Destination' with 5 and 6 unique values, respectively.

# The majority of the flights in the dataset have a 'Route' with a single stop.

# After dropping columns "unwanted" columns, the dataset did not contain null values

# The 'Price' column is the target variable that needs to be predicted in a machine learning model.

# Columns such as 'Airline', 'Source', 'Destination', and 'Total_Stops', have a strong correlation with the ticket price.

# Also there is a significant variation in ticket prices, with the minimum price being around 1750 rupees and the maximum price being over 60,000 rupees.



# In terms of modeling, both SVM and RandomForest algorithms were used to predict the ticket price. Of course, the accuracy of the model depends on various factors, such as the quality of the data, the chosen algorithm...amongst others.


# Based on the outcomes, it seems like there is an issue with the accuracy scores for the SVM models. The accuracy score of 3.9775 percent or 0.0467 percent is HIGHLY unlikely and probably indicates a potential problem with the model (or data).
#
# Assuming that the accuracy scores for the SVM models are not accurate and highly unlikely, we can compare the accuracy scores for the Random Forest models. The Random Forest model has an accuracy score of 25.409453 percent, indicating that it has the highest accuracy among the three models. Based on the accuracy scores, we see that the random forest model's performance varies slightly across different iterations. Also, the K-Fold score and Stratified K-Fold score are quite similar,which indicates that the model is performing consistently across different folds and is not overfitting to any particular set of data. Overall, I think it is important to note that an accuracy score of 25.409453 percent is relatively low and may indicate that the models need further improvement or require a different model (or that maybe the dataset is difficult to predict accurately.

