# Airline Fare Case Study

## Data Visualization

---



---


In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
# Load the cleaned flight-fare dataset straight from the GitHub raw URL
# and preview its last rows.
csv_path = "https://raw.githubusercontent.com/nidhi440/FlightfareData/main/Clean_Dataset.csv"
df=pd.read_csv(csv_path)
df.tail()

In [None]:
# Collapse the data to one row per distinct (flight, airline) pair, then
# count how many such pairs each airline has.
df1=df.groupby(['flight','airline'],as_index=False).count()
df1.airline.value_counts()

In [None]:
# Bar chart of the number of (flight, airline) rows in df1 per airline.
plt.figure(figsize=(7,4))
# Pass the column by keyword: recent seaborn releases treat the first
# positional argument as `data`, so a bare Series is no longer read as `x`.
sns.countplot(x='airline',data=df1,palette='pastel')
plt.title('Flights Count for Different Airlines',fontsize=12)
plt.xlabel('Airline',fontsize=12)
plt.ylabel('Count',fontsize=12)
plt.show()

In [None]:
# Collapse the data to one row per distinct (flight, airline, class)
# combination, then count how many combinations fall in each class.
df2=df.groupby(['flight','airline','class'],as_index=False).count()
df2['class'].value_counts()

In [None]:
# Pie chart of the share of each travel class among the rows of df2.
class_counts=df2['class'].value_counts()

plt.figure(figsize=(7,5))
plt.pie(class_counts,autopct='%.2f%%')
plt.axis('equal')
plt.title('Classes of Different Airlines',fontsize=15)
# Take the legend labels from the counts themselves: value_counts() orders
# by frequency, so a hard-coded label list can end up attached to the
# wrong wedges.
plt.legend(class_counts.index.tolist())
plt.show()

In [None]:
# Histogram (30 bins, no density curve) of ticket prices for Economy class.
sns.set_style('whitegrid')
df3=df.groupby('class')
df4 = df3.get_group('Economy')
# histplot replaces the deprecated distplot.
sns.histplot(df4['price'], kde = False, color ='red', bins = 30);

In [None]:
# Histogram (30 bins, no density curve) of ticket prices for Business class.
sns.set_style('whitegrid')
df3=df.groupby('class')
df4 = df3.get_group('Business')
# histplot replaces the deprecated distplot.
sns.histplot(df4['price'], kde = False, color ='blue', bins = 30);

In [None]:
# Box plot of ticket price for each departure-time category.
plt.figure(figsize=(10,5))
ax = sns.boxplot(data=df, x='departure_time', y='price')
ax.set_title('Departure Time Vs Ticket Price', fontsize=20)
ax.set_xlabel('Departure Time', fontsize=15)
ax.set_ylabel('Price', fontsize=15)

In [None]:
# Line plot of ticket price against the number of days left before departure.
plt.figure(figsize=(13,6))
ax = sns.lineplot(x='days_left', y='price', data=df, color='blue')
ax.set_title('Days Left For Departure Versus Ticket Price', fontsize=20)
ax.set_xlabel('Days Left for Departure', fontsize=15)
ax.set_ylabel('Price', fontsize=15)
plt.show()


## GPU Setup

---



---


In [None]:
# Shell escape: run the NVIDIA `nvidia-smi` tool to report the GPU(s) visible to this runtime.
!nvidia-smi

## Deep Learning / Neural Network

---



---



### View the data again

In [None]:
# Reload the dataset from the GitHub raw URL so the modelling section
# starts from a fresh frame, then show a random sample of 30 rows.
csv_path = "https://raw.githubusercontent.com/nidhi440/FlightfareData/main/Clean_Dataset.csv"
df=pd.read_csv(csv_path)
df.sample(30)

### Clean up the data for better performance

In [None]:
# convert column "stops" into numeric
def mapper(x):
    """Translate a textual stop count into an integer.

    Returns 0 for 'zero', 1 for 'one', and 2 for every other value.
    """
    if x == 'zero':
        return 0
    return 1 if x == 'one' else 2

# Only map while the column is still textual. Once it is numeric, a second
# run of this cell would send every value through mapper's fallback branch
# and silently turn the whole column into 2.
if not pd.api.types.is_numeric_dtype(df['stops']):
    df['stops']=df['stops'].apply(mapper)

# Show how many rows fall into each encoded stop count.
df[['stops']].value_counts()

In [None]:
# Convert duration (hr format) into min format, rounded to whole minutes.
# Vectorized arithmetic replaces the per-row Python lambda.
# NOTE: this overwrites the column in place, so running the cell a second
# time multiplies by 60 again; reload `df` before re-running.
df['duration']=(df['duration']*60).round().astype('int64')

df['duration'].sample(5)

In [None]:
# Remaining object-dtype columns still need encoding; list each one with
# its number of distinct values.
object_columns = df.select_dtypes(include='object').columns
for col in object_columns:
    print(f"{col:25} ---> {df[col].nunique():15} unique values")

In [None]:
# Drop the CSV's leftover index column and the flight code.
# errors='ignore' keeps the cell re-runnable: without it a second run
# raises KeyError because the columns are already gone.
df=df.drop(columns=['Unnamed: 0','flight'],errors='ignore')
df.sample(10)

In [None]:
from sklearn.preprocessing import OneHotEncoder

from sklearn.compose import ColumnTransformer, make_column_selector

In [None]:
# One-hot encoder for the categorical columns.
# example: airline_Vistara, airline_Indigo, airline_AirAsia
#          [      0       ,        1      ,        0      ]
oh = OneHotEncoder(handle_unknown='ignore', drop='first')

# Encode every object-dtype column with `oh`; all remaining columns are
# passed through unchanged.
ct = ColumnTransformer(
    transformers=[
        ("cat_encoder", oh, make_column_selector(dtype_include='object')),
    ],
    remainder='passthrough',
)

### Create train set & test set

In [None]:
# y - the target: the price column, kept as a one-column DataFrame
y = df[['price']]

# X - the independent features: every column except the target
X = df.drop(columns=['price'])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows as the test set; the fixed random_state makes
# the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=10,
)

In [None]:
# Fit the column transformer on the training features only, then apply the
# already-fitted transformer to the test features.
# NOTE: both names are overwritten with the encoded output, so this cell
# should not be re-run without re-running the split first.
X_train = ct.fit_transform(X_train)
X_test = ct.transform(X_test)

In [None]:
# Sanity check: shapes of the encoded features and of the targets for both splits.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

In [None]:
# Show the first encoded test row as a dense array next to its target value.
# NOTE(review): .toarray() assumes the transformer returned a SciPy sparse matrix - confirm.
X_test[0].toarray(), y_test.head(1).values

### Set up data in batches

In [None]:
import tensorflow as tf

batch_size=256 # number of rows per batch in the tf.data pipelines built in this cell

def convert_sparse_matrix_to_sparse_tensor(X):
    """Convert a SciPy sparse matrix into a row-major ordered tf.SparseTensor.

    Args:
        X: a sparse matrix exposing ``tocoo()``.

    Returns:
        A ``tf.SparseTensor`` holding the same non-zero entries and shape,
        passed through ``tf.sparse.reorder`` so the indices are in
        canonical order.
    """
    coo = X.tocoo()
    # One (row, col) pair per stored value. np.column_stack replaces
    # np.mat(...).transpose(): np.mat no longer exists in NumPy 2.x.
    # SparseTensor indices are int64, so cast explicitly.
    indices = np.column_stack((coo.row, coo.col)).astype(np.int64)
    return tf.sparse.reorder(tf.SparseTensor(indices, coo.data, coo.shape))

def _to_batches(dataset):
    """Batch a dataset by `batch_size`, cache it, and prefetch with AUTOTUNE."""
    return dataset.batch(batch_size).cache().prefetch(tf.data.AUTOTUNE)


# Replace the encoded feature matrices with their sparse-tensor equivalents.
X_train = convert_sparse_matrix_to_sparse_tensor(X_train)
X_test = convert_sparse_matrix_to_sparse_tensor(X_test)

# train data: (features, target) pairs
data_tf_tr = tf.data.Dataset.from_tensor_slices((X_train, y_train))
data_tr_batches = _to_batches(data_tf_tr)

# test data: (features, target) pairs
data_tf_te = tf.data.Dataset.from_tensor_slices((X_test, y_test))
data_te_batches = _to_batches(data_tf_te)

# test result data: test features only, without targets
data_tf_tre = tf.data.Dataset.from_tensor_slices(X_test)
data_tre_batches = _to_batches(data_tf_tre)

### Create Neural Network

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout,Input
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Number of columns in the encoded training features.
X_train.shape[1]

In [None]:
# Number of encoded input features, and the width used for every hidden
# layer (eight units per input feature).
n_features = X_train.shape[1]
hidden_units = 8 * n_features

model = Sequential()  # we also have Functional API and multi-inputs multi-outputs API

# Input layer: accepts sparse input with one entry per encoded feature.
model.add(Input(shape=(n_features,), sparse=True))

# Hidden layers: eight ReLU Dense layers, with a Dropout(0.75) layer
# inserted after the 4th and the 8th of them.
for counter in range(1, 9):
    model.add(Dense(hidden_units, activation='relu'))
    if counter % 4 == 0:
        model.add(Dropout(0.75))

# Output layer: a single unit with no activation, for the price regression.
model.add(Dense(1))

# `metrics` is passed as a list: newer Keras releases reject a bare string.
model.compile(loss='mean_squared_error',
              optimizer='adam',
              metrics=['mean_absolute_error'])

### Try your own Neural Network

### Train the model

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop training after 5 epochs without improvement and restore the best
# weights seen. The monitor is set to the training loss explicitly: the
# default ("val_loss") is only produced when fit() receives validation
# data, and this notebook's fit() call passes none, so the default
# monitor would never trigger.
e=EarlyStopping(monitor='loss',patience=5,restore_best_weights=True,verbose=1)

In [None]:
%%time
# Train on the batched training dataset for at most 60 epochs, with the
# callback `e` attached; the returned History object is kept for plotting.
history=model.fit(data_tr_batches,epochs=60,callbacks=[e],verbose=1)

### Visualize training outcomes

In [None]:
# Put the values recorded in history.history into a DataFrame and plot
# them, one line per recorded quantity.
d1=pd.DataFrame(history.history)
d1.plot(figsize=(8,8))

### Test the model using test set

In [None]:
# Evaluate the trained model on the batched (features, target) test dataset.
model.evaluate(data_te_batches)

In [None]:
# Predict from the batched, feature-only test dataset that was built for
# this purpose, rather than handing the raw SparseTensor to predict().
# The dataset is not shuffled, so the predictions stay in y_test row order.
pred1=model.predict(data_tre_batches)

# visualize the difference between actual price and predicted price
predicted=pred1.ravel()
actual=y_test.values.ravel()
res=pd.DataFrame({"pred":predicted,"y_test":actual,"diff":abs(predicted-actual)})

plt.figure(figsize=(25,8));
sns.lineplot(data=res['diff']);