In [1]:
# Import libraries
import pandas as pd
from faker import Faker
import random
from faker.providers import BaseProvider

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Dictionary mapping city names to abbreviations
city_abbreviations = {'Bangalore': 'BLR', 'Delhi': 'DEL', 'New Delhi': 'NDEL', 'Kolkata': 'CCU', 'Bhubaneswar': 'IXR', 'Bhubaneshwar': 'BBI', 'Lucknow': 'LKO', 'Mumbai': 'BOM', 'Kochi': 'COK', 'Chennai': 'MAA', 'Pune': 'PNQ', 'Indore': 'IDR', 'Guwahati': 'GAU', 'Nagpur': 'NAG', 'Hyderabad': 'HYD', 'Jaipur': 'JAI', 'Amritsar': 'ATQ', 'Jodhpur': 'JDH', 'Goa': 'GOI', 'Vadodara': 'BDQ', 'Thiruvananthapuram': 'TRV', 'Aurangabad': 'IXU', 'Bengaluru': 'IXB', 'Ahmedabad': 'AMD', 'Udaipur': 'UDR', 'Dehradun': 'DED', 'Bhopal': 'BHO', 'Vijayawada': 'VGA', 'Varanasi': 'VNS', 'Thrissur': 'TCR', 'Chandigarh': 'IXC', 'Patna': 'PAT', 'Jabalpur': 'JLR', 'Kanpur': 'KNU', 'Gwalior': 'GWL', 'Raipur': 'RPR', 'Vishakhapatnam': 'VTZ', 'Srinagar': 'IXS', 'Hubli': 'HBX', 'Imphal': 'IMF', 'Surat': 'STV'}

In [12]:
# Create function for each data feature to use in generating fake data
class FlightProvider(BaseProvider):
    def airline(self):
        # Generate a random airline from a predefined list
        airlines = ['Air India', 'GoAir', 'IndiGo', 'Jet Airways', 'SpiceJet']
        return self.random_element(airlines)
    
    def journey_date(self):
        # Generate a random journey date (day, month, year)
        return (self.random_int(1, 31), self.random_int(1, 12), self.random_int(2020, 2022))
    
    def source_city(self):
        # Generate a random source city from a predefined list
        cities = ['Banglore', 'Delhi', 'Mumbai', 'Kolkata', 'Chennai', ]
        return self.random_element(cities)
    
    def destination(self):
        # Generate a random destination city
        cities = ['Banglore', 'Cochin', 'Delhi', 'Hyderabad', 'Kolkata', 'New Delhi']
        return self.random_element(cities)
    
    def route(self):
        # Generate a random route (list of cities)
        cities = ['BLR', 'DEL', 'NDEL', 'CCU', 'IXR', 'BBI', 'LKO', 'BOM', 'COK', 'MAA', 'PNQ', 'IDR', 'GAU', 'NAG', 'HYD', 'JAI', 'ATQ', 'JDH', 'GOI', 'BDQ', 'TRV', 'IXU', 'IXB', 'AMD', 'UDR', 'DED', 'BHO', 'VGA', 'VNS', 'TCR', 'IXC', 'PAT', 'JLR', 'KNU', 'GWL', 'RPR', 'VTZ', 'IXS', 'HBX', 'IMF', 'STV']
        # Add the source and destination cities to the route
        route = [self.source_city, self.destination]
        # Generate a random number of stops (between 2 and 4)
        num_stops = self.random_int(2, 4)
        # Add the stops to the route
        for i in range(num_stops):
            stop = self.random_element(cities)
            # Make sure the stop is not the source or destination
            while stop == self.source_city or stop == self.destination:
                stop = self.random_element(cities)
                route.insert(i+1, stop)
        return self.random_elements(route)
                
    def departure_time(self):
        # Generate a random departure time (hour and minute)
        return (self.random_int(0, 23), self.random_int(0, 59))

    def arrival_time(self):
        # Generate a random arrival time (hour and minute)
        return (self.random_int(0, 23), self.random_int(0, 59))

    def duration(self):
        # Generate a random duration (hours and minutes)
        return (self.random_int(0, 23), self.random_int(0, 59))

    def total_stops(self):
        return self.len(self.route) - 1
    
    def flight_price(self):
        # Generate a random flight price
        return self.random_int(1700, 80000)
    

In [None]:
city_abbreviations = {'Bangalore': 'BLR', 'Delhi': 'DEL', 'New Delhi': 'NDEL', 'Kolkata': 'CCU', 'Bhubaneswar': 'IXR', 'Bhubaneshwar': 'BBI', 'Lucknow': 'LKO', 'Mumbai': 'BOM', 'Kochi': 'COK', 'Chennai': 'MAA', 'Pune': 'PNQ', 'Indore': 'IDR', 'Guwahati': 'GAU', 'Nagpur': 'NAG', 'Hyderabad': 'HYD', 'Jaipur': 'JAI', 'Amritsar': 'ATQ', 'Jodhpur': 'JDH', 'Goa': 'GOI', 'Vadodara': 'BDQ', 'Thiruvananthapuram': 'TRV', 'Aurangabad': 'IXU', 'Bengaluru': 'IXB', 'Ahmedabad': 'AMD', 'Udaipur': 'UDR', 'Dehradun': 'DED', 'Bhopal': 'BHO', 'Vijayawada': 'VGA', 'Varanasi': 'VNS', 'Thrissur': 'TCR', 'Chandigarh': 'IXC', 'Patna': 'PAT', 'Jabalpur': 'JLR', 'Kanpur': 'KNU', 'Gwalior': 'GWL', 'Raipur': 'RPR', 'Vishakhapatnam': 'VTZ', 'Srinagar': 'IXS', 'Hubli': 'HBX', 'Imphal': 'IMF', 'Surat': 'STV'}

fp = FlightProvider()

In [9]:
# Initialize generator
fake = Faker()
fake.add_provider(FlightProvider)

In [None]:
# # import random

# # Generate a list of random city names
# cities = [fake.city() for _ in range(10)]

# # Create a mapping between the city names and their abbreviations
# abbreviations = {}
# for city in cities:
#     abbreviation = ''.join([word[:3] for word in city.split()])
#     abbreviations[city] = abbreviation

# # Generate fake flight data
# routes = []
# for _ in range(1000):
#     source_city = random.choice(cities)
#     destination_city = random.choice(cities)
#     route = f'{abbreviations[source_city]}-{abbreviations[destination_city]}'
#     routes.append(route)

# print(routes)

In [15]:
# Create an empty dictionary to hold the generated flight data
flight_data = {}

# Generate flight data 
for i in range(2000):
    airline = fake.airline()
    journey_date = fake.journey_date()
    source_city = fake.source_city()
    destination = fake.destination()
    route = fake.route()
    departure_time = fake.departure_time()
    arrival_time = fake.arrival_time()
    duration = fake.duration()
    total_stops = fake.total_stops()
    flight_price = fake.flight_price()

    flight_data[i] = {
        'airline': airline,
        'journey_date': journey_date,
        'source_city': source_city,
        'destination': destination,
        'route': route,
        'departure_time': departure_time,
        'arrival_time': arrival_time,
        'duration': duration,
        'total_stops': total_stops,
        'flight_price': flight_price
    } 

TypeError: object of type 'method' has no len()

In [None]:
# View the first key-value pair in the flight_data dictionary
for key, value in flight_data.items():
    if key == 0:
        print(key, value)

We will need to do a bit of preprocessing on the data before it will be ready to use in the model.

In [None]:
# Create dataframe of flight data
flight_df = pd.DataFrame.from_dict(flight_data, orient='index')

In [None]:
# View the data
flight_df.head()

In [None]:
# Calculate the total number of stops for each flight
total_stops = fp.calculate_total_stops()

# Add the total number of stops to the dataframe
flight_df['total_stops'] = total_stops

In [None]:
# Create new variables to hold the days and months 
days = flight_df['journey_date'].apply(lambda x: x[0])
months = flight_df['journey_date'].apply(lambda x: x[1])

In [None]:
# Add new columns to dataframe
flight_df.insert(len(flight_df.columns), 'journey_day', days)
flight_df.insert(len(flight_df.columns), 'journey_month', months)

In [None]:
flight_df.head()

In [None]:
# Drop the journey_date column
flight_df = flight_df.drop('journey_date', axis=1)

In [None]:
flight_df.head()

In [None]:
# Extract hours
def extract_hours(col):
    flight_df[col + '_hour'] = flight_df[col].apply(lambda x: x[0])

# Extract minutes
def extract_minutes(col):
    flight_df[col + '_minute'] = flight_df[col].apply(lambda x: x[1])

# Drops the column
def drop_column(col):
    flight_df.drop(col, axis=1, inplace=True)


# hours = flight_df['departure_time'].apply(lambda x: x[0])
# minutes = flight_df['departure_time'].apply(lambda x: x[1])

In [None]:
# Call functions for departure_time
extract_hours('departure_time')
extract_minutes('departure_time')
drop_column('departure_time')

In [None]:
# Call function for arrival_time
extract_hours('arrival_time')
extract_minutes('arrival_time')
drop_column('arrival_time')

In [None]:
# Call function for duration
extract_hours('duration')
extract_minutes('duration')
drop_column('duration')

In [None]:
flight_df.head()

In [None]:
flight_df['airline'].value_counts()

In [None]:
value_counts = flight_df['route'].value_counts()
unique_values = value_counts.index
print(unique_values)

In [None]:
# Create a dictionary mapping city names to abbreviations
city_abbreviations = {
    'BLR': 'Banglore',
    'DEL': 'Delhi',
    'NDEL': 'New Delhi',
    'CCU': 'Kolkata',
    'IXR': 'Ranchi',
    'BBI': 'Bhubaneswar',
    'LKO': 'Lucknow',
    'BOM': 'Mombai',
    'COK': 'Cochin',
    'MAA': 'Chennai',
    'PNQ': 'Pune',
    'IDR': 'Indore',
    'GAU': 'Guwahati',
    'NAG': 'Nagpour',
    'HYD': 'Hyderabad',
    'JAI': 'Jaipur',
    'ATQ': 'Amritsar',
    'JDH': 'Jodhpur',
    'GOI': 'Goa',
    'BDQ': 'Vadodara',
    'TRV': 'Thiruvananthapuram',
    'IXU': 'Aurangabad',
    'IXB': 'Bagdogra',
    'AMD': 'Ahmedabad',
    'UDR': 'Udaipur',
    'DED': 'Dehradun',
    'BHO': 'Bhopal',
    'VGA': 'Vijayawada',
    'VNS': 'Varanasi',
    'TCR': 'Tuticorin',
    'IXC': 'Chandigarh',
    'PAT': 'Patna',
    'JLR': 'Jabalpur',
    'KNU': 'Kanpur',
    'GWL': 'Gwalior',
    'RPR': 'Raipur',
    'VTZ': 'Visakhapatnam',
    'IXS': 'Silchar',
    'HBX': 'Hubli',
    'IMF': 'Imphal',
    'STV': 'Surat'
}

# Split the routes column into a list of routes
routes_list = flight_df['routes'].str.split(',')

# Create a list of abbreviated routes
abbreviated_routes = []
for routes in routes_list:
    # Concatenate the abbreviations for the source and destination cities
    abbreviated_route = city_abbreviations[routes[0]] + '-' + city_abbreviations[routes[-1]]
    abbreviated_routes.append(abbreviated_route)

In [None]:
route = flight_df['route'].apply(lambda x: pd.Series(x).value_counts())
route.head()

In [None]:
route.fillna('None', axis=1)

In [None]:
destination

---

In [None]:
# Load test data
test_df = pd.read_csv('/Users/tiffanivick/Workspace/Travelers_Analysis/Test_set.csv')

In [None]:
# View the data
test_df.head()

In [None]:
# Format the column titles to lowercase
test_df = test_df.rename(columns=lambda x: x.lower())

# View the updated dataframe
test_df.head()

## Preprocess data

In [None]:
test_df.dtypes

In [None]:
def convert_to_datetime(col):
    """ 
    Converts the column's datatype to datetime
    """
    test_df[col] = pd.to_datetime(test_df[col])

In [None]:
test_df.columns

In [None]:
# Convert columns datatypes to datetime
for i in ['date_of_journey', 'dep_time', 'arrival_time']:
    convert_to_datetime(i)

In [None]:
test_df.dtypes

In [None]:
# Extract the day from the date_of_journey column
test_df['journey_day'] = test_df['date_of_journey'].dt.day

# Extract the month from the date_of_journey column
test_df['journey_month'] = test_df['date_of_journey'].dt.month

In [None]:
test_df.head()

In [None]:
# Drop the date_of_journey column from the dataset
test_df.drop('date_of_journey', axis=1, inplace=True)

In [None]:
def extract_hour(df, col):
    """ 
    Extracts the hour from the datetime column
    """
    df[col + '_hour'] = df[col].dt.hour
    
def extract_min(df, col):
    """ 
    Extracts the minute from the datetime column
    """
    df[col + '_minute'] = df[col].dt.minute    
    
def drop_col(df, col):
    """ 
    Drops the column from the dataframe
    """
    df.drop(col, axis=1, inplace=True)

In [None]:
# Call functions for the dep_time column
extract_hour(test_df, 'dep_time')
extract_min(test_df, 'dep_time')
drop_col(test_df, 'dep_time')

In [None]:
# Call functions for the arrival_time column
extract_hour(test_df, 'arrival_time')
extract_min(test_df, 'arrival_time')
drop_col(test_df, 'arrival_time')

In [None]:
test_df.head()

In [None]:
dur_list = list(test_df['duration'])

for i in range(len(dur_list)):
    if len(dur_list[i].split(' ')) == 2:
        pass
    else:
        # Checks if duration contains only the hour
        if 'h' in dur_list[i]:
            # Adds 0 to minutes
            dur_list[i] = dur_list[i] + ' 0m' 
        else:
            dur_list[i] = '0h ' + dur_list[i]

In [None]:
test_df['duration'] = dur_list

In [None]:
test_df.head()

In [None]:
def hour(x):
    """ 
    Returns the hour part
    """
    return x.split(' ')[0][0: -1]

def minutes(x):
    """ 
    Returns the minute part
    """
    return x.split(' ')[1][0: 2].strip('m')

In [None]:
test_df['dur_hour'] = test_df['duration'].apply(hour)

In [None]:
test_df['dur_min'] = test_df['duration'].apply(minutes)

In [None]:
test_df.head()

In [None]:
drop_col(test_df, 'duration')

In [None]:
test_df.dtypes

In [None]:
test_df['dur_hour'] = test_df['dur_hour'].astype(int)
test_df['dur_min'] = test_df['dur_min'].astype(int)

In [None]:
test_df.dtypes

In [None]:
# Create a list to hold the categorical columns
column = [column for column in test_df.columns if test_df[column].dtype == 'object']
print(column)

In [None]:
# Create a list to hold the continuous columns
continuous_col = [column for column in test_df.columns if test_df[column].dtype != 'object']
print(continuous_col)

In [None]:
# Create a dataframe to hold the categorical data 
categorical = test_df[column]

In [None]:
categorical.head()

In [None]:
# Perform OneHotEncoding on airline data
airline = pd.get_dummies(categorical['airline'], drop_first=True)

In [None]:
airline.head()

In [None]:
# Perform OneHotEncoding on source column
source = pd.get_dummies(categorical['source'], drop_first=True)

In [None]:
source.head()

In [None]:
# Perform OneHotEncoding on destination column
destination = pd.get_dummies(categorical['destination'], drop_first=True)

In [None]:
destination.head()

In [None]:
# Separate the routes in the route column
categorical['route1']=categorical['route'].str.split('→').str[0]
categorical['route2']=categorical['route'].str.split('→').str[1]
categorical['route3']=categorical['route'].str.split('→').str[2]
categorical['route4']=categorical['route'].str.split('→').str[3]
categorical['route5']=categorical['route'].str.split('→').str[4]

In [None]:
categorical.head()

In [None]:
drop_col(categorical, 'route')

In [None]:
categorical.isnull().sum()

In [None]:
for i in ['route3', 'route4', 'route5']:
    categorical[i].fillna('None', inplace=True)

In [None]:
categorical.isnull().sum()

In [None]:
# apply LabelEncoder
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

In [None]:
for i in ['route1', 'route2', 'route3', 'route4', 'route5']:
    categorical[i] = encoder.fit_transform(categorical[i]) 

In [None]:
categorical.head()

In [None]:
drop_col(categorical, 'additional_info')

In [None]:
categorical['total_stops'].unique()

In [None]:
# Encode total stops
dict = {'non-stop': 0, '1 stop': 1, '2 stops': 2, '3 stops': 3, '4 stops': 4}
categorical['total_stops'] = categorical['total_stops'].map(dict)

In [None]:
categorical['total_stops']

In [None]:
drop_col(categorical,'source')
drop_col(categorical,'destination')
drop_col(categorical,'airline')

In [None]:
final_test_df = pd.concat([categorical, airline, source, destination, test_df[continuous_col]], axis=1)

In [None]:
final_test_df.head()

In [None]:
pd.set_option('display.max_columns',33)
final_test_df.head()

In [None]:
test_df.shape

#### Test model with new dataset

In [None]:
X_test = final_test_df

In [None]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=123)

In [None]:
import pickle

# Load the model
with open('./pretrained_model.pkl', 'rb') as f:
    model = pickle.load(f)

In [None]:
predictions = model.predict(X_test)

---

In [None]:
# flights = [
#     {'total_stops': 2, 'routes': ['New York', 'Chicago', 'Los Angeles'], 'destination': 'Los Angeles', 'airline': 'United', 'journey_date': '2022-01-01', 'departure_time': '09:00', 'arrival_time': '12:30', 'duration': '03:30', 'price': 500},
#     {'total_stops': 1, 'routes': ['Chicago', 'New York'], 'destination': 'New York', 'airline': 'Delta', 'journey_date': '2022-01-03', 'departure_time': '14:00', 'arrival_time': '17:30', 'duration': '03:30', 'price': 400},
#     {'total_stops': 0, 'routes': ['Los Angeles', 'Chicago'], 'destination': 'Chicago', 'airline': 'American', 'journey_date': '2022-01-05', 'departure_time': '10:00', 'arrival_time': '12:00', 'duration': '02:00', 'price': 300},
#     {'total_stops': 1, 'routes': ['New York', 'Chicago'], 'destination': 'Chicago', 'airline': 'United', 'journey_date': '2022-01-07', 'departure_time': '08:00', 'arrival_time': '10:30', 'duration': '02:30', 'price': 350},
#     {'total_stops': 0, 'routes': ['Chicago', 'Los Angeles'], 'destination': 'Los Angeles', 'airline': 'Delta', 'journey_date': '2022-01-09', 'departure_time': '12:00', 'arrival_time': '15:00', 'duration': '03:00', 'price': 450},
# ]