# In this notebook I combine all of my datasets to get them ready for EDA & pre-processing

In [2]:
import pandas as pd

### Read in 3 datasets

In [3]:
# Load the cleaned by-month flight data; rows are keyed by 'year-month' and 'market_city'
flights = pd.read_csv('./data/clean/flightdata_city_bymonth.csv')
flights.head()

Unnamed: 0,year-month,market_city,Passengers,Seats,Flights,Distance,Origin Population,Destination Population
0,1990-01-31,"Abilene, TX - Dallas, TX",741,1018,7,158.0,147700,8019250
1,1990-01-31,"Akron, OH - Atlanta, GA",3742,5610,56,528.0,658558,3087755
2,1990-01-31,"Akron, OH - Birmingham, AL",75,99,1,585.0,658558,958585
3,1990-01-31,"Akron, OH - Chicago, IL",7863,20688,170,344.0,658558,16395048
4,1990-01-31,"Akron, OH - Cleveland, OH",0,123,1,40.0,658558,2103367


In [4]:
# Load the cleaned by-month fuel price data (column 'fuel_usd_pergallon')
fuel = pd.read_csv('./data/clean/fuelpricing_bymonth.csv')
fuel.head()

Unnamed: 0,year-month,fuel_usd_pergallon
0,1990-04-30,0.54
1,1990-05-31,0.515
2,1990-06-30,0.494
3,1990-07-31,0.535
4,1990-08-31,0.791


In [5]:
# Load the cleaned by-quarter route fares; 'year-month' holds quarter-end dates in the preview
airfare = pd.read_csv('./data/clean/routepricing_byquarter.csv')
airfare.head()

Unnamed: 0,year-month,market_city,city1,city2,fare
0,1996-03-31,"Hartford, CT - West Palm Beach/Palm Beach, FL","Hartford, CT","West Palm Beach/Palm Beach, FL",129.2
1,1996-03-31,"Minneapolis/St. Paul, MN - San Francisco, CA (...","Minneapolis/St. Paul, MN","San Francisco, CA (Metropolitan Area)",290.73
2,1996-03-31,"Cincinnati, OH - Tampa, FL (Metropolitan Area)","Cincinnati, OH","Tampa, FL (Metropolitan Area)",153.17
3,1996-03-31,"Denver, CO - Portland, OR","Denver, CO","Portland, OR",240.01
4,1996-03-31,"Los Angeles, CA (Metropolitan Area) - Phoenix, AZ","Los Angeles, CA (Metropolitan Area)","Phoenix, AZ",73.67


### Set datetime as index on all dataframes

In [6]:
# Index each frame by its 'year-month' column and sort the rows by that index.
# The index values are still unparsed strings as read from the CSV files here.
flights, fuel, airfare = (
    frame.set_index('year-month').sort_index()
    for frame in (flights, fuel, airfare)
)

### market_city must match on flights & airfare df's

In [7]:
# Report the (rows, columns) shape of each input frame
for label, frame in (('flights', flights), ('fuel', fuel), ('airfare', airfare)):
    print(f'{label} shape: {frame.shape}')

flights shape: (1051957, 7)
fuel shape: (365, 1)
airfare shape: (95023, 4)


In [8]:
# Number of distinct routes that appear in the fare data
n_priced_routes = airfare['market_city'].nunique(dropna=False)
print(f'# of Routes with Pricing Data: {n_priced_routes}')

# of Routes with Pricing Data: 1629


In [9]:
# Identify unique markets to potentially model

# Number of fare rows a route must have to count as complete
# (95 means data is available for every quarter).
FULL_QUARTER_COUNT = 95

# Count rows per route directly on the Series. Filtering the value_counts
# result by value avoids depending on the column names produced by
# DataFrame(...).reset_index(), which changed in pandas 2.0.
route_quarter_counts = airfare['market_city'].value_counts()

# Keep only the routes with a complete history, as an alphabetically sorted list
price_count = sorted(route_quarter_counts[route_quarter_counts == FULL_QUARTER_COUNT].index)
price_count[:10]

['Albany, NY - Chicago, IL',
 'Albany, NY - Orlando, FL',
 'Albany, NY - Washington, DC (Metropolitan Area)',
 'Albuquerque, NM - Chicago, IL',
 'Albuquerque, NM - Dallas/Fort Worth, TX',
 'Albuquerque, NM - Denver, CO',
 'Albuquerque, NM - Houston, TX',
 'Albuquerque, NM - Las Vegas, NV',
 'Albuquerque, NM - Los Angeles, CA (Metropolitan Area)',
 'Albuquerque, NM - New York City, NY (Metropolitan Area)']

In [10]:
# Keep only the routes with a complete fare history. The explicit .copy() makes
# the result an independent DataFrame, so the column assignments in later cells
# do not operate on a slice of the unfiltered frame (SettingWithCopyWarning).
airfare = airfare.loc[airfare['market_city'].isin(price_count)].copy()
airfare.head()

Unnamed: 0_level_0,market_city,city1,city2,fare
year-month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1996-03-31,"Minneapolis/St. Paul, MN - San Francisco, CA (...","Minneapolis/St. Paul, MN","San Francisco, CA (Metropolitan Area)",290.73
1996-03-31,"Cincinnati, OH - Tampa, FL (Metropolitan Area)","Cincinnati, OH","Tampa, FL (Metropolitan Area)",153.17
1996-03-31,"Denver, CO - Portland, OR","Denver, CO","Portland, OR",240.01
1996-03-31,"Los Angeles, CA (Metropolitan Area) - Phoenix, AZ","Los Angeles, CA (Metropolitan Area)","Phoenix, AZ",73.67
1996-03-31,"Atlantic City, NJ - Miami, FL (Metropolitan Area)","Atlantic City, NJ","Miami, FL (Metropolitan Area)",96.28


In [11]:
# Distinct routes that have a complete set of fare rows
airfare['market_city'].nunique(dropna=False)

632

In [12]:
# Distinct routes present in the flights data
flights['market_city'].nunique(dropna=False)

30331

In [13]:
# A route label looks like '<city1> - <city2>'; store each endpoint in its own column
flights['city1'] = [route.split(' - ')[0] for route in flights['market_city']]
flights['city2'] = [route.split(' - ')[1] for route in flights['market_city']]

In [14]:
# Shape and preview of flights after adding the city1/city2 columns
print(flights.shape)
flights.head()

(1051957, 9)


Unnamed: 0_level_0,market_city,Passengers,Seats,Flights,Distance,Origin Population,Destination Population,city1,city2
year-month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1990-01-31,"Abilene, TX - Dallas, TX",741,1018,7,158.0,147700,8019250,"Abilene, TX","Dallas, TX"
1990-01-31,"Akron, OH - Atlanta, GA",3742,5610,56,528.0,658558,3087755,"Akron, OH","Atlanta, GA"
1990-01-31,"Akron, OH - Birmingham, AL",75,99,1,585.0,658558,958585,"Akron, OH","Birmingham, AL"
1990-01-31,"Akron, OH - Chicago, IL",7863,20688,170,344.0,658558,16395048,"Akron, OH","Chicago, IL"
1990-01-31,"Akron, OH - Cleveland, OH",0,123,1,40.0,658558,2103367,"Akron, OH","Cleveland, OH"


In [15]:
# Substrings found in the airfare city labels, mapped to the spelling to use
# instead. The replacements are applied in this order to both endpoint columns.
AIRFARE_CITY_REPLACEMENTS = {
    ' (Metropolitan Area)': '',
    'Dallas/Fort Worth, TX': 'Dallas, TX',
    'Greensboro/High Point, NC': 'Greensboro, NC',
    'Minneapolis/St. Paul, MN': 'Minneapolis, MN',
    'New York City, NY': 'New York, NY',
    'Raleigh/Durham, NC': 'Raleigh, NC',
}


def normalize_airfare_city(city):
    """Return `city` with every AIRFARE_CITY_REPLACEMENTS substring substituted.

    Parameters
    ----------
    city : str
        A city label from the airfare 'city1' or 'city2' column.

    Returns
    -------
    str
        The label after all replacements have been applied in mapping order.
    """
    for old, new in AIRFARE_CITY_REPLACEMENTS.items():
        city = city.replace(old, new)
    return city


# assign() returns a new DataFrame instead of writing into the filtered frame,
# so re-running this cell is safe and no SettingWithCopyWarning is triggered.
airfare = airfare.assign(
    city1=airfare['city1'].map(normalize_airfare_city),
    city2=airfare['city2'].map(normalize_airfare_city),
)

In [16]:
# Every distinct city that appears on either end of a flights route, sorted
flights_city_names = sorted(
    set(flights['city1'].unique()) | set(flights['city2'].unique())
)
print(len(flights_city_names))
flights_city_names[:10]

563


['Aberdeen, SD',
 'Abilene, TX',
 'Akron, OH',
 'Alamogordo, NM',
 'Albany, GA',
 'Albany, NY',
 'Albany, OR',
 'Albuquerque, NM',
 'Alexandria, LA',
 'Alexandria, MN']

In [17]:
# Every distinct city that appears on either end of an airfare route, sorted
airfare_city_names = sorted(
    set(airfare['city1'].unique()) | set(airfare['city2'].unique())
)
print(len(airfare_city_names))
airfare_city_names[:10]

73


['Albany, NY',
 'Albuquerque, NM',
 'Amarillo, TX',
 'Atlanta, GA',
 'Atlantic City, NJ',
 'Austin, TX',
 'Birmingham, AL',
 'Boise, ID',
 'Boston, MA',
 'Buffalo, NY']

In [18]:
# Partition the flights city names by whether the airfare data uses the same name
airfare_city_lookup = set(airfare_city_names)
in_airfare_and_flight = [
    city for city in flights_city_names if city in airfare_city_lookup
]
not_in_airfare_and_flight = [
    city for city in flights_city_names if city not in airfare_city_lookup
]
        

In [19]:
# 65 of the 73 airfare city names also appear among the flights city names
len(in_airfare_and_flight)

65

In [20]:
# Airfare city names that do NOT match any flights city name
unmatched_airfare_cities = set(airfare_city_names) - set(in_airfare_and_flight)
remaining = list(unmatched_airfare_cities)
unmatched_airfare_cities

{'Boise, ID',
 'Denver, CO',
 'Fort Myers, FL',
 'Louisville, KY',
 'Midland/Odessa, TX',
 'Norfolk, VA',
 'Sarasota/Bradenton, FL',
 'West Palm Beach/Palm Beach, FL'}

In [21]:
# Print the unmatched airfare city names alphabetically, stripping any
# ' (Metropolitan Area)' suffix that might still be present
for city in sorted(remaining):
    cleaned = city.replace(' (Metropolitan Area)', "")
    print(cleaned)

Boise, ID
Denver, CO
Fort Myers, FL
Louisville, KY
Midland/Odessa, TX
Norfolk, VA
Sarasota/Bradenton, FL
West Palm Beach/Palm Beach, FL


In [22]:
# Rebuild the route label from the cleaned endpoint names so it follows the
# '<city1> - <city2>' format used by flights['market_city']
airfare['market_city'] = [
    f'{origin} - {dest}'
    for origin, dest in zip(airfare['city1'], airfare['city2'])
]
airfare['market_city']

year-month
1996-03-31    Minneapolis, MN - San Francisco, CA
1996-03-31             Cincinnati, OH - Tampa, FL
1996-03-31              Denver, CO - Portland, OR
1996-03-31          Los Angeles, CA - Phoenix, AZ
1996-03-31          Atlantic City, NJ - Miami, FL
                             ...                 
2019-09-30            Atlanta, GA - Milwaukee, WI
2019-09-30         New Orleans, LA - New York, NY
2019-09-30          Detroit, MI - Los Angeles, CA
2019-09-30             Chicago, IL - Portland, OR
2019-09-30      Portland, OR - Salt Lake City, UT
Name: market_city, Length: 60040, dtype: object

In [23]:
# Parse the string 'year-month' index into datetimes and keep it as the index
airfare = airfare.set_index(pd.to_datetime(airfare.index))

In [24]:
# Parse the string 'year-month' index into datetimes and keep it as the index
flights = flights.set_index(pd.to_datetime(flights.index))

In [25]:
# Row/column count of flights after the datetime index conversion
flights.shape

(1051957, 9)

In [26]:
# Row/column count of airfare after the datetime index conversion
airfare.shape

(60040, 4)

In [27]:
# airfare is indexed by the parsed 'year-month' datetimes, so read the date
# parts straight from the index. A Series taken from reset_index() carries a
# fresh 0..n-1 index, and assigning it back aligns on index labels rather than
# position, which does not line up with the datetime index.
airfare['year'] = airfare.index.year
# Integer quarter number 1-4 (floor division, not true division)
airfare['quarter'] = (airfare.index.month - 1) // 3 + 1

In [28]:
# Move 'year-month' back to a regular column and derive the calendar parts
# used as merge keys against flights.
airfare = airfare.reset_index()
airfare['year-month'] = pd.to_datetime(airfare['year-month'])
airfare['year'] = airfare['year-month'].dt.year
airfare['month'] = airfare['year-month'].dt.month
# Built-in vectorized quarter (1-4); same values as (month - 1) // 3 + 1
airfare['quarter'] = airfare['year-month'].dt.quarter
# Spot-check one quarter-end date
airfare[airfare['year-month'] == '1996-12-31']

Unnamed: 0,year-month,market_city,city1,city2,fare,year,quarter,month
1896,1996-12-31,"Chicago, IL - Tampa, FL","Chicago, IL","Tampa, FL",137.32,1996,4,12
1897,1996-12-31,"Hartford, CT - Los Angeles, CA","Hartford, CT","Los Angeles, CA",307.27,1996,4,12
1898,1996-12-31,"Pittsburgh, PA - San Francisco, CA","Pittsburgh, PA","San Francisco, CA",314.86,1996,4,12
1899,1996-12-31,"Atlanta, GA - Dallas, TX","Atlanta, GA","Dallas, TX",208.46,1996,4,12
1900,1996-12-31,"Boston, MA - New Orleans, LA","Boston, MA","New Orleans, LA",189.05,1996,4,12
...,...,...,...,...,...,...,...,...
2523,1996-12-31,"Chicago, IL - Phoenix, AZ","Chicago, IL","Phoenix, AZ",139.98,1996,4,12
2524,1996-12-31,"San Francisco, CA - St. Louis, MO","San Francisco, CA","St. Louis, MO",195.88,1996,4,12
2525,1996-12-31,"Norfolk, VA - San Diego, CA","Norfolk, VA","San Diego, CA",269.58,1996,4,12
2526,1996-12-31,"Kansas City, MO - Tampa, FL","Kansas City, MO","Tampa, FL",136.68,1996,4,12


In [29]:
# Inspect flights before deriving the year/month/quarter columns
flights

Unnamed: 0_level_0,market_city,Passengers,Seats,Flights,Distance,Origin Population,Destination Population,city1,city2
year-month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1990-01-31,"Abilene, TX - Dallas, TX",741,1018,7,158.0,147700,8019250,"Abilene, TX","Dallas, TX"
1990-01-31,"Akron, OH - Atlanta, GA",3742,5610,56,528.0,658558,3087755,"Akron, OH","Atlanta, GA"
1990-01-31,"Akron, OH - Birmingham, AL",75,99,1,585.0,658558,958585,"Akron, OH","Birmingham, AL"
1990-01-31,"Akron, OH - Chicago, IL",7863,20688,170,344.0,658558,16395048,"Akron, OH","Chicago, IL"
1990-01-31,"Akron, OH - Cleveland, OH",0,123,1,40.0,658558,2103367,"Akron, OH","Cleveland, OH"
...,...,...,...,...,...,...,...,...,...
2009-12-31,"Youngstown, OH - Orlando, FL",1085,1200,8,861.0,562963,2082421,"Youngstown, OH","Orlando, FL"
2009-12-31,"Yuma, AZ - El Centro, CA",119,870,29,58.0,196972,166874,"Yuma, AZ","El Centro, CA"
2009-12-31,"Yuma, AZ - Gulfport, MS",129,130,1,1511.0,196972,238772,"Yuma, AZ","Gulfport, MS"
2009-12-31,"Yuma, AZ - Los Angeles, CA",2297,3570,119,237.0,196972,25749594,"Yuma, AZ","Los Angeles, CA"


In [30]:
# Move 'year-month' back to a regular column and derive the calendar parts
# used as merge keys against airfare.
flights = flights.reset_index()
flights['year-month'] = pd.to_datetime(flights['year-month'])
flights['year'] = flights['year-month'].dt.year
flights['month'] = flights['year-month'].dt.month
# Built-in vectorized quarter (1-4); same values as (month - 1) // 3 + 1
flights['quarter'] = flights['year-month'].dt.quarter

In [31]:
# TODO: pick up from here -- get the two DataFrames to merge while preserving all 95 quarters of fare data.

# Check how many months of data each route has in the flights DataFrame -- gaps there may be why the merge loses rows.

In [32]:
# Trial merge: attach quarterly fares to monthly flight rows, drop rows with
# any missing value, then count how many distinct fare dates survived
merge_keys = ['year', 'quarter', 'market_city']
did_it_work = flights.merge(airfare, how='left', on=merge_keys).dropna()
did_it_work['year-month_y'].nunique()

56

In [33]:
# Latest month present in the flights data
flights['year-month'].max()

Timestamp('2009-12-31 00:00:00')

In [34]:
# Earliest month present in the flights data
flights['year-month'].min()

Timestamp('1990-01-31 00:00:00')

In [35]:
# Distinct months in flights: 240 months = 20 years of data, 1990 through 2009
flights['year-month'].nunique(dropna=False)

240

In [36]:
# Preview flights with the derived year, month and quarter columns
flights.head(3)

Unnamed: 0,year-month,market_city,Passengers,Seats,Flights,Distance,Origin Population,Destination Population,city1,city2,year,month,quarter
0,1990-01-31,"Abilene, TX - Dallas, TX",741,1018,7,158.0,147700,8019250,"Abilene, TX","Dallas, TX",1990,1,1
1,1990-01-31,"Akron, OH - Atlanta, GA",3742,5610,56,528.0,658558,3087755,"Akron, OH","Atlanta, GA",1990,1,1
2,1990-01-31,"Akron, OH - Birmingham, AL",75,99,1,585.0,658558,958585,"Akron, OH","Birmingham, AL",1990,1,1


In [37]:
# Number of monthly rows a route needs to cover the whole flights history
# (the flights data spans 240 distinct months).
MONTHS_IN_FLIGHT_HISTORY = 240

# Count rows per route on the Series itself. This avoids relying on the column
# names produced by DataFrame(value_counts()).reset_index(), which changed in
# pandas 2.0.
route_month_counts = flights['market_city'].value_counts()

# Alphabetically sorted list of the routes with a row for every month
flights_count = sorted(route_month_counts[route_month_counts == MONTHS_IN_FLIGHT_HISTORY].index)
flights_count[:5]

['Albany, NY - Atlanta, GA',
 'Albany, NY - Chicago, IL',
 'Albany, NY - Detroit, MI',
 'Albany, NY - Philadelphia, PA',
 'Albuquerque, NM - Chicago, IL']

In [38]:
# Keep only the routes listed in flights_count, using an isin() boolean mask as in
# https://stackoverflow.com/questions/45803676/python-pandas-loc-filter-for-list-of-values
has_full_history = flights['market_city'].isin(flights_count)
flights = flights[has_full_history]
print(flights.shape)
flights.head()

(366720, 13)


Unnamed: 0,year-month,market_city,Passengers,Seats,Flights,Distance,Origin Population,Destination Population,city1,city2,year,month,quarter
11,1990-01-31,"Albany, NY - Atlanta, GA",9495,20216,141,852.0,811232,3087755,"Albany, NY","Atlanta, GA",1990,1,1
15,1990-01-31,"Albany, NY - Chicago, IL",11303,22257,172,723.0,811232,16395048,"Albany, NY","Chicago, IL",1990,1,1
18,1990-01-31,"Albany, NY - Detroit, MI",5125,10900,109,488.0,811232,8503650,"Albany, NY","Detroit, MI",1990,1,1
25,1990-01-31,"Albany, NY - Philadelphia, PA",8793,22494,205,212.0,811232,10881988,"Albany, NY","Philadelphia, PA",1990,1,1
33,1990-01-31,"Albuquerque, NM - Chicago, IL",6420,15849,118,1117.0,601893,16395048,"Albuquerque, NM","Chicago, IL",1990,1,1


In [39]:
# Left-join fares onto the monthly flight rows by route, quarter and year;
# flight rows without a matching fare keep NaN in the airfare-side columns
final = flights.merge(airfare, how='left', on=['market_city', 'quarter', 'year'])
final.head()

Unnamed: 0,year-month_x,market_city,Passengers,Seats,Flights,Distance,Origin Population,Destination Population,city1_x,city2_x,year,month_x,quarter,year-month_y,city1_y,city2_y,fare,month_y
0,1990-01-31,"Albany, NY - Atlanta, GA",9495,20216,141,852.0,811232,3087755,"Albany, NY","Atlanta, GA",1990,1,1,NaT,,,,
1,1990-01-31,"Albany, NY - Chicago, IL",11303,22257,172,723.0,811232,16395048,"Albany, NY","Chicago, IL",1990,1,1,NaT,,,,
2,1990-01-31,"Albany, NY - Detroit, MI",5125,10900,109,488.0,811232,8503650,"Albany, NY","Detroit, MI",1990,1,1,NaT,,,,
3,1990-01-31,"Albany, NY - Philadelphia, PA",8793,22494,205,212.0,811232,10881988,"Albany, NY","Philadelphia, PA",1990,1,1,NaT,,,,
4,1990-01-31,"Albuquerque, NM - Chicago, IL",6420,15849,118,1117.0,601893,16395048,"Albuquerque, NM","Chicago, IL",1990,1,1,NaT,,,,


In [40]:
# Null counts per column after the left merge
final.isnull().sum()

year-month_x                   0
market_city                    0
Passengers                     0
Seats                          0
Flights                        0
Distance                       0
Origin Population              0
Destination Population         0
city1_x                        0
city2_x                        0
year                           0
month_x                        0
quarter                        0
year-month_y              302712
city1_y                   302712
city2_y                   302712
fare                      302712
month_y                   302712
dtype: int64

In [41]:
# List the merged column names
final.columns

Index(['year-month_x', 'market_city', 'Passengers', 'Seats', 'Flights',
       'Distance', 'Origin Population', 'Destination Population', 'city1_x',
       'city2_x', 'year', 'month_x', 'quarter', 'year-month_y', 'city1_y',
       'city2_y', 'fare', 'month_y'],
      dtype='object')

In [42]:
# Drop rows with any missing value, then remove the merge-helper and
# duplicated columns that are not needed in the combined dataset
columns_to_drop = [
    'city1_x', 'city2_x', 'year', 'month_x', 'quarter',
    'year-month_y', 'city1_y', 'city2_y', 'month_y',
]
final = final.dropna().drop(columns=columns_to_drop)
final.head()

Unnamed: 0,year-month_x,market_city,Passengers,Seats,Flights,Distance,Origin Population,Destination Population,fare
110017,1996-01-31,"Albany, NY - Chicago, IL",10985,18579,175,723.0,825245,17287860,273.9
110020,1996-01-31,"Albuquerque, NM - Chicago, IL",5203,8604,62,1119.5,680994,17287860,156.16
110021,1996-01-31,"Albuquerque, NM - Dallas, TX",43467,69323,494,573.714286,680994,8994450,89.76
110023,1996-01-31,"Albuquerque, NM - Houston, TX",13017,22992,186,750.0,680994,4268132,104.98
110024,1996-01-31,"Albuquerque, NM - Las Vegas, NV",16010,29919,223,487.0,680994,1044023,77.06


In [43]:
# Mapping from the merged column names to the names used in the combined dataset
new_names = dict([
    ('year-month_x', 'year-month'),
    ('market_city', 'route'),
    ('Passengers', 'passengers'),
    ('Seats', 'seat_capacity'),
    ('Flights', 'num_of_flights'),
    ('Distance', 'dist_miles'),
    ('Origin Population', 'pop_origin'),
    ('Destination Population', 'pop_dest'),
    ('fare', 'airfare'),
])

In [44]:
# Apply the column renames defined in new_names
final = final.rename(new_names, axis='columns')
final.head()

Unnamed: 0,year-month,route,passengers,seat_capacity,num_of_flights,dist_miles,pop_origin,pop_dest,airfare
110017,1996-01-31,"Albany, NY - Chicago, IL",10985,18579,175,723.0,825245,17287860,273.9
110020,1996-01-31,"Albuquerque, NM - Chicago, IL",5203,8604,62,1119.5,680994,17287860,156.16
110021,1996-01-31,"Albuquerque, NM - Dallas, TX",43467,69323,494,573.714286,680994,8994450,89.76
110023,1996-01-31,"Albuquerque, NM - Houston, TX",13017,22992,186,750.0,680994,4268132,104.98
110024,1996-01-31,"Albuquerque, NM - Las Vegas, NV",16010,29919,223,487.0,680994,1044023,77.06


In [45]:
# Index the combined data by month; the fuel prices are joined on this index next
final = final.set_index('year-month')
final.head()

Unnamed: 0_level_0,route,passengers,seat_capacity,num_of_flights,dist_miles,pop_origin,pop_dest,airfare
year-month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1996-01-31,"Albany, NY - Chicago, IL",10985,18579,175,723.0,825245,17287860,273.9
1996-01-31,"Albuquerque, NM - Chicago, IL",5203,8604,62,1119.5,680994,17287860,156.16
1996-01-31,"Albuquerque, NM - Dallas, TX",43467,69323,494,573.714286,680994,8994450,89.76
1996-01-31,"Albuquerque, NM - Houston, TX",13017,22992,186,750.0,680994,4268132,104.98
1996-01-31,"Albuquerque, NM - Las Vegas, NV",16010,29919,223,487.0,680994,1044023,77.06


In [46]:
# final has a DatetimeIndex, while fuel is still indexed by the raw 'year-month'
# strings from the CSV. Parse the fuel index explicitly so both sides of the
# index join share a dtype instead of depending on implicit coercion.
# NOTE(review): recent pandas does not appear to coerce a string index for this join - confirm.
fuel_by_month = fuel.copy()
fuel_by_month.index = pd.to_datetime(fuel_by_month.index)

# Left join keeps every flights/fare row and attaches that month's fuel price
final = pd.merge(final, fuel_by_month, how='left', left_index=True, right_index=True)
print(final.shape)
final.head()

(64008, 9)


Unnamed: 0_level_0,route,passengers,seat_capacity,num_of_flights,dist_miles,pop_origin,pop_dest,airfare,fuel_usd_pergallon
year-month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1996-01-31,"Albany, NY - Chicago, IL",10985,18579,175,723.0,825245,17287860,273.9,0.55
1996-01-31,"Albuquerque, NM - Chicago, IL",5203,8604,62,1119.5,680994,17287860,156.16,0.55
1996-01-31,"Albuquerque, NM - Dallas, TX",43467,69323,494,573.714286,680994,8994450,89.76,0.55
1996-01-31,"Albuquerque, NM - Houston, TX",13017,22992,186,750.0,680994,4268132,104.98,0.55
1996-01-31,"Albuquerque, NM - Las Vegas, NV",16010,29919,223,487.0,680994,1044023,77.06,0.55


In [47]:
# Distinct routes in the combined dataset
final['route'].nunique(dropna=False)

381

In [48]:
# Expected row count if every route has one row for every month; compare with
# final.shape[0]. Derived from the data rather than hard-coding 381 * 168.
final['route'].nunique() * final.index.nunique()

64008

In [49]:
# Latest month in the combined dataset
final.index.max()

Timestamp('2009-12-31 00:00:00')

In [50]:
# Row/column count of the combined dataset
final.shape

(64008, 9)

In [51]:
# Verify that every route has exactly the same number of rows:
# print the number of distinct months, then show the row count per route
print(final.index.nunique(dropna=False))
final['route'].value_counts()

168


Chicago, IL - Raleigh, NC          168
Atlanta, GA - Austin, TX           168
Albuquerque, NM - Houston, TX      168
Dallas, TX - New Orleans, LA       168
Dallas, TX - Tulsa, OK             168
                                  ... 
Houston, TX - Los Angeles, CA      168
El Paso, TX - San Antonio, TX      168
Houston, TX - San Francisco, CA    168
Dallas, TX - Milwaukee, WI         168
Atlanta, GA - San Antonio, TX      168
Name: route, Length: 381, dtype: int64

In [52]:
# Persist the combined dataset; the 'year-month' index is written as the first column
final.to_csv('./data/clean/combined.csv')

In [61]:
# Plan the train/test split of the final dataset by month.
# For testing we will remove the airfare column to see how the regression
# model performs on unseen data.
train_data_percentage = 0.80
n_months = len(final.index.unique())

print(f'First Month: {final.index.min()}')
print(f'Last Month: {final.index.max()}')
print(f'Total Months: {n_months}')

print(f'Number of Months for Training Dataset: {round(n_months * train_data_percentage)}')
print(f'Number of Months for Testing Dataset: {round(n_months * (1 - train_data_percentage))}')

First Month: 1996-01-31 00:00:00
Last Month: 2009-12-31 00:00:00
Total Months: 168
Number of Months for Training Dataset: 134
Number of Months for Testing Dataset: 34


In [81]:
# Month counts on each side of a split after the first 132 months
months_sorted = sorted(final.index.unique())
print(len(months_sorted[:132]))
print(len(months_sorted[132:]))

132
36


In [82]:
# First and last month of the 132-month training window
months_sorted = sorted(final.index.unique())
print(months_sorted[0])
print(months_sorted[131])

1996-01-31 00:00:00
2006-12-31 00:00:00


In [83]:
# Rows/columns in the training window (label slice includes both end dates)
final.loc['1996-01-31':'2006-12-31'].shape

(50292, 9)

In [84]:
# First and last month of the testing window (months 133 through 168)
months_sorted = sorted(final.index.unique())
print(months_sorted[132])
print(months_sorted[167])

2007-01-31 00:00:00
2009-12-31 00:00:00


In [85]:
# Rows/columns in the testing window (label slice includes both end dates)
final.loc['2007-01-31':'2009-12-31'].shape

(13716, 9)

In [88]:
# Split the combined data chronologically into train and test files so the
# trained model can be evaluated on unseen data
train_split = final.loc['1996-01-31':'2006-12-31']
test_split = final.loc['2007-01-31':'2009-12-31']
train_split.to_csv('./data/clean/train.csv')
test_split.to_csv('./data/clean/test.csv')