In [33]:
import pandas as pd

flights_df = pd.read_csv('flights.csv',low_memory=False)
airports_df = pd.read_csv('airports.csv')
airlines_df = pd.read_csv('airlines.csv')

In [2]:
flights_df.head()

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,...,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
0,2015,1,1,4,AS,98,N407AS,ANC,SEA,5,...,408.0,-22.0,0,0,,,,,,
1,2015,1,1,4,AA,2336,N3KUAA,LAX,PBI,10,...,741.0,-9.0,0,0,,,,,,
2,2015,1,1,4,US,840,N171US,SFO,CLT,20,...,811.0,5.0,0,0,,,,,,
3,2015,1,1,4,AA,258,N3HYAA,LAX,MIA,20,...,756.0,-9.0,0,0,,,,,,
4,2015,1,1,4,AS,135,N527AS,SEA,ANC,25,...,259.0,-21.0,0,0,,,,,,


In [3]:
airports_df.head()

Unnamed: 0,airport_code,airport,city,state,country,latitude,longitude
0,ABE,Lehigh Valley International Airport,Allentown,PA,USA,40.65236,-75.4404
1,ABI,Abilene Regional Airport,Abilene,TX,USA,32.41132,-99.6819
2,ABQ,Albuquerque International Sunport,Albuquerque,NM,USA,35.04022,-106.60919
3,ABR,Aberdeen Regional Airport,Aberdeen,SD,USA,45.44906,-98.42183
4,ABY,Southwest Georgia Regional Airport,Albany,GA,USA,31.53552,-84.19447


In [4]:
airlines_df.head()

Unnamed: 0,airline_code,airline
0,UA,United Air Lines Inc.
1,AA,American Airlines Inc.
2,US,US Airways Inc.
3,F9,Frontier Airlines Inc.
4,B6,JetBlue Airways


## Scoring for number of flights offered per airline

In [5]:
merged_df = flights_df.merge(airports_df, left_on='ORIGIN_AIRPORT', right_on='airport_code')

In [6]:
grouped_df = merged_df.groupby('airport').agg({'AIRLINE': 'nunique'})
grouped_df.rename(columns={'AIRLINE': 'airlines_quantity'}, inplace=True)

In [38]:
grouped_df['num_airlines_score'] = grouped_df['airlines_quantity'].transform(lambda x: pd.cut(x, bins=100, labels=False) + 1)

In [39]:
sorted_df = grouped_df.sort_values(by='airlines_quantity', ascending=False)

In [44]:
sorted_df.head(30)

Unnamed: 0_level_0,airlines_quantity,num_airlines_score
airport,Unnamed: 1_level_1,Unnamed: 2_level_1
Los Angeles International Airport,13,100
LaGuardia Airport (Marine Air Terminal),12,92
Louis Armstrong New Orleans International Airport,12,92
Chicago O'Hare International Airport,12,92
Ronald Reagan Washington National Airport,12,92
Philadelphia International Airport,12,92
San Diego International Airport (Lindbergh Field),12,92
Phoenix Sky Harbor International Airport,12,92
Portland International Airport,12,92
McCarran International Airport,12,92


In [9]:
sorted_df.value_counts('num_airlines_score')

num_airlines_score
1     106
2      43
4      37
5      31
9      17
7      15
14     15
17     15
12     13
15     10
19     10
10      9
20      1
dtype: int64

In [10]:
sorted_df.value_counts('airlines_quantity')

airlines_quantity
1     106
2      43
3      37
4      31
6      17
5      15
9      15
11     15
8      13
10     10
12     10
7       9
13      1
dtype: int64

## Scoring for quantity of flights per airport

In [36]:
import pandas as pd

# load data
airport_cancellations = pd.read_csv('airport_cancellations.csv')

# select columns and calculate percentiles
df = airport_cancellations[['origin_airport', 'flights_out_of', 'flights_into']].copy()
df['flight_out_of_score'] = pd.qcut(df['flights_out_of'], q=100, labels=False, duplicates='drop') + 1
df['flights_into_score'] = pd.qcut(df['flights_into'], q=100, labels=False, duplicates='drop') + 1

# order by flights_out_of in descending order
df = df.sort_values('flights_out_of', ascending=False)

# rename columns and reset index
df = df.rename(columns={'origin_airport': 'airport', 'flights_out_of': 'flights_out_of_score', 'flights_into': 'flights_into_score'}).reset_index(drop=True)

print(df)

                                              airport  flights_out_of_score  \
0    Hartsfield-Jackson Atlanta International Airport                346836   
1                Chicago O'Hare International Airport                285884   
2             Dallas/Fort Worth International Airport                239551   
3                        Denver International Airport                196055   
4                   Los Angeles International Airport                194673   
..                                                ...                   ...   
317                        St. Cloud Regional Airport                    83   
318                                Dillingham Airport                    77   
319                                  Gustavus Airport                    77   
320                               King Salmon Airport                    63   
321                  Ithaca Tompkins Regional Airport                    34   

     flights_into_score  flight_out_of_score  fligh

In [37]:
df

Unnamed: 0,airport,flights_out_of_score,flights_into_score,flight_out_of_score,flights_into_score.1
0,Hartsfield-Jackson Atlanta International Airport,346836,346904,100,100
1,Chicago O'Hare International Airport,285884,285906,100,100
2,Dallas/Fort Worth International Airport,239551,239582,100,100
3,Denver International Airport,196055,196010,100,100
4,Los Angeles International Airport,194673,194696,99,99
...,...,...,...,...,...
317,St. Cloud Regional Airport,83,82,2,2
318,Dillingham Airport,77,77,1,1
319,Gustavus Airport,77,77,1,1
320,King Salmon Airport,63,63,1,1


In [35]:
import pandas as pd

# count flights by airline
airline_counts = flights.groupby('AIRLINE').size().reset_index(name='count')

# join with airlines dataframe to get airline names
airline_counts = airline_counts.merge(airlines, left_on='AIRLINE', right_on='airline_code')

# compute percentage and ntile
total_flights = len(flights)
airline_counts['percentage'] = airline_counts['count'] / total_flights * 100
airline_counts['ntile'] = pd.qcut(airline_counts[['count']].reset_index(drop=True)['count'].values, 100, labels=False, duplicates='drop') + 1

# select and order columns
airline_counts = airline_counts[['AIRLINE', 'count', 'percentage', 'ntile']]
airline_counts = airline_counts.sort_values('count', ascending=False)

print(airline_counts)

   AIRLINE    count  percentage  ntile
13      WN  1261855   21.684789    100
3       DL   875881   15.051884     93
0       AA   725984   12.475926     85
9       OO   588353   10.110758     77
4       EV   571977    9.829339     70
10      UA   515723    8.862622     62
7       MQ   294632    5.063207     54
2       B6   267048    4.589180     47
11      US   198715    3.414887     39
1       AS   172521    2.964748     31
8       NK   117379    2.017141     24
5       F9    90836    1.561003     16
6       HA    76272    1.310723      8
12      VX    61903    1.063794      1
