# Data Manipulation in Pandas

In this assignment, you will be working on the same dataframe of flights departing New York City in 2013. 

In [3]:
import pandas as pd

In [4]:
# Install the package 'nycflights13' before you can run this
from nycflights13 import flights
flights.head()

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
0,2013,1,1,517.0,515,2.0,830.0,819,11.0,UA,1545,N14228,EWR,IAH,227.0,1400,5,15,2013-01-01T10:00:00Z
1,2013,1,1,533.0,529,4.0,850.0,830,20.0,UA,1714,N24211,LGA,IAH,227.0,1416,5,29,2013-01-01T10:00:00Z
2,2013,1,1,542.0,540,2.0,923.0,850,33.0,AA,1141,N619AA,JFK,MIA,160.0,1089,5,40,2013-01-01T10:00:00Z
3,2013,1,1,544.0,545,-1.0,1004.0,1022,-18.0,B6,725,N804JB,JFK,BQN,183.0,1576,5,45,2013-01-01T10:00:00Z
4,2013,1,1,554.0,600,-6.0,812.0,837,-25.0,DL,461,N668DN,LGA,ATL,116.0,762,6,0,2013-01-01T11:00:00Z


In [3]:
flights.shape

(336776, 19)

## Data frame with columns

- year,month,day
        Date of departure    
- dep_time,arr_time
        Actual departure and arrival times (format HHMM or HMM), local tz.
- sched_dep_time,sched_arr_time
        Scheduled departure and arrival times (format HHMM or HMM), local tz.    
- dep_delay,arr_delay
        Departure and arrival delays, in minutes. Negative times represent early departures/arrivals.
- hour,minute
        Time of scheduled departure broken into hour and minutes.
- carrier
        Two letter carrier abbreviation. See airlines() to get name
- tailnum
        Plane tail number
- flight
        Flight number
- origin,dest
        Origin and destination. See airports() for additional metadata.
- air_time
        Amount of time spent in the air, in minutes
- distance
        Distance between airports, in miles
- time_hour
        Scheduled date and hour of the flight as a date. Along with origin, can be used to join flights data to weather data.

In [4]:
# use describe() to summarize all columns
#flights.describe(include='all')
flights.head()

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
0,2013,1,1,517.0,515,2.0,830.0,819,11.0,UA,1545,N14228,EWR,IAH,227.0,1400,5,15,2013-01-01T10:00:00Z
1,2013,1,1,533.0,529,4.0,850.0,830,20.0,UA,1714,N24211,LGA,IAH,227.0,1416,5,29,2013-01-01T10:00:00Z
2,2013,1,1,542.0,540,2.0,923.0,850,33.0,AA,1141,N619AA,JFK,MIA,160.0,1089,5,40,2013-01-01T10:00:00Z
3,2013,1,1,544.0,545,-1.0,1004.0,1022,-18.0,B6,725,N804JB,JFK,BQN,183.0,1576,5,45,2013-01-01T10:00:00Z
4,2013,1,1,554.0,600,-6.0,812.0,837,-25.0,DL,461,N668DN,LGA,ATL,116.0,762,6,0,2013-01-01T11:00:00Z


## Question 1. Selecting rows

From the 'flights' dataframe, find all flights that satisfy the following certain conditions:

In [5]:
# Had an arrival delay of two or more hours
flights.query("arr_delay>120")

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
119,2013,1,1,811.0,630,101.0,1047.0,830,137.0,MQ,4576,N531MQ,LGA,CLT,118.0,544,6,30,2013-01-01T11:00:00Z
151,2013,1,1,848.0,1835,853.0,1001.0,1950,851.0,MQ,3944,N942MQ,JFK,BWI,41.0,184,18,35,2013-01-01T23:00:00Z
218,2013,1,1,957.0,733,144.0,1056.0,853,123.0,UA,856,N534UA,EWR,BOS,37.0,200,7,33,2013-01-01T12:00:00Z
268,2013,1,1,1114.0,900,134.0,1447.0,1222,145.0,UA,1086,N76502,LGA,IAH,248.0,1416,9,0,2013-01-01T14:00:00Z
447,2013,1,1,1505.0,1310,115.0,1638.0,1431,127.0,EV,4497,N17984,EWR,RIC,63.0,277,13,10,2013-01-01T18:00:00Z
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
336579,2013,9,30,1823.0,1545,158.0,1934.0,1733,121.0,9E,3459,N916XJ,JFK,BNA,95.0,765,15,45,2013-09-30T19:00:00Z
336668,2013,9,30,1951.0,1649,182.0,2157.0,1903,174.0,EV,4294,N13988,EWR,SAV,95.0,708,16,49,2013-09-30T20:00:00Z
336724,2013,9,30,2053.0,1815,158.0,2310.0,2054,136.0,EV,5292,N600QX,EWR,ATL,91.0,746,18,15,2013-09-30T22:00:00Z
336757,2013,9,30,2159.0,1845,194.0,2344.0,2030,194.0,9E,3320,N906XJ,JFK,BUF,50.0,301,18,45,2013-09-30T22:00:00Z


In [6]:
# Flew to Houston (IAH or HOU)
flights[(flights.dest=='IAH')| (flights.dest=="HOU")]

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
0,2013,1,1,517.0,515,2.0,830.0,819,11.0,UA,1545,N14228,EWR,IAH,227.0,1400,5,15,2013-01-01T10:00:00Z
1,2013,1,1,533.0,529,4.0,850.0,830,20.0,UA,1714,N24211,LGA,IAH,227.0,1416,5,29,2013-01-01T10:00:00Z
32,2013,1,1,623.0,627,-4.0,933.0,932,1.0,UA,496,N459UA,LGA,IAH,229.0,1416,6,27,2013-01-01T11:00:00Z
81,2013,1,1,728.0,732,-4.0,1041.0,1038,3.0,UA,473,N488UA,LGA,IAH,238.0,1416,7,32,2013-01-01T12:00:00Z
89,2013,1,1,739.0,739,0.0,1104.0,1038,26.0,UA,1479,N37408,EWR,IAH,249.0,1400,7,39,2013-01-01T12:00:00Z
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
336524,2013,9,30,1729.0,1720,9.0,2001.0,2010,-9.0,UA,652,N455UA,EWR,IAH,173.0,1400,17,20,2013-09-30T21:00:00Z
336527,2013,9,30,1735.0,1715,20.0,2010.0,2005,5.0,WN,2067,N296WN,EWR,HOU,188.0,1411,17,15,2013-09-30T21:00:00Z
336618,2013,9,30,1859.0,1859,0.0,2134.0,2159,-25.0,UA,1128,N14731,LGA,IAH,180.0,1416,18,59,2013-09-30T22:00:00Z
336694,2013,9,30,2015.0,2015,0.0,2244.0,2307,-23.0,UA,1545,N17730,EWR,IAH,174.0,1400,20,15,2013-10-01T00:00:00Z


In [7]:
# Were operated by United (UA), American (AA), or Delta (DL)
flights[(flights.carrier=="UA")|(flights.carrier=="AA")|(flights.carrier=="DL")]

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
0,2013,1,1,517.0,515,2.0,830.0,819,11.0,UA,1545,N14228,EWR,IAH,227.0,1400,5,15,2013-01-01T10:00:00Z
1,2013,1,1,533.0,529,4.0,850.0,830,20.0,UA,1714,N24211,LGA,IAH,227.0,1416,5,29,2013-01-01T10:00:00Z
2,2013,1,1,542.0,540,2.0,923.0,850,33.0,AA,1141,N619AA,JFK,MIA,160.0,1089,5,40,2013-01-01T10:00:00Z
4,2013,1,1,554.0,600,-6.0,812.0,837,-25.0,DL,461,N668DN,LGA,ATL,116.0,762,6,0,2013-01-01T11:00:00Z
5,2013,1,1,554.0,558,-4.0,740.0,728,12.0,UA,1696,N39463,EWR,ORD,150.0,719,5,58,2013-01-01T10:00:00Z
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
336737,2013,9,30,2105.0,2106,-1.0,2329.0,2354,-25.0,UA,475,N477UA,EWR,IAH,175.0,1400,21,6,2013-10-01T01:00:00Z
336744,2013,9,30,2121.0,2100,21.0,2349.0,14,-25.0,DL,2363,N193DN,JFK,LAX,296.0,2475,21,0,2013-10-01T01:00:00Z
336751,2013,9,30,2140.0,2140,0.0,10.0,40,-30.0,AA,185,N335AA,JFK,LAX,298.0,2475,21,40,2013-10-01T01:00:00Z
336755,2013,9,30,2149.0,2156,-7.0,2245.0,2308,-23.0,UA,523,N813UA,EWR,BOS,37.0,200,21,56,2013-10-01T01:00:00Z


In [8]:
# Departed in July, August, and September
flights[flights.month.isin([7,8,9])]

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
250450,2013,7,1,1.0,2029,212.0,236.0,2359,157.0,B6,915,N653JB,JFK,SFO,315.0,2586,20,29,2013-07-02T00:00:00Z
250451,2013,7,1,2.0,2359,3.0,344.0,344,0.0,B6,1503,N805JB,JFK,SJU,200.0,1598,23,59,2013-07-02T03:00:00Z
250452,2013,7,1,29.0,2245,104.0,151.0,1,110.0,B6,234,N348JB,JFK,BTV,66.0,266,22,45,2013-07-02T02:00:00Z
250453,2013,7,1,43.0,2130,193.0,322.0,14,188.0,B6,1371,N794JB,LGA,FLL,143.0,1076,21,30,2013-07-02T01:00:00Z
250454,2013,7,1,44.0,2150,174.0,300.0,100,120.0,AA,185,N324AA,JFK,LAX,297.0,2475,21,50,2013-07-02T01:00:00Z
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
336771,2013,9,30,,1455,,,1634,,9E,3393,,JFK,DCA,,213,14,55,2013-09-30T18:00:00Z
336772,2013,9,30,,2200,,,2312,,9E,3525,,LGA,SYR,,198,22,0,2013-10-01T02:00:00Z
336773,2013,9,30,,1210,,,1330,,MQ,3461,N535MQ,LGA,BNA,,764,12,10,2013-09-30T16:00:00Z
336774,2013,9,30,,1159,,,1344,,MQ,3572,N511MQ,LGA,CLE,,419,11,59,2013-09-30T15:00:00Z


In [9]:
# Arrived more than two hours late, but didn’t leave late
flights[(flights.arr_delay>120) & (flights.dep_delay<0)]

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
22911,2013,1,27,1419.0,1420,-1.0,1754.0,1550,124.0,MQ,3728,N1EAMQ,EWR,ORD,135.0,719,14,20,2013-01-27T19:00:00Z
33019,2013,10,7,1357.0,1359,-2.0,1858.0,1654,124.0,AA,1151,N3CMAA,LGA,DFW,192.0,1389,13,59,2013-10-07T17:00:00Z
41075,2013,10,16,657.0,700,-3.0,1258.0,1056,122.0,B6,3,N703JB,JFK,SJU,225.0,1598,7,0,2013-10-16T11:00:00Z
55985,2013,11,1,658.0,700,-2.0,1329.0,1015,194.0,VX,399,N629VA,JFK,LAX,336.0,2475,7,0,2013-11-01T11:00:00Z
152766,2013,3,18,1844.0,1847,-3.0,39.0,2219,140.0,UA,389,N560UA,JFK,SFO,386.0,2586,18,47,2013-03-18T22:00:00Z
180893,2013,4,17,1635.0,1640,-5.0,2049.0,1845,124.0,MQ,4540,N721MQ,LGA,DTW,130.0,502,16,40,2013-04-17T20:00:00Z
181270,2013,4,18,558.0,600,-2.0,1149.0,850,179.0,AA,707,N3EXAA,LGA,DFW,234.0,1389,6,0,2013-04-18T10:00:00Z
181327,2013,4,18,655.0,700,-5.0,1213.0,950,143.0,AA,2083,N565AA,EWR,DFW,230.0,1372,7,0,2013-04-18T11:00:00Z
213693,2013,5,22,1827.0,1830,-3.0,2217.0,2010,127.0,MQ,4674,N518MQ,LGA,CLE,90.0,419,18,30,2013-05-22T22:00:00Z
226434,2013,6,5,1604.0,1615,-11.0,2041.0,1840,121.0,MQ,4657,N510MQ,LGA,ATL,158.0,762,16,15,2013-06-05T20:00:00Z


In [5]:
# Were delayed by at least an hour, but made up over 30 minutes in flight
flights[(flights.dep_delay>60)& (flights.arr_delay<(flights.dep_delay-30))]

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
815,2013,1,1,2205.0,1720,285.0,46.0,2040,246.0,AA,1999,N5DNAA,EWR,MIA,146.0,1085,17,20,2013-01-01T22:00:00Z
832,2013,1,1,2326.0,2130,116.0,131.0,18,73.0,B6,199,N594JB,JFK,LAS,290.0,2248,21,30,2013-01-02T02:00:00Z
2286,2013,1,3,1503.0,1221,162.0,1803.0,1555,128.0,UA,551,N835UA,EWR,SFO,320.0,2565,12,21,2013-01-03T17:00:00Z
2508,2013,1,3,1839.0,1700,99.0,2056.0,1950,66.0,AA,575,N631AA,JFK,EGE,239.0,1747,17,0,2013-01-03T22:00:00Z
2522,2013,1,3,1850.0,1745,65.0,2148.0,2120,28.0,AA,177,N332AA,JFK,SFO,314.0,2586,17,45,2013-01-03T22:00:00Z
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
336572,2013,9,30,1818.0,1715,63.0,2001.0,1940,21.0,9E,3310,N934XJ,JFK,MCI,142.0,1113,17,15,2013-09-30T21:00:00Z
336579,2013,9,30,1823.0,1545,158.0,1934.0,1733,121.0,9E,3459,N916XJ,JFK,BNA,95.0,765,15,45,2013-09-30T19:00:00Z
336674,2013,9,30,1956.0,1825,91.0,2208.0,2121,47.0,DL,1576,N3772H,JFK,SAN,292.0,2446,18,25,2013-09-30T22:00:00Z
336704,2013,9,30,2028.0,1910,78.0,2255.0,2215,40.0,AA,21,N338AA,JFK,LAX,294.0,2475,19,10,2013-09-30T23:00:00Z


In [10]:
# Departed between midnight and 6am (inclusive)
flights[(flights.dep_time > 0) & (flights.dep_time < 601)] 

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
0,2013,1,1,517.0,515,2.0,830.0,819,11.0,UA,1545,N14228,EWR,IAH,227.0,1400,5,15,2013-01-01T10:00:00Z
1,2013,1,1,533.0,529,4.0,850.0,830,20.0,UA,1714,N24211,LGA,IAH,227.0,1416,5,29,2013-01-01T10:00:00Z
2,2013,1,1,542.0,540,2.0,923.0,850,33.0,AA,1141,N619AA,JFK,MIA,160.0,1089,5,40,2013-01-01T10:00:00Z
3,2013,1,1,544.0,545,-1.0,1004.0,1022,-18.0,B6,725,N804JB,JFK,BQN,183.0,1576,5,45,2013-01-01T10:00:00Z
4,2013,1,1,554.0,600,-6.0,812.0,837,-25.0,DL,461,N668DN,LGA,ATL,116.0,762,6,0,2013-01-01T11:00:00Z
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
335802,2013,9,30,557.0,600,-3.0,852.0,923,-31.0,UA,303,N510UA,JFK,SFO,326.0,2586,6,0,2013-09-30T10:00:00Z
335803,2013,9,30,558.0,600,-2.0,815.0,829,-14.0,EV,4137,N16981,EWR,ATL,107.0,746,6,0,2013-09-30T10:00:00Z
335804,2013,9,30,558.0,600,-2.0,742.0,749,-7.0,DL,731,N337NB,LGA,DTW,83.0,502,6,0,2013-09-30T10:00:00Z
335805,2013,9,30,559.0,600,-1.0,,715,,WN,464,N411WN,EWR,MDW,,711,6,0,2013-09-30T10:00:00Z


In [11]:
# How many flights have a missing dep_time? 
flights[flights.dep_time.isnull()]

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
838,2013,1,1,,1630,,,1815,,EV,4308,N18120,EWR,RDU,,416,16,30,2013-01-01T21:00:00Z
839,2013,1,1,,1935,,,2240,,AA,791,N3EHAA,LGA,DFW,,1389,19,35,2013-01-02T00:00:00Z
840,2013,1,1,,1500,,,1825,,AA,1925,N3EVAA,LGA,MIA,,1096,15,0,2013-01-01T20:00:00Z
841,2013,1,1,,600,,,901,,B6,125,N618JB,JFK,FLL,,1069,6,0,2013-01-01T11:00:00Z
1777,2013,1,2,,1540,,,1747,,EV,4352,N10575,EWR,CVG,,569,15,40,2013-01-02T20:00:00Z
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
336771,2013,9,30,,1455,,,1634,,9E,3393,,JFK,DCA,,213,14,55,2013-09-30T18:00:00Z
336772,2013,9,30,,2200,,,2312,,9E,3525,,LGA,SYR,,198,22,0,2013-10-01T02:00:00Z
336773,2013,9,30,,1210,,,1330,,MQ,3461,N535MQ,LGA,BNA,,764,12,10,2013-09-30T16:00:00Z
336774,2013,9,30,,1159,,,1344,,MQ,3572,N511MQ,LGA,CLE,,419,11,59,2013-09-30T15:00:00Z


## Question 2. Sorting

In [12]:
# Sort flights to find the least delayed flights. Find the flights that left earliest.
flights.sort_values('dep_delay', ascending=True)

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
89673,2013,12,7,2040.0,2123,-43.0,40.0,2352,48.0,B6,97,N592JB,JFK,DEN,265.0,1626,21,23,2013-12-08T02:00:00Z
113633,2013,2,3,2022.0,2055,-33.0,2240.0,2338,-58.0,DL,1715,N612DL,LGA,MSY,162.0,1183,20,55,2013-02-04T01:00:00Z
64501,2013,11,10,1408.0,1440,-32.0,1549.0,1559,-10.0,EV,5713,N825AS,LGA,IAD,52.0,229,14,40,2013-11-10T19:00:00Z
9619,2013,1,11,1900.0,1930,-30.0,2233.0,2243,-10.0,DL,1435,N934DL,LGA,TPA,139.0,1010,19,30,2013-01-12T00:00:00Z
24915,2013,1,29,1703.0,1730,-27.0,1947.0,1957,-10.0,F9,837,N208FR,LGA,DEN,250.0,1620,17,30,2013-01-29T22:00:00Z
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
336771,2013,9,30,,1455,,,1634,,9E,3393,,JFK,DCA,,213,14,55,2013-09-30T18:00:00Z
336772,2013,9,30,,2200,,,2312,,9E,3525,,LGA,SYR,,198,22,0,2013-10-01T02:00:00Z
336773,2013,9,30,,1210,,,1330,,MQ,3461,N535MQ,LGA,BNA,,764,12,10,2013-09-30T16:00:00Z
336774,2013,9,30,,1159,,,1344,,MQ,3572,N511MQ,LGA,CLE,,419,11,59,2013-09-30T15:00:00Z


In [13]:
# Which flights travelled the farthest? Which travelled the shortest?
flights.sort_values('distance',ascending=False)

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
50676,2013,10,26,1004.0,1000,4.0,1435.0,1450,-15.0,HA,51,N386HA,JFK,HNL,608.0,4983,10,0,2013-10-26T14:00:00Z
108078,2013,12,28,933.0,930,3.0,1520.0,1535,-15.0,HA,51,N384HA,JFK,HNL,633.0,4983,9,30,2013-12-28T14:00:00Z
100067,2013,12,19,924.0,930,-6.0,1450.0,1535,-45.0,HA,51,N386HA,JFK,HNL,609.0,4983,9,30,2013-12-19T14:00:00Z
179566,2013,4,16,953.0,1000,-7.0,1443.0,1510,-27.0,HA,51,N381HA,JFK,HNL,631.0,4983,10,0,2013-04-16T14:00:00Z
30229,2013,10,4,954.0,1000,-6.0,1438.0,1450,-12.0,HA,51,N380HA,JFK,HNL,618.0,4983,10,0,2013-10-04T14:00:00Z
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6972,2013,1,8,2127.0,2130,-3.0,2304.0,2225,39.0,EV,4619,N11194,EWR,PHL,30.0,80,21,30,2013-01-09T02:00:00Z
26874,2013,1,31,2129.0,2129,0.0,2234.0,2224,10.0,EV,3271,N13553,EWR,PHL,28.0,80,21,29,2013-02-01T02:00:00Z
164109,2013,3,30,1942.0,1950,-8.0,2026.0,2044,-18.0,EV,4457,N12569,EWR,PHL,24.0,80,19,50,2013-03-30T23:00:00Z
22334,2013,1,26,1621.0,1617,4.0,1705.0,1722,-17.0,EV,4616,N14153,EWR,PHL,27.0,80,16,17,2013-01-26T21:00:00Z


## Question 3. Selecting Columns

Use at least three ways to select dep_time, dep_delay, arr_time, and arr_delay from flights.

In [14]:
# Method 1
flights[['dep_time','dep_delay','arr_time','arr_delay']]

Unnamed: 0,dep_time,dep_delay,arr_time,arr_delay
0,517.0,2.0,830.0,11.0
1,533.0,4.0,850.0,20.0
2,542.0,2.0,923.0,33.0
3,544.0,-1.0,1004.0,-18.0
4,554.0,-6.0,812.0,-25.0
...,...,...,...,...
336771,,,,
336772,,,,
336773,,,,
336774,,,,


In [15]:
# Method 2
flights.iloc[:,[3,4,5,6]]

Unnamed: 0,dep_time,sched_dep_time,dep_delay,arr_time
0,517.0,515,2.0,830.0
1,533.0,529,4.0,850.0
2,542.0,540,2.0,923.0
3,544.0,545,-1.0,1004.0
4,554.0,600,-6.0,812.0
...,...,...,...,...
336771,,1455,,
336772,,2200,,
336773,,1210,,
336774,,1159,,


In [16]:
# Method 3
flights.loc[:,'dep_time':'arr_time']

Unnamed: 0,dep_time,sched_dep_time,dep_delay,arr_time
0,517.0,515,2.0,830.0
1,533.0,529,4.0,850.0
2,542.0,540,2.0,923.0
3,544.0,545,-1.0,1004.0
4,554.0,600,-6.0,812.0
...,...,...,...,...
336771,,1455,,
336772,,2200,,
336773,,1210,,
336774,,1159,,


## Question 4. Adding new columns

Currently dep_time and sched_dep_time are convenient to look at, but hard to compute with because they’re not really continuous numbers. 

For example, 759 means 7:59 and 801 means 8:01. Their difference is not 42 but 2 minutes. 

In [26]:
# Convert them to a more convenient representation of number of minutes since midnight (0).
def time_fix(time):
    hour=time // 100
    mins = time % 100
    return (60*hour + mins)


flights.sched_dep_time=time_fix(flights.sched_dep_time)
flights.dep_time=time_fix(flights.dep_time)
flights.arr_time=time_fix(flights.arr_time)
flights.sched_arr_time=time_fix(flights.sched_arr_time)


In [27]:
# Create a new column of arr_time - dep_time. 
flights['new_air_time']=flights.arr_time - flights.dep_time

# Compare this column with air_time. 
flights[['air_time','new_air_time']]

Unnamed: 0,air_time,new_air_time
0,227.0,193.0
1,227.0,197.0
2,160.0,221.0
3,183.0,260.0
4,116.0,138.0
...,...,...
336771,,
336772,,
336773,,
336774,,


In [30]:
# Compare dep_time, sched_dep_time, and dep_delay. How would you expect those three numbers to be related?
# Try creating a column to calculate dep_delay from dep_time and sched_dep_time (and/or other columns if necessary). 

# Test your results. 
flights['new_dep_delay']=flights.dep_time-flights.sched_dep_time

flights[['dep_time','sched_dep_time','dep_delay','new_dep_delay']]

Unnamed: 0,dep_time,sched_dep_time,dep_delay,new_dep_delay
0,317.0,315,2.0,2.0
1,333.0,329,4.0,4.0
2,342.0,340,2.0,2.0
3,344.0,345,-1.0,-1.0
4,354.0,360,-6.0,-6.0
...,...,...,...,...
336771,,895,,
336772,,1320,,
336773,,730,,
336774,,719,,


## Question 5. Mixing things together

The following questions may require multiple operations above. 

In [39]:
# Find the 20 most delayed flights. 
# Display the following: year,month,day,carrier,flight,dep_delay,arr_delay,carrier
# How do you want to handle ties? 
display=flights.sort_values('dep_delay',ascending=False)

display=display.iloc[0:19,:]
display[['year','month','day','carrier','flight','dep_delay','arr_delay','carrier']]

Unnamed: 0,year,month,day,carrier,flight,dep_delay,arr_delay,carrier.1
7072,2013,1,9,HA,51,1301.0,1272.0,HA
235778,2013,6,15,MQ,3535,1137.0,1127.0,MQ
8239,2013,1,10,MQ,3695,1126.0,1109.0,MQ
327043,2013,9,20,AA,177,1014.0,1007.0,AA
270376,2013,7,22,MQ,3075,1005.0,989.0,MQ
173992,2013,4,10,DL,2391,960.0,931.0,DL
151974,2013,3,17,DL,2119,911.0,915.0,DL
247040,2013,6,27,DL,2007,899.0,850.0,DL
270987,2013,7,22,DL,2047,898.0,895.0,DL
87238,2013,12,5,AA,172,896.0,878.0,AA


In [48]:
# Sort all AA flights to find the top 10 fastest (highest speed) flights.
# Display the following: year,month,day,carrier,flight,orig,dest,distance,air_time,speed (miles per hour)
flights['speed']=flights.distance/(flights.air_time/60)
data=flights[(flights.carrier=='AA')]


data=data.sort_values('speed',ascending=False)
data=data.iloc[0:10,:]
data


Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,...,origin,dest,air_time,distance,hour,minute,time_hour,new_air_time,new_dep_delay,speed
70704,2013,11,17,479.0,480,-1.0,732.0,775,-43.0,AA,...,JFK,STT,175.0,1623,8,0,2013-11-17T13:00:00Z,253.0,-1.0,556.457143
87723,2013,12,5,1138.0,1140,-2.0,1393.0,1439,-46.0,AA,...,JFK,SJU,173.0,1598,19,0,2013-12-06T00:00:00Z,255.0,-2.0,554.219653
119700,2013,2,10,1198.0,1135,63.0,31.0,1435,36.0,AA,...,JFK,SJU,173.0,1598,18,55,2013-02-10T23:00:00Z,-1167.0,63.0,554.219653
129686,2013,2,21,1156.0,1135,21.0,1440.0,1435,5.0,AA,...,JFK,SJU,174.0,1598,18,55,2013-02-21T23:00:00Z,284.0,21.0,551.034483
87575,2013,12,5,988.0,990,-2.0,1245.0,1290,-45.0,AA,...,JFK,SJU,175.0,1598,16,30,2013-12-05T21:00:00Z,257.0,-2.0,547.885714
70721,2013,11,17,495.0,500,-5.0,754.0,810,-56.0,AA,...,JFK,SJU,176.0,1598,8,20,2013-11-17T13:00:00Z,259.0,-5.0,544.772727
144408,2013,3,9,1142.0,1135,7.0,1424.0,1435,-11.0,AA,...,JFK,SJU,176.0,1598,18,55,2013-03-09T23:00:00Z,282.0,7.0,544.772727
24894,2013,1,29,1009.0,950,59.0,1264.0,1250,14.0,AA,...,JFK,SJU,177.0,1598,15,50,2013-01-29T20:00:00Z,255.0,59.0,541.694915
119894,2013,2,11,393.0,390,3.0,683.0,700,-17.0,AA,...,JFK,SJU,177.0,1598,6,30,2013-02-11T11:00:00Z,290.0,3.0,541.694915
70335,2013,11,16,923.0,910,13.0,1194.0,1220,-26.0,AA,...,JFK,SJU,177.0,1598,15,10,2013-11-16T20:00:00Z,271.0,13.0,541.694915


In [69]:
# Find all flights that satisfy the following:
## - From John F. Kennedy Airpot (JFK) or Newark Aiport (EWR) to Seattle-Tacoma Airport (SEA) 
## - Carrier is UA, AA, or DL. 
## - Dates from 4/1/2013 (inclusive) to 4/3/2013 (inclusive)
## - Scheduled arrival time is before noon. 
# - Display the following: year,month,day,carrier,flight,origin,dest,sched_dep_time,sched_arr_time
# - Sort by year, month, day, sched_arr_time

data=flights[((flights.origin.isin(['JFK','EWR']))&(flights.dest=='SEA')&
              (flights.month==4)&(flights.day<4)&
              (flights.sched_arr_time<720)&(flights.carrier.isin(['UA','AA','DL'])))]
data=data.filter(['year','month','day','carrier','flight','origin','dest','sched_dep_time','sched_arr_time'])
data=data.sort_values(by=['year', 'month', 'day', 'sched_arr_time'])
data

Unnamed: 0,year,month,day,carrier,flight,origin,dest,sched_dep_time,sched_arr_time
165210,2013,4,1,DL,183,JFK,SEA,465,660
166180,2013,4,2,DL,183,JFK,SEA,465,660
167168,2013,4,3,DL,183,JFK,SEA,465,660
