In [1]:
import numpy as np
import scipy as sp
import scipy.sparse

import pandas as pd

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline

def spy(A, figsize=(6, 6), markersize=0.5):
    """Visualizes a sparse matrix."""
    fig = plt.figure(figsize=figsize)
    plt.spy(A, markersize=markersize)
    plt.show()

In [3]:
from IPython.display import display, Markdown # For pretty-printing tibbles

In [4]:
def canonicalize_tibble(X):
    var_names = sorted(X.columns)
    Y = X[var_names].copy()
    Y.sort_values(by=var_names, inplace=True)
    Y.reset_index(drop=True, inplace=True)
    return Y

def tibbles_are_equivalent (A, B):
    A_canonical = canonicalize_tibble(A)
    B_canonical = canonicalize_tibble(B)
    cmp = A_canonical.eq(B_canonical)
    return cmp.all().all()

In [5]:
import requests
import os
import hashlib
import io

def on_vocareum():
    return os.path.exists('.voc')

def download(file, local_dir="", url_base=None, checksum=None):
    local_file = "{}{}".format(local_dir, file)
    if not os.path.exists(local_file):
        if url_base is None:
            url_base = "https://cse6040.gatech.edu/datasets/"
        url = "{}{}".format(url_base, file)
        print("Downloading: {} ...".format(url))
        r = requests.get(url)
        with open(local_file, 'wb') as f:
            f.write(r.content)
            
    if checksum is not None:
        with io.open(local_file, 'rb') as f:
            body = f.read()
            body_checksum = hashlib.md5(body).hexdigest()
            assert body_checksum == checksum, \
                "Downloaded file '{}' has incorrect checksum: '{}' instead of '{}'".format(local_file,
                                                                                           body_checksum,
                                                                                           checksum)
    print("'{}' is ready!".format(file))
    
if on_vocareum():
    URL_BASE = "https://cse6040.gatech.edu/datasets/us-flights/"
    DATA_PATH = "../resource/asnlib/publicdata/"
else:
    URL_BASE = "https://github.com/cse6040/labs-fa17/raw/master/lab11-markov_chains/"
    DATA_PATH = ""

datasets = {'L_AIRPORT_ID.csv': 'e9f250e3c93d625cce92d08648c4bbf0',
            'L_CITY_MARKET_ID.csv': 'f430a16a5fe4b9a849accb5d332b2bb8',
            'L_UNIQUE_CARRIERS.csv': 'bebe919e85e2cf72e7041dbf1ae5794e',
            'us-flights--2017-08.csv': 'eeb259c0cdd00ff1027261ca0a7c0332',
            'flights_atl_to_lax_soln.csv': '4591f6501411de90af72693cdbcc08bb',
            'origins_top10_soln.csv': 'de85c321c45c7bf65612754be4567086',
            'dests_soln.csv': '370f4c632623616b3bf26b6f79993fe4',
            'dests_top10_soln.csv': '4c7dd7edf48c4d62466964d6b8c14184',
            'segments_soln.csv': '516a78d2d9d768d78bfb012b77671f38',
            'segments_outdegree_soln.csv': 'b29d60151c617ebafd3a1c58541477c8'
           }

for filename, checksum in datasets.items():
    download(filename, local_dir=DATA_PATH, url_base=URL_BASE, checksum=checksum)
    
print("\n(All data appears to be ready.)")

Downloading: https://github.com/cse6040/labs-fa17/raw/master/lab11-markov_chains/flights_atl_to_lax_soln.csv ...
'flights_atl_to_lax_soln.csv' is ready!
Downloading: https://github.com/cse6040/labs-fa17/raw/master/lab11-markov_chains/L_CITY_MARKET_ID.csv ...
'L_CITY_MARKET_ID.csv' is ready!
Downloading: https://github.com/cse6040/labs-fa17/raw/master/lab11-markov_chains/origins_top10_soln.csv ...
'origins_top10_soln.csv' is ready!
Downloading: https://github.com/cse6040/labs-fa17/raw/master/lab11-markov_chains/L_UNIQUE_CARRIERS.csv ...
'L_UNIQUE_CARRIERS.csv' is ready!
Downloading: https://github.com/cse6040/labs-fa17/raw/master/lab11-markov_chains/L_AIRPORT_ID.csv ...
'L_AIRPORT_ID.csv' is ready!
Downloading: https://github.com/cse6040/labs-fa17/raw/master/lab11-markov_chains/segments_outdegree_soln.csv ...
'segments_outdegree_soln.csv' is ready!
Downloading: https://github.com/cse6040/labs-fa17/raw/master/lab11-markov_chains/segments_soln.csv ...
'segments_soln.csv' is ready!
Downloa

In [7]:
airport_codes = pd.read_csv("{}{}".format(DATA_PATH, 'L_AIRPORT_ID.csv'))

In [14]:
#airport_codes.describe()
#airport_codes.head()

In [26]:
flights = pd.read_csv('us-flights--2017-08.csv')
del flights['Unnamed: 7']
print 'Number of flight segments: {}'.format(len(flights))
flights.head()

Number of flight segments: 510451


Unnamed: 0,FL_DATE,UNIQUE_CARRIER,FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN_CITY_MARKET_ID,DEST_AIRPORT_ID,DEST_CITY_MARKET_ID
0,2017-08-01,DL,2,12478,31703,14679,33570
1,2017-08-01,DL,4,12889,32211,12478,31703
2,2017-08-01,DL,6,12892,32575,14869,34614
3,2017-08-01,DL,7,14869,34614,12892,32575
4,2017-08-01,DL,10,11292,30325,13487,31650


In [63]:
ATL = str(airport_codes.loc[airport_codes['Description'].str.contains('Hartsfield-Jackson')]['Description']) #--- pandas.Series.str.contains()

str(ATL)

'373    Atlanta, GA: Hartsfield-Jackson Atlanta Intern...\nName: Description, dtype: object'

In [30]:
# PART A) Define `ATL_ID` and `LAX_ID` to correspond to the
# codes in `airport_codes` for ATL and LAX, respectively.
###
### YOUR CODE HERE
###
airport_codes.loc["Atlanta" in airport_codes['Description']]



# Print the descriptions of the airports with your IDs:
ATL_DESC = airport_codes[airport_codes['Code'] == ATL_ID]['Description'].iloc[0]
LAX_DESC = airport_codes[airport_codes['Code'] == LAX_ID]['Description'].iloc[0]
print("{}: ATL -- {}".format(ATL_ID, ATL_DESC))
print("{}: LAX -- {}".format(LAX_ID, LAX_DESC))

KeyError: False