In [1]:
import urllib.request

In [2]:
# Despite the name, this holds the source URL of the CSV to download (not a local file name):
file_name = "https://s3.amazonaws.com/h2o-airlines-unpacked/year2012.csv"

In [3]:
# Fetch the CSV from the remote URL onto the local filesystem
# (approach from https://docs.databricks.com/_static/notebooks/zip-files-python.html):
urllib.request.urlretrieve(url=file_name, filename='/tmp/df.csv')

# Move the downloaded file from the local filesystem (file:/) into DBFS (dbfs:/),
# following the same reference: https://docs.databricks.com/_static/notebooks/zip-files-python.html
dbutils.fs.mv("file:/tmp/df.csv", "dbfs:/data/df.csv")

In [4]:
# 'spark' is not created in this notebook; it is the existing SparkSession described at
# https://spark.apache.org/docs/2.2.0/sql-programming-guide.html#datasets-and-dataframes
# Read the CSV from DBFS, treating the first row as the header and inferring column types:
df_spark = (
    spark.read
    .option('header', 'true')
    .option('inferSchema', 'true')
    .csv('dbfs:/data/df.csv')
)

In [5]:
# Preview the loaded DataFrame as a table; display() is not imported here, so it is
# presumably the helper supplied by the notebook environment:
display(df_spark)

Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed,IsDepDelayed
2012,1,1,7,855.0,900,1142.0,1225,AA,1,N325AA,347.0,385,330.0,-43.0,-5.0,JFK,LAX,2475,4.0,13.0,0,,0,0,0,0,0,0,NO,NO
2012,1,2,1,921.0,900,1210.0,1225,AA,1,N319AA,349.0,385,325.0,-15.0,21.0,JFK,LAX,2475,11.0,13.0,0,,0,0,0,0,0,0,NO,YES
2012,1,3,2,931.0,900,1224.0,1225,AA,1,N323AA,353.0,385,319.0,-1.0,31.0,JFK,LAX,2475,22.0,12.0,0,,0,0,0,0,0,0,NO,YES
2012,1,4,3,904.0,900,1151.0,1225,AA,1,N320AA,347.0,385,309.0,-34.0,4.0,JFK,LAX,2475,20.0,18.0,0,,0,0,0,0,0,0,NO,YES
2012,1,5,4,858.0,900,1142.0,1225,AA,1,N338AA,344.0,385,306.0,-43.0,-2.0,JFK,LAX,2475,22.0,16.0,0,,0,0,0,0,0,0,NO,NO
2012,1,6,5,911.0,900,1151.0,1225,AA,1,N319AA,340.0,385,321.0,-34.0,11.0,JFK,LAX,2475,5.0,14.0,0,,0,0,0,0,0,0,NO,YES
2012,1,7,6,902.0,900,1203.0,1225,AA,1,N321AA,361.0,385,337.0,-22.0,2.0,JFK,LAX,2475,4.0,20.0,0,,0,0,0,0,0,0,NO,YES
2012,1,8,7,855.0,900,1129.0,1225,AA,1,N322AA,334.0,385,318.0,-56.0,-5.0,JFK,LAX,2475,4.0,12.0,0,,0,0,0,0,0,0,NO,NO
2012,1,9,1,858.0,900,1127.0,1225,AA,1,N335AA,329.0,385,307.0,-58.0,-2.0,JFK,LAX,2475,9.0,13.0,0,,0,0,0,0,0,0,NO,NO
2012,1,10,2,852.0,900,1134.0,1225,AA,1,N339AA,342.0,385,325.0,-51.0,-8.0,JFK,LAX,2475,4.0,13.0,0,,0,0,0,0,0,0,NO,NO


In [6]:
# Register the DataFrame as a temporary view named "df_spark" so that SQL queries
# can refer to it by that name (an existing view with the same name is replaced):
df_spark.createOrReplaceTempView("df_spark")

In [7]:
# Count the rows per carrier (column N), sorted by carrier code.
# NOTE: the query references `uniquecarrier` in lowercase while the CSV header spells it
# `UniqueCarrier`; this presumably relies on Spark SQL's default case-insensitive
# column resolution -- confirm if spark.sql.caseSensitive is ever enabled.
df_carrier_counts = spark.sql("""SELECT uniquecarrier, count(*) as N
                                 FROM df_spark
                                 GROUP BY uniquecarrier
                                 ORDER BY uniquecarrier""")
# Print the result as a plain-text table (show() with no arguments prints only the
# first rows of the result, per the PySpark documentation):
df_carrier_counts.show()