In [1]:
# Do all imports and installs here
import pandas as pd
import os
from pyspark.sql import SparkSession
from pyspark.sql import types as T
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType
from datetime import datetime, timedelta
import json
from pyspark.sql.functions import desc, monotonically_increasing_id, udf, to_date, from_unixtime, trim, col
from custom_udf import *
from etl import *
# Create (or reuse) the Spark session. The spark.jars.packages setting makes
# Spark resolve the hadoop-aws connector when the session starts.
spark = (
    SparkSession.builder
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0")
    .getOrCreate()
)

# Input and output locations passed to every process_* ETL call in this notebook.
input_data = "./data"
output_data = "./data/processed_data/"

:: loading settings :: url = jar:file:/Users/yugesh/opt/anaconda3/envs/airflow/lib/python3.8/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/yugesh/.ivy2/cache
The jars for the packages stored in: /Users/yugesh/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-e52db237-8006-43f0-9682-dc507e05f2d2;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;2.7.0 in central
	found org.apache.hadoop#hadoop-common;2.7.0 in central
	found org.apache.hadoop#hadoop-annotations;2.7.0 in central
	found com.google.guava#guava;11.0.2 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found commons-cli#commons-cli;1.2 in central
	found org.apache.commons#commons-math3;3.1.1 in central
	found xmlenc#xmlenc;0.52 in central
	found commons-httpclient#commons-httpclient;3.1 in central
	found commons-logging#commons-logging;1.1.3 in central
	found commons-codec#commons-codec;1.4 in central
	found commons-io#commons-io;2.4 in central
	found commons-net#commons-net;3.1 in central
	found commons-collections#commons-colle

## Airports data ETL

In [2]:
# Run the airports ETL step and keep its result for the inspection cells below.
airport_df = process_airports_data(
    spark=spark,
    input_data=input_data,
    output_data=output_data,
)



In [3]:
# Inspect the column names, types and nullability of the airports frame.
airport_df.printSchema()

root
 |-- airport_id: long (nullable = false)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- type: string (nullable = true)



In [4]:
# Preview the first ten airport rows.
airport_df.show(n=10)



+----------+--------------------+------------+-----------+-------------+
|airport_id|                name|        city|      state|         type|
+----------+--------------------+------------+-----------+-------------+
|         0|Perrotti Skyranch...|     Berwick|      MAINE|small_airport|
|         1|Pitcock Rosillos ...|    Marathon|      TEXAS|small_airport|
|         3|Lewis Air Service...|      Leland|MISSISSIPPI|small_airport|
|         5|Copper Basin Airport|      Mackay|      IDAHO|small_airport|
|         6|   Applegate Airport|  Queen City|   MISSOURI|small_airport|
|         7|    Propwash Airport|      Justin|      TEXAS|small_airport|
|        10|Double Tree Farm ...|  Marysville| CALIFORNIA|small_airport|
|        11|      Morrison Field|Bristolville|       OHIO|small_airport|
|        12|   Taylor's Air Park|      Joshua|      TEXAS|small_airport|
|        16|Wiley's Seaplane ...| Lake Oswego|     OREGON|seaplane_base|
+----------+--------------------+------------+-----



In [5]:
# Eyeball the values in the `state` column (show() prints the first 20 rows by default).
airport_df.select("state").show()



+------------+
|       state|
+------------+
|       MAINE|
|       TEXAS|
| MISSISSIPPI|
|       IDAHO|
|    MISSOURI|
|       TEXAS|
|  CALIFORNIA|
|        OHIO|
|       TEXAS|
|      OREGON|
|       TEXAS|
|PENNSYLVANIA|
|      OREGON|
|       TEXAS|
|      ALASKA|
|       TEXAS|
|  CALIFORNIA|
|     FLORIDA|
| N. CAROLINA|
|       TEXAS|
+------------+
only showing top 20 rows



Traceback (most recent call last):
  File "/Users/yugesh/opt/anaconda3/envs/airflow/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/daemon.py", line 186, in manager
  File "/Users/yugesh/opt/anaconda3/envs/airflow/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/daemon.py", line 74, in worker
  File "/Users/yugesh/opt/anaconda3/envs/airflow/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 663, in main
    if read_int(infile) == SpecialLengths.END_OF_STREAM:
  File "/Users/yugesh/opt/anaconda3/envs/airflow/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 564, in read_int
    raise EOFError
EOFError


In [6]:
# List the distinct airport types present in the frame.
airport_df.select("type").distinct().show()



+--------------+
|          type|
+--------------+
| seaplane_base|
|medium_airport|
| small_airport|
| large_airport|
+--------------+





In [7]:
# Total number of airport rows; baseline for the duplicate check that follows.
airport_df.count()



15148

In [8]:
# Duplicate check: drop fully identical rows and count what is left. If this
# number equals airport_df.count(), the frame contains no exact duplicate rows.
test = airport_df.dropDuplicates()
test.count()



15148

In [None]:
# Strip whitespace from the text columns of the airports frame, leaving nulls
# untouched.
#
# The earlier version wrote `remove_whitespace_udf(c) if c.isNotNull else c`.
# `isNotNull` without parentheses is a bound method, which Python always treats
# as truthy, so the `else` branch was dead code and the UDF was applied to every
# row, nulls included. A per-row null check has to be a Spark column expression,
# so it is expressed with F.when(...).otherwise(...) instead of a Python ternary.
#
# NOTE(review): assumes remove_whitespace_udf returns a string, matching the
# type of these columns — confirm against custom_udf.
text_columns = ["name", "city", "state", "type"]
for column_name in text_columns:
    airport_df = airport_df.withColumn(
        column_name,
        F.when(
            F.col(column_name).isNotNull(),
            remove_whitespace_udf(F.col(column_name)),
        ).otherwise(F.col(column_name)),
    )

In [None]:
# Keep only the rows whose airport type is one of the categories listed here.
used_airports = [
    "seaplane_base",
    "medium_airport",
    "small_airport",
    "large_airport",
]
test = airport_df.filter(col("type").isin(used_airports))

In [None]:
# Confirm which airport types remain after the filter.
test.select("type").distinct().show()

In [None]:
# Exclude unwanted airport types.
#
# The original line ended in a dangling `!=` (a SyntaxError) and joined the two
# comparisons with Python's `or`, which Spark Columns do not support (it raises
# ValueError; Columns must be combined with `&` / `|` or tested with isin()).
# TODO: the second type to exclude was never filled in — add it to this list.
excluded_types = ["balloonport"]
test = test.filter(~col("type").isin(excluded_types))

In [None]:
# Preview the filtered airports (show() prints the first 20 rows by default).
test.show()

In [None]:
# `df` is not assigned until the immigration section further down, so on a fresh
# top-to-bottom run this cell raised NameError. The filtered airports frame
# `test` is the one carrying a `type` column.
# NOTE(review): confirm `test` is the frame this check was meant to inspect.
test.select("type").distinct().show()

## Immigration data ETL

In [None]:

# Run the immigration ETL step. Note that the name `df` is reassigned by the
# demographics section later in this notebook.
df = process_immigration_data(
    spark=spark, input_data=input_data, output_data=output_data
)

In [None]:
# Inspect the column names and types of the frame currently held in `df`.
df.printSchema()

In [None]:
# Preview the first ten rows of the frame currently held in `df`.
df.show(n=10)

In [None]:
# df.write.mode("overwrite").parquet("./data/processed_data")

## Demographics data ETL

In [None]:
# Run the city-demographics ETL step; this rebinds `df` to its result.
df = process_cities_demographics(
    spark=spark,
    input_data=input_data,
    output_data=output_data,
)

In [None]:
# Inspect the column names and types of the frame currently held in `df`.
df.printSchema()

In [None]:
# df.write.mode("overwrite").parquet("./data/processed_data/demographic")