In [1]:
import datetime as dt
import pandas as pd
pd.set_option("display.max_columns", None)
from pyspark.sql import SparkSession, SQLContext, GroupedData
from pyspark.sql.functions import *
#from pyspark.sql.functions import isnan, when, count, col

import os
# Pin this kernel to the Java 8 / Spark 2.4.3 installation. These variables are
# assigned before the SparkSession is built further down in this cell.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    # PATH is replaced outright rather than extended. "/opt/conda/bin" is listed
    # twice; the repeat is redundant but harmless.
    "PATH": ":".join((
        "/opt/conda/bin",
        "/opt/spark-2.4.3-bin-hadoop2.7/bin",
        "/opt/conda/bin",
        "/usr/local/sbin",
        "/usr/local/bin",
        "/usr/sbin",
        "/usr/bin",
        "/sbin",
        "/bin",
        "/usr/lib/jvm/java-8-openjdk-amd64/bin",
    )),
    "SPARK_HOME": "/opt/spark-2.4.3-bin-hadoop2.7",
    # HADOOP_HOME points at the same directory as SPARK_HOME.
    "HADOOP_HOME": "/opt/spark-2.4.3-bin-hadoop2.7",
})

# Build the SparkSession used by the cells below (or reuse one that is already
# running), requesting the hadoop-aws 2.7.0 package through spark.jars.packages.
spark = (
    SparkSession.builder
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0")
    .getOrCreate()
)

In [2]:
# Registry of the model's tables: table name -> {'df': <DataFrame, filled in once
# the parquet file has been read>, 'file': <path of the parquet file>}.
dict_all_df = {
    table_name: {'df': None, 'file': parquet_path}
    for table_name, parquet_path in (
        ('inmigration',  './model/facts_inmigration.parquet'),
        ('temperature',  './model/temperature.parquet'),
        ('modes',        './model/mode.parquet'),
        ('countries',    './model/countrie.parquet'),
        ('states',       './model/state.parquet'),
        ('visa',         './model/visa.parquet'),
        ('airports',     './model/airport.parquet'),
        ('airport_data', './model/airport_data.parquet'),
        ('cities',       './model/city.parquet'),
    )
}

In [3]:
# Read each registered parquet file and store the resulting DataFrame in the
# 'df' slot of its dict_all_df entry.
for table_name, table_entry in dict_all_df.items():
    print(f'Reading {table_name} Table')
    table_entry['df'] = spark.read.parquet(table_entry['file'])

Reading inmigration Table
Reading temperature Table
Reading modes Table
Reading countries Table
Reading states Table
Reading visa Table
Reading airports Table
Reading airport_data Table
Reading cities Table


### Example QUERY 1: 
 - Select all immigration records (`inmigration` fact table) for a specific arrival day (2016-04-01)

In [4]:
# Register the fact table as a SQL temporary view so it can be queried by name.
df = dict_all_df['inmigration']['df']
df.createOrReplaceTempView("inmigration")

# Baseline: number of rows in the whole fact table.
sql_query_0 = """
    SELECT *
    FROM inmigration
    """
result_0 = spark.sql(sql_query_0)
print(f'FACT table has {result_0.count()} rows')

# Rows whose arrival date is 2016-04-01.
# The table alias is `imm` rather than `in`: IN is a SQL keyword that Spark treats
# as reserved when ANSI mode (spark.sql.ansi.enabled) is on, so `... AS in` does
# not parse under every Spark configuration.
sql_query_1 = """
    SELECT *
    FROM inmigration AS imm
    WHERE imm.arrival_year = 2016 AND imm.arrival_month = 4 AND imm.arrival_day = 1"""
result_1 = spark.sql(sql_query_1)
print(f'FACT table has {result_1.count()} rows from 2016-04-01')

FACT table has 1141596 rows
FACT table has 42143 rows from 2016-04-01


### Example QUERY 2:
 - Select all immigration records (`inmigration` fact table) for a specific airport (`BOS`)

In [5]:
# Register the fact table and the airports dimension as SQL temporary views.
# createOrReplaceTempView overwrites an existing view of the same name, so this
# cell can be re-run safely.
df_0 = dict_all_df['inmigration']['df']
df_0.createOrReplaceTempView("inmigration")
df_1 = dict_all_df['airports']['df']
df_1.createOrReplaceTempView("airports")

# Fact rows joined to the airport whose id is 'BOS'.
# - The alias is `imm` rather than `in`: IN is a SQL keyword that Spark treats as
#   reserved when ANSI mode (spark.sql.ansi.enabled) is on.
# - INNER JOIN states what the query actually computes: the WHERE clause filters
#   on air.airport_id, which discards every unmatched (NULL) row a LEFT JOIN
#   would have produced, so the returned rows are the same either way.
sql_query = """
    SELECT *
    FROM inmigration AS imm
        INNER JOIN airports AS air
        ON imm.airport_id = air.airport_id
    WHERE air.airport_id = 'BOS'
    """
result = spark.sql(sql_query)
print(f'FACT table has {result.count()} rows from "BOS" airport (BOSTON)')

FACT table has 50686 rows from "BOS" airport (BOSTON)


### Example QUERY 3: 
 - Select all immigration records (`inmigration` fact table) for people who were born in BRAZIL and crossed the LAND border

In [6]:
# Register the fact table and the two dimensions used below as SQL temporary views.
df_0 = dict_all_df['inmigration']['df']
df_0.createOrReplaceTempView("inmigration")
df_1 = dict_all_df['modes']['df']
df_1.createOrReplaceTempView("modes")
df_2 = dict_all_df['countries']['df']
df_2.createOrReplaceTempView("countries")


def _inmigration_join_sql(where_clause):
    """Return the SQL text of the fact table joined to countries and modes.

    The three queries in this cell differ only in their WHERE clause, so the
    shared SELECT/FROM/JOIN part lives here once.

    Args:
        where_clause: SQL boolean expression placed after WHERE. It may refer to
            the aliases `imm` (inmigration), `co` (countries) and `mo` (modes).
            It is interpolated verbatim, so pass only trusted, hard-coded text.

    Returns:
        The complete query as a string, ready for spark.sql().
    """
    # Alias `imm` rather than `in`: IN is a SQL keyword that Spark treats as
    # reserved when ANSI mode (spark.sql.ansi.enabled) is on.
    # Both joins stay LEFT JOINs: a dimension the WHERE clause does not filter on
    # must not drop unmatched fact rows, otherwise the counts would change.
    return f"""
    SELECT *
    FROM inmigration AS imm
        LEFT JOIN countries AS co
        ON imm.birth_country = co.country_id
        LEFT JOIN modes AS mo
        ON imm.arrival_mode_id = mo.mode_id
    WHERE {where_clause}
    """


# Arrival mode 'Land' only.
sql_query_0 = _inmigration_join_sql("mo.mode_name = 'Land'")
result_0 = spark.sql(sql_query_0)
print(f'FACT table has {result_0.count()} rows from people who crossed the Land boarder')

# Birth country 'BRAZIL' only.
sql_query_1 = _inmigration_join_sql("co.country_name = 'BRAZIL'")
result_1 = spark.sql(sql_query_1)
print(f'FACT table has {result_1.count()} rows from people who were born in BRAZIL')

# Both conditions together.
sql_query_2 = _inmigration_join_sql("mo.mode_name = 'Land' AND co.country_name = 'BRAZIL'")
result_2 = spark.sql(sql_query_2)
print(f'FACT table has {result_2.count()} rows from people who crossed the Land boarder and were born in BRAZIL')

FACT table has 15625 rows from people who crossed the Land boarder
FACT table has 83644 rows from people who were born in BRAZIL
FACT table has 253 rows from people who crossed the Land boarder and were born in BRAZIL
