In [1]:
import os

from pyspark.sql import SparkSession, SQLContext, GroupedData
from pyspark.sql.functions import *
#from pyspark.sql.functions import col, unix_timestamp, round
from datetime import datetime
import pandas as pd

from pyspark.sql.types import DoubleType

from etl_clean_create_dim_tables import Operators


In [2]:
#os.environ["AWS_ACCESS_KEY_ID"]= config['AWS']['AWS_ACCESS_KEY_ID']
#os.environ["AWS_SECRET_ACCESS_KEY"]= config['AWS']['AWS_SECRET_ACCESS_KEY']

In [3]:
# Build spark session with the hadoop-aws package (disabled; presumably only
# needed when reading from the s3:// paths instead of local disk -- confirm).
# Kept as comments: this cell previously opened a ''' string that was never
# closed, which is a syntax error when the cell is run.
#spark = SparkSession \
#        .builder \
#        .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0") \
#        .getOrCreate()

In [5]:
# Create the Spark session, or reuse one that is already active
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [6]:
# Set the log level of the Spark context to INFO
spark.sparkContext.setLogLevel("INFO")

In [7]:
# Base locations of the input datasets and of the generated results.
# Both values end with '/'; output_data is concatenated directly with the
# table names when the parquet files are written.

# path to s3 bucket (disabled alternative to the local paths)
#input_data = "s3://helsinki-bikes/"
#output_data = "s3://helsinki-bikes/results/"

# path to on-premises disk storage
input_data = 'helsinki_bikes/'
output_data = 'helsinki_bikes/results/'


In [8]:
# load and read the bike trips dataset (all csv files of 2017-2019)
# input_data ends with '/', so the trailing separator is stripped before the
# sub-path is appended; otherwise the glob contains a doubled separator
# ('helsinki_bikes//bikes_data/...').
bikes_data = input_data.rstrip('/') + '/bikes_data/2017_2019/*.csv'
bikes_df = spark.read.csv(bikes_data, inferSchema=True, header=True, sep=',', encoding='utf-8')


In [7]:
# number of rows in the raw bike trips dataframe
bikes_df.count()

9217647

### Clean bikes data, create dimension tables and prepare views for 10 and 60 min intervals

In [9]:
# clean the raw bike trips from 2017 to 2019 with the Operators helper
# (imported from etl_clean_create_dim_tables). Per the original note the helper
# also writes parquet files, presumably under output_data -- confirm in that module.
bikes_from_2017_to_2019 = Operators.df_clean_create_orig(bikes_df, output_data)

In [10]:
# clean the raw bike trips and create the bikes dimension for the 10 min interval
# (the bucketing itself happens inside the Operators helper; output_data is
# passed along, presumably as the write location -- confirm in that module)
bikes_df_clean_10min = Operators.df_clean_create_10min(bikes_df, output_data)

In [11]:
# clean the raw bike trips and create the bikes dimension for the 60 min interval
# (the bucketing itself happens inside the Operators helper; output_data is
# passed along, presumably as the write location -- confirm in that module)
bikes_df_clean_60min = Operators.df_clean_create_60min(bikes_df, output_data)

In [12]:
# number of rows (trips) per id in the cleaned 2017-2019 dataframe
bike_count = (
    bikes_from_2017_to_2019
    .groupby('id')
    .count()
)

In [13]:
# number of rows (trips) sharing each `date` value of the 10 min dataframe
bike_count_10min = (
    bikes_df_clean_10min
    .groupby('date')
    .count()
)

In [14]:
# number of rows (trips) sharing each `date` value of the 60 min dataframe
bike_count_60min = (
    bikes_df_clean_60min
    .groupby('date')
    .count()
)

### Clean weather data, create dimension tables and prepare views for 10 and 60 min intervals

In [16]:
# load weather dataset (all csv files of 2017-2019)
# input_data ends with '/', so the trailing separator is stripped before the
# sub-path is appended; otherwise the glob contains a doubled separator.
# No inferSchema here, so every column is read as a string.
temp_data = input_data.rstrip('/') + '/weather/2017_2019/*.csv'
temp_df = spark.read.csv(temp_data, header=True, sep=',', encoding='utf-8')

In [17]:
# number of rows in the raw weather dataframe
temp_df.count()

88116

In [18]:
# clean the raw weather data for the 10 min interval with the Operators helper
# (output_data is passed along, presumably as the write location -- confirm)
temp_df_clean_10min = Operators.temp_clean_create_10min(temp_df, output_data)

In [19]:
# build the 60 min weather dataframe; note that the helper receives the already
# cleaned 10 min dataframe, not the raw temp_df
temp_df_clean_60min = Operators.temp_clean_create_60min(temp_df_clean_10min, output_data)

In [20]:
# expose the 60 min weather dataframe to Spark SQL under the view name temp_hour
temp_df_clean_60min.createOrReplaceTempView('temp_hour')

In [23]:
# Average each weather measurement over all rows of the temp_hour view that
# share the same `date`.
# The dew point column used to be referenced as `deW_point_temp` and aliased as
# `dev_point_temp`; both are now spelled `dew_point_temp`, so the output column
# carries the intended name and the reference matches the lower-case spelling
# of the other columns.
# NOTE(review): if wind_direc is a compass angle in degrees, an arithmetic mean
# is misleading around the 0/360 wrap -- confirm before using that column.
temp_60min = spark.sql("""
    SELECT temp_hour.date,
           avg(temp_hour.air_temp)       AS air_temp,
           avg(temp_hour.cloud_amount)   AS cloud_amount,
           avg(temp_hour.pressure)       AS pressure,
           avg(temp_hour.humidity)       AS humidity,
           avg(temp_hour.precipitation)  AS precipitation,
           avg(temp_hour.dew_point_temp) AS dew_point_temp,
           avg(temp_hour.visibility)     AS visibility,
           avg(temp_hour.wind_direc)     AS wind_direc,
           avg(temp_hour.gust_speed)     AS gust_speed,
           avg(temp_hour.wind_speed)     AS wind_speed
    FROM temp_hour
    GROUP BY temp_hour.date
""")

### Clean and create stations dimension table

In [24]:
# load stations dataset (csv files of 2019)
# The glob path has its own name, stations_data; it used to be stored in
# `stations_df` and was then overwritten by the dataframe. The trailing '/' of
# input_data is stripped so the path has no doubled separator.
stations_data = input_data.rstrip('/') + '/stations/2019/*.csv'
stations_df = spark.read.csv(stations_data, header=True, sep=',', encoding='utf-8')

In [38]:
# number of rows in the raw stations dataframe
stations_df.count()

351

In [25]:
# clean the raw stations data and create the stations dimension with the
# Operators helper (output_data is passed along, presumably as the write
# location -- confirm in etl_clean_create_dim_tables)
stations_df_clean = Operators.stations_clean_create(stations_df, output_data)

### Create fact table and machine learning tables and write parquet files

In [26]:
# create the temp views that the fact table query reads:
# cleaned trips, cleaned stations and the per-id trip counts
bikes_from_2017_to_2019.createOrReplaceTempView('bikes_view')
stations_df_clean.createOrReplaceTempView('stations_view') 
bike_count.createOrReplaceTempView('count_view') 


In [27]:

# Create the fact table by joining bikes view, stations view and count view.
# - stations_view is LEFT joined on the departure station id, so trips whose
#   departure station has no match keep NULL city/capacity/longitude/latitude.
# - count_view is INNER joined on id and contributes the `count` column.

bikes_fact_table = spark.sql("""
SELECT 
        bikes_view.id,
        bikes_view.year,
        bikes_view.month,
        bikes_view.day,
        bikes_view.weekday,
        bikes_view.hour,
        bikes_view.dep_station_name,
        bikes_view.departure,
        bikes_view.return,
        bikes_view.ret_station_name,
        bikes_view.dep_station_id,
        bikes_view.ret_station_id,
        bikes_view.distance,
        bikes_view.duration,
        stations_view.city,
        stations_view.capacity,
        stations_view.longitude,
        stations_view.latitude,
        count_view.count
    
   

FROM bikes_view
left JOIN stations_view ON bikes_view.dep_station_id = stations_view.station_id
JOIN  count_view ON bikes_view.id = count_view.id

""")

In [34]:
# write bikes_fact_table as parquet files, partitioned by year;
# output already present at the target path is overwritten
(
    bikes_fact_table.write
    .mode('overwrite')
    .partitionBy('year')
    .parquet(output_data + 'bikes_fact_2017_2019')
)

### Machine learning table with 10 min interval

In [28]:
# create bikes view from cleaned 10 min dataframe
bikes_df_clean_10min.createOrReplaceTempView('bikes_10min')

# create temp (weather) view from cleaned 10 min dataframe
temp_df_clean_10min.createOrReplaceTempView('temp_10min')

# create count view from bike count 10 min dataframe
bike_count_10min.createOrReplaceTempView('count_10min')



In [29]:
# create machine learning table for 10 min interval:
# one DISTINCT row per combination of the selected columns.
# - weather readings come from an INNER join on date, so bike rows without a
#   weather match are dropped
# - the per-date row count is LEFT joined and exposed as bike_count
ml_bikes_10min = spark.sql("""
SELECT  distinct
        bikes_10min.date,
        bikes_10min.year,
        bikes_10min.month,
        bikes_10min.day,
        bikes_10min.weekday,
        bikes_10min.hour,
        temp_10min.air_temp,
        temp_10min.humidity,
        temp_10min.wind_speed,
        count_10min.count as bike_count
        
   

FROM bikes_10min
inner JOIN temp_10min ON bikes_10min.date = temp_10min.date
left JOIN count_10min ON bikes_10min.date = count_10min.date 
AND count_10min.date = temp_10min.date

""")

In [35]:
# write ml_bikes_10min as parquet files, partitioned by year;
# output already present at the target path is overwritten
(
    ml_bikes_10min.write
    .mode('overwrite')
    .partitionBy('year')
    .parquet(output_data + 'ml_bikes_10min')
)

### Machine learning table with 60 min interval

In [30]:
# create bikes view from cleaned 60 min dataframe
bikes_df_clean_60min.createOrReplaceTempView('bikes_60min')

# create temp (weather) view from temp_60min, the weather dataframe averaged per date
temp_60min.createOrReplaceTempView('temp_60min')

# create count view from bike count 60 min dataframe
bike_count_60min.createOrReplaceTempView('count_60min')


In [31]:
# create machine learning table for 60 min interval:
# one DISTINCT row per combination of the selected columns.
# - weather readings come from an INNER join on date, so bike rows without a
#   weather match are dropped
# - the per-date row count is LEFT joined and exposed as bike_count
ml_bikes_60min = spark.sql("""
SELECT  distinct
        bikes_60min.date,
        bikes_60min.year,
        bikes_60min.month,
        bikes_60min.day,
        bikes_60min.weekday,
        bikes_60min.hour,
        temp_60min.air_temp,
        temp_60min.humidity,
        temp_60min.wind_speed,
        count_60min.count as bike_count
        
   

FROM bikes_60min
inner JOIN temp_60min ON bikes_60min.date = temp_60min.date
left JOIN count_60min ON bikes_60min.date = count_60min.date 
AND count_60min.date = temp_60min.date

""")

In [36]:
# write ml_bikes_60min as parquet files, partitioned by year;
# output already present at the target path is overwritten
(
    ml_bikes_60min.write
    .mode('overwrite')
    .partitionBy('year')
    .parquet(output_data + 'ml_bikes_60min')
)

#### 4.2 Data Quality Checks

Data quality checks consist of row-count checks to ensure completeness.

After finishing all the checks, data looks good to go.

In [37]:
def quality_check(df, description):
    '''
    Print whether a dataframe contains any rows.

    Input: df - object exposing count() (here: a Spark dataframe),
           description - label used in the printed message

    Output: None; one pass/fail line is printed to stdout

    '''

    row_count = df.count()

    # Guard clause: any non-zero count passes and reports the number of rows.
    if row_count != 0:
        print("Data quality check passed for {} with {} records".format(description, row_count))
        return

    print("Data quality check failed for {} with zero records".format(description))

# Process data quality check: run the same row-count check on every table, in order
for checked_df, label in (
    (bikes_df, "bikes original dataset"),
    (bikes_df_clean_10min, "bikes 10min dimension table"),
    (bikes_df_clean_60min, "bikes 60min dimension table"),
    (temp_df_clean_10min, "temperature 10min dimension table"),
    (temp_df_clean_60min, "temperature 60 min dataframe"),
    (stations_df_clean, "stations dimension table"),
    (bikes_fact_table, "bikes fact table"),
    (ml_bikes_10min, "ml 10min table"),
    (ml_bikes_60min, "ml 60min table"),
):
    quality_check(checked_df, label)

Data quality check passed for bikes original dataset with 9217647 records
Data quality check passed for bikes 10min dimension table with 9217647 records
Data quality check passed for bikes 60min dimension table with 9217647 records
Data quality check passed for temperature 10min dimension table with 88116 records
Data quality check passed for temperature 60 min dataframe with 88116 records
Data quality check passed for stations dimension table with 351 records
Data quality check passed for bikes fact table with 9217647 records
Data quality check passed for ml 10min table with 85945 records
Data quality check passed for ml 60min table with 14454 records


In [33]:
# the full time period contains 88128 ten-minute slots in total
# the full time period contains 14688 sixty-minute slots in total

In [None]:
# stop the Spark session now that all tables are written and checked
spark.stop()