## Test your ETL output

#### Necessary Imports

In [None]:
import configparser
import os
import sys

### Create Spark Session

In [None]:
# Build (or reuse) the Spark session for this notebook; the hadoop-aws
# package is requested through spark.jars.packages.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0")
    .appName("demo")
    .getOrCreate()
)

### Configure AWS credentials

In [None]:
# option 1: export the AWS credentials stored in dl.cfg as environment variables
config = configparser.ConfigParser()
# Read through a context manager so the file handle is closed after parsing
# (a bare open() passed to read_file() is never closed explicitly).
with open('dl.cfg') as cfg_file:
    config.read_file(cfg_file)
os.environ['AWS_ACCESS_KEY_ID'] = config['AWS']['KEY']
os.environ['AWS_SECRET_ACCESS_KEY'] = config['AWS']['SECRET']

In [None]:
# option 2: pass the AWS credentials from dl.cfg straight to the s3a settings
# of the Spark session's Hadoop configuration
config = configparser.ConfigParser()
# Read through a context manager so the file handle is closed after parsing
# (a bare open() passed to read_file() is never closed explicitly).
with open('dl.cfg') as cfg_file:
    config.read_file(cfg_file)
aws_id = config['AWS']['KEY']
aws_secret = config['AWS']['SECRET']
# Fetch the Hadoop configuration once and set both s3a keys on it.
hadoop_conf = spark.sparkContext._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3a.access.key", aws_id)
hadoop_conf.set("fs.s3a.secret.key", aws_secret)
input_data = config['S3']['S3_BUCKET_OUTPUT_PATH']

### Path to read data

In [None]:
# Base path the tables are read from, taken from the S3_BUCKET_OUTPUT_PATH
# option in the [S3] section of the parsed configuration.
input_data = config['S3']['S3_BUCKET_OUTPUT_PATH']

### Read data

In [None]:
# Load the songplays, songs and artists tables (parquet) from below input_data.
# posixpath.join always joins with "/", whereas os.path.join uses the host OS
# separator and can put a backslash into the path on Windows.
import posixpath

songplays_path = posixpath.join(input_data, 'songplays')
songplays_df = spark.read.parquet(songplays_path)
songs_path = posixpath.join(input_data, 'songs')
songs_df = spark.read.parquet(songs_path)
artists_path = posixpath.join(input_data, 'artists')
artists_df = spark.read.parquet(artists_path)

### Create views

In [None]:
# Register each DataFrame as a temporary view so it can be queried by name
# from spark.sql().
for view_name, frame in (
    ("songplays", songplays_df),
    ("songs", songs_df),
    ("artists", artists_df),
):
    frame.createOrReplaceTempView(view_name)

### Query 1: The most popular songs of all time (assuming your schema name is sparkify)

In [None]:
# Count plays per song title (songplays joined to songs on song_id) and
# display the first five rows, ordered by play count then title.
top_songs_sql = '''
          SELECT s.title, count(*) as count
            FROM songplays sp
            INNER JOIN songs s ON s.song_id = sp.song_id
            GROUP BY s.title
            ORDER BY count DESC, s.title ASC
          '''
top_songs_df = spark.sql(top_songs_sql)
top_songs_df.show(5)

### Query 2: The most popular artists and their songs of all time (assuming your schema name is sparkify)

In [None]:
# Count plays per (artist name, song title) pair and display the first five
# rows, ordered by play count, then artist name, then title.
# NOTE: the statement deliberately has no trailing ';' -- spark.sql() parses a
# single statement, and older Spark SQL parsers (2.x) raise a ParseException
# on a terminating semicolon.
spark.sql('''
          SELECT ar.name, s.title, count(*) as count
            FROM songplays sp
            INNER JOIN songs s ON s.song_id = sp.song_id
            INNER JOIN artists ar ON ar.artist_id = sp.artist_id
            GROUP BY ar.name, s.title
            ORDER BY count DESC, ar.name, s.title ASC
          ''').show(5)