# ETL Pipeline for Pre-Processing the Files

In [1]:
import pandas as pd
import cassandra
import re
import os
import glob
import numpy as np
import json
import csv

#### Creating list of filepaths to process original event csv data files

In [2]:
# Show where the notebook runs from; the data directory is resolved against it
print("Current working directory: {}".format(os.getcwd()))

# Directory holding the daily event csv files
filepath = os.getcwd() + '/event_data'

# Walk the data directory tree and keep every *.csv found in each visited folder
file_path_list = [
    csv_file_path
    for root, _dirs, _files in os.walk(filepath)
    for csv_file_path in glob.glob(os.path.join(root, '*.csv'))
]
print("\n".join(file_path_list))

Current working directory: /home/workspace
/home/workspace/event_data/2018-11-27-events.csv
/home/workspace/event_data/2018-11-04-events.csv
/home/workspace/event_data/2018-11-07-events.csv
/home/workspace/event_data/2018-11-09-events.csv
/home/workspace/event_data/2018-11-19-events.csv
/home/workspace/event_data/2018-11-05-events.csv
/home/workspace/event_data/2018-11-22-events.csv
/home/workspace/event_data/2018-11-16-events.csv
/home/workspace/event_data/2018-11-26-events.csv
/home/workspace/event_data/2018-11-24-events.csv
/home/workspace/event_data/2018-11-29-events.csv
/home/workspace/event_data/2018-11-15-events.csv
/home/workspace/event_data/2018-11-20-events.csv
/home/workspace/event_data/2018-11-06-events.csv
/home/workspace/event_data/2018-11-18-events.csv
/home/workspace/event_data/2018-11-21-events.csv
/home/workspace/event_data/2018-11-10-events.csv
/home/workspace/event_data/2018-11-23-events.csv
/home/workspace/event_data/2018-11-02-events.csv
/home/workspace/event_data

#### Processing the files to create the data file csv that will be used for Apache Cassandra tables

In [3]:
# Gather every data row (header rows excluded) from all daily event files into one list
full_data_rows_list = []

for f in file_path_list:
    with open(f, 'r', encoding = 'utf8', newline='') as csvfile:
        csvreader = csv.reader(csvfile)
        # Skip the header row; the default avoids StopIteration on a completely empty file
        next(csvreader, None)

        print("Reading from file {}".format(f))
        rows_before = len(full_data_rows_list)
        full_data_rows_list.extend(csvreader)
        print("Done.")
        # Count via the list growth so a file without data rows reports 0
        print("Lines read: {}\n".format(len(full_data_rows_list) - rows_before))

print("\n-----------------------------------------------------------\n")
print("Total lines read: {}".format(len(full_data_rows_list)))
print("\n-----------------------------------------------------------\n")
print("First five lines read:\n")
print(*full_data_rows_list[:5], sep="\n\n")

# Positions, within a raw event row, of the columns copied to the output file;
# the order matches the header row written below
kept_columns = (0, 2, 3, 4, 5, 6, 7, 8, 12, 13, 16)

# Write the reduced event file "event_datafile_new.csv" that feeds the
# Apache Cassandra tables; every field is quoted
csv.register_dialect('myDialect', quoting=csv.QUOTE_ALL, skipinitialspace=True)

with open('event_datafile_new.csv', 'w', encoding = 'utf8', newline='') as outfile:
    writer = csv.writer(outfile, dialect='myDialect')
    writer.writerow(['artist','firstName','gender','itemInSession','lastName','length',\
                'level','location','sessionId','song','userId'])
    for row in full_data_rows_list:
        # Skip blank lines (parsed as empty lists) and rows whose artist field is empty
        if not row or row[0] == '':
            continue
        writer.writerow([row[i] for i in kept_columns])


Reading from file /home/workspace/event_data/2018-11-27-events.csv
Done.
Lines read: 303

Reading from file /home/workspace/event_data/2018-11-04-events.csv
Done.
Lines read: 189

Reading from file /home/workspace/event_data/2018-11-07-events.csv
Done.
Lines read: 202

Reading from file /home/workspace/event_data/2018-11-09-events.csv
Done.
Lines read: 283

Reading from file /home/workspace/event_data/2018-11-19-events.csv
Done.
Lines read: 327

Reading from file /home/workspace/event_data/2018-11-05-events.csv
Done.
Lines read: 408

Reading from file /home/workspace/event_data/2018-11-22-events.csv
Done.
Lines read: 98

Reading from file /home/workspace/event_data/2018-11-16-events.csv
Done.
Lines read: 382

Reading from file /home/workspace/event_data/2018-11-26-events.csv
Done.
Lines read: 270

Reading from file /home/workspace/event_data/2018-11-24-events.csv
Done.
Lines read: 358

Reading from file /home/workspace/event_data/2018-11-29-events.csv
Done.
Lines read: 366

Reading fro

In [4]:
# Count the data rows written to "event_datafile_new.csv"
with open('event_datafile_new.csv', 'r', encoding = 'utf8') as f:
    csvreader = csv.reader(f)
    # the first row is the header and is not counted
    next(csvreader)
    total_rows = len(list(csvreader))
    print("Total rows in 'event_datafile_new.csv' (excluding header row):\n{}".format(total_rows))

Total rows in 'event_datafile_new.csv' (excluding header row):
6820


# Create queries and tables for Apache Cassandra, load data in the tables and execute queries on them 

## Now we are ready to work with the CSV file titled `event_datafile_new.csv`. It contains the following columns: 
- artist 
- firstName of user
- gender of user
- item number in session
- last name of user
- length of the song
- level (paid or free song)
- location of the user
- sessionId
- song title
- userId

The image below is a screenshot of a portion of data in the csv file.

<img src="images/image_event_datafile_new.jpg">

#### Creating a Cluster

In [5]:
# Connect to the Cassandra instance listening on the local machine
from cassandra.cluster import Cluster
cluster = Cluster(contact_points=["127.0.0.1"])

# Open a session without a keyspace; it is used to create the keyspace first and
# is bound to it afterwards with "session.set_keyspace()"
session = cluster.connect()

#### Create Keyspace

In [6]:
# Create the "sparkify" keyspace unless it already exists (safe to re-run).
# It uses SimpleStrategy with a replication factor of 1.
session.execute("""
    CREATE KEYSPACE IF NOT EXISTS sparkify
    WITH REPLICATION = {
        'class': 'SimpleStrategy',
        'replication_factor': 1
    }
""")

<cassandra.cluster.ResultSet at 0x7fe11a3d3828>

#### Set Keyspace

In [7]:
# Bind the session to "sparkify" so the statements below can use unqualified table names
session.set_keyspace("sparkify")

## Now we need to create tables to run the following queries. With Apache Cassandra we model the database tables on the queries we want to run.

## Creating queries to ask the following three questions of the data

1. Give me the artist, song title and song's length in the music app history that was heard during  sessionId = 338, and itemInSession  = 4


2. Give me only the following: name of artist, song (sorted by itemInSession) and user (first and last name) for userid = 10, sessionid = 182
    

3. Give me every user name (first and last) in my music app history who listened to the song 'All Hands Against His Own'




### Working on `query_1`:
- Create query - `query_1`
- Create table `song_info_by_session` according to `query_1`
- Run `query_1` on the table `song_info_by_session`

In [8]:
# Query 1: Give me the artist, song title and song's length in the music app history
#          that was heard during sessionId = 338 and itemInSession = 4.
# The WHERE clause filters on sessionId and itemInSession, so the table read here
# must have exactly those columns in its primary key.
query_1 = """
    SELECT artist, song, length
    FROM song_info_by_session
    WHERE sessionId = 338 AND itemInSession = 4
"""

In [9]:
# Create table "song_info_by_session" to meet the "query_1" requirements.
# "sessionId" is the partition key and "itemInSession" is the clustering key, so the
# pair identifies one row and matches the two equality filters used by "query_1".
# IF NOT EXISTS makes the cell safe to re-run.
create_table_query_1 = """
    CREATE TABLE IF NOT EXISTS song_info_by_session (
        sessionId INT,
        itemInSession INT,
        artist TEXT,
        song TEXT,
        length FLOAT,
        PRIMARY KEY (sessionId, itemInSession)
    )
"""
session.execute(create_table_query_1)

<cassandra.cluster.ResultSet at 0x7fe11a3d05f8>

In [10]:
# Insert the columns required by table "song_info_by_session" from all rows
# in "event_datafile_new.csv"
file = "event_datafile_new.csv"

query = """
    INSERT INTO song_info_by_session (sessionId, itemInSession, artist, song, length)
    VALUES (?, ?, ?, ?, ?)
"""
# Prepare once: the statement is parsed a single time and only the values are bound per row
insert_statement = session.prepare(query)

# newline='' leaves line-ending handling to the csv module, as its documentation requires
with open(file, encoding = "utf8", newline='') as f:
    # DictReader consumes the header row and exposes each column by name,
    # so the code does not depend on column positions
    for line in csv.DictReader(f):
        session.execute(insert_statement, (
            int(line['sessionId']),
            int(line['itemInSession']),
            line['artist'],
            line['song'],
            float(line['length']),
        ))

#### Executing `query_1` to get desired results and also verify rows insertion into the table `song_info_by_session` was successful

In [11]:
# Run query_1 and display the returned rows as a DataFrame
rows = session.execute(query_1)
df = pd.DataFrame([row for row in rows])
df

Unnamed: 0,artist,song,length
0,Faithless,Music Matters (Mark Knight Dub),495.307312


### Working on `query_2`:
- Create query - `query_2`
- Create table `song_info_by_user_session` according to `query_2`
- Run `query_2` on the table `song_info_by_user_session`

In [12]:
# Query 2: Give me only the following: name of artist, song (sorted by itemInSession)
#          and user (first and last name) for userId = 10, sessionId = 182.
# There is no ORDER BY here: the ordering by itemInSession is expected to come from
# the clustering key of the table being read.
query_2 = """
    SELECT artist, song, firstName, lastName
    FROM song_info_by_user_session
    WHERE userId = 10 AND sessionId = 182
"""

In [13]:
# Create table "song_info_by_user_session" to meet the "query_2" requirements.
# ("userId", "sessionId") is the composite partition key, matching the two equality
# filters of "query_2"; "itemInSession" is the clustering key, which keeps the rows
# of a partition ordered by itemInSession.
# IF NOT EXISTS makes the cell safe to re-run.
create_table_query_2 = """
    CREATE TABLE IF NOT EXISTS song_info_by_user_session (
        userId INT,
        sessionId INT,
        itemInSession INT,
        artist TEXT,
        song TEXT,
        firstName TEXT,
        lastName TEXT,
        PRIMARY KEY ((userId, sessionId), itemInSession)
    )
"""
session.execute(create_table_query_2)

<cassandra.cluster.ResultSet at 0x7fe11a38c9e8>

In [14]:
# Insert the columns required by table "song_info_by_user_session" from all rows
# in "event_datafile_new.csv"
file = "event_datafile_new.csv"

query = """
    INSERT INTO song_info_by_user_session (userId, sessionId, itemInSession, artist, song, firstName, lastName)
    VALUES (?, ?, ?, ?, ?, ?, ?)
"""
# Prepare once: the statement is parsed a single time and only the values are bound per row
insert_statement = session.prepare(query)

# newline='' leaves line-ending handling to the csv module, as its documentation requires
with open(file, encoding = "utf8", newline='') as f:
    # DictReader consumes the header row and exposes each column by name,
    # so the code does not depend on column positions
    for line in csv.DictReader(f):
        session.execute(insert_statement, (
            int(line['userId']),
            int(line['sessionId']),
            int(line['itemInSession']),
            line['artist'],
            line['song'],
            line['firstName'],
            line['lastName'],
        ))

#### Executing `query_2` to get desired results and also verify rows insertion into the table `song_info_by_user_session` was successful

In [15]:
# Run query_2 and display the returned rows as a DataFrame
rows = session.execute(query_2)
df = pd.DataFrame([row for row in rows])
df

Unnamed: 0,artist,song,firstname,lastname
0,Down To The Bone,Keep On Keepin' On,Sylvie,Cruz
1,Three Drives,Greece 2000,Sylvie,Cruz
2,Sebastien Tellier,Kilometer,Sylvie,Cruz
3,Lonnie Gordon,Catch You Baby (Steve Pitron & Max Sanna Radio...,Sylvie,Cruz


### Working on `query_3`:
- Create query - `query_3`
- Create table `username_by_song` according to `query_3`
- Run `query_3` on the table `username_by_song`

In [16]:
# Query 3: Give me every user name (first and last) in my music app history who listened
#          to the song 'All Hands Against His Own'.
# The only filter is on "song", so the table read here must be partitioned by song.
query_3 = """
    SELECT firstName, lastName
    FROM username_by_song
    WHERE song = 'All Hands Against His Own'
"""

In [17]:
# Create table "username_by_song" to meet the "query_3" requirements.
# "song" is the partition key and "sessionId", "itemInSession" are clustering keys,
# so every play of a song is stored as its own row.
# NOTE(review): "userId" is not part of the key, so a user who played the song more
# than once is returned once per play by "query_3" - consider clustering on userId
# instead if each user should appear only once (the insert cell must then supply it).
# IF NOT EXISTS makes the cell safe to re-run.
create_table_query_3 = """
    CREATE TABLE IF NOT EXISTS username_by_song (
        song TEXT,
        sessionId INT,
        itemInSession INT,
        firstName TEXT,
        lastName TEXT,
        PRIMARY KEY ((song), sessionId, itemInSession)
    )
"""
session.execute(create_table_query_3)

<cassandra.cluster.ResultSet at 0x7fe11a38cc88>

In [18]:
# Insert the columns required by table "username_by_song" from all rows
# in "event_datafile_new.csv"
file = "event_datafile_new.csv"

query = """
    INSERT INTO username_by_song (song, sessionId, itemInSession, firstName, lastName)
    VALUES (?, ?, ?, ?, ?)
"""
# Prepare once: the statement is parsed a single time and only the values are bound per row
insert_statement = session.prepare(query)

# newline='' leaves line-ending handling to the csv module, as its documentation requires
with open(file, encoding = "utf8", newline='') as f:
    # DictReader consumes the header row and exposes each column by name,
    # so the code does not depend on column positions
    for line in csv.DictReader(f):
        session.execute(insert_statement, (
            line['song'],
            int(line['sessionId']),
            int(line['itemInSession']),
            line['firstName'],
            line['lastName'],
        ))

#### Executing `query_3` to get desired results and also verify rows insertion into the table `username_by_song` was successful

In [19]:
# Run query_3 and display the returned rows as a DataFrame
rows = session.execute(query_3)
df = pd.DataFrame([row for row in rows])
df

Unnamed: 0,firstname,lastname
0,Sara,Johnson
1,Jacqueline,Lynch
2,Tegan,Levine


### Drop the tables before closing out the sessions

In [20]:
# Remove the three query tables created above; IF EXISTS keeps the cell safe to re-run
session.execute("DROP TABLE IF EXISTS song_info_by_session")
session.execute("DROP TABLE IF EXISTS song_info_by_user_session")
session.execute("DROP TABLE IF EXISTS username_by_song")

<cassandra.cluster.ResultSet at 0x7fe11a3b8278>

### Close the session and cluster connection

In [21]:
# Close the session first, then the cluster object it was created from
session.shutdown()
cluster.shutdown()