# Data Modeling with Apache Cassandra

Jun Zhu

In [1]:
import pandas as pd
import cassandra
from cassandra.cluster import Cluster
import pathlib

# Display the installed driver version so the environment behind the outputs is recorded.
cassandra.__version__

'3.24.0'

#### Create a list of file paths to the original event CSV data files

In [2]:
data_dir = pathlib.Path("./event_data")

# Collect only the CSV event files. Globbing (rather than listing every directory
# entry) keeps stray entries such as editor checkpoints or OS metadata files out of
# the list, and sorting makes the processing order deterministic.
file_paths = sorted(data_dir.glob("*.csv"))

assert len(file_paths) == 30, f"expected 30 event files, found {len(file_paths)}"

#### Processing the files to create the dataframe that will be used for Apache Cassandra tables

In [3]:
# Read each event file into its own frame, then stack all of them into one frame.
dataframes = [pd.read_csv(fp) for fp in file_paths]
df = pd.concat(dataframes)

# Sanity check on the total number of raw event rows.
assert len(df) == 8056

In [4]:
# Remove every row that has a missing value in any column (defaults spelled out).
df.dropna(axis=0, how='any', inplace=True)

# Sanity check on the number of complete rows that remain.
assert len(df) == 6820

In [5]:
# Columns that none of the three queries below read.
unused_columns = ['auth', 'method', 'page', 'registration', 'status', 'ts']

# Rebind `df` instead of mutating it in place, and tolerate columns that are already
# gone, so that re-running this cell does not raise a KeyError.
# The userId cast is vectorised; it yields the same integer ids as a per-row int().
df = (
    df
    .drop(columns=unused_columns, errors='ignore')
    .astype({'userId': 'int64'})
)

assert len(df.columns) == 11

df.to_csv('event_datafile_new.csv')

df

Unnamed: 0,artist,firstName,gender,itemInSession,lastName,length,level,location,sessionId,song,userId
2,Des'ree,Kaylee,F,1,Summers,246.30812,free,"Phoenix-Mesa-Scottsdale, AZ",139,You Gotta Be,8
4,Mr Oizo,Kaylee,F,3,Summers,144.03873,free,"Phoenix-Mesa-Scottsdale, AZ",139,Flat 55,8
5,Tamba Trio,Kaylee,F,4,Summers,177.18812,free,"Phoenix-Mesa-Scottsdale, AZ",139,Quem Quiser Encontrar O Amor,8
6,The Mars Volta,Kaylee,F,5,Summers,380.42077,free,"Phoenix-Mesa-Scottsdale, AZ",139,Eriatarka,8
7,Infected Mushroom,Kaylee,F,6,Summers,440.26730,free,"Phoenix-Mesa-Scottsdale, AZ",139,Becoming Insane,8
...,...,...,...,...,...,...,...,...,...,...,...
382,Foo Fighters,Rylan,M,57,George,271.38567,paid,"Birmingham-Hoover, AL",1076,The Pretender,16
383,Timbiriche,Rylan,M,58,George,202.60526,paid,"Birmingham-Hoover, AL",1076,Besos De Ceniza,16
384,A Perfect Circle,Rylan,M,59,George,206.05342,paid,"Birmingham-Hoover, AL",1076,Rose,16
385,Anberlin,Rylan,M,60,George,348.68200,paid,"Birmingham-Hoover, AL",1076,The Haunting,16


#### Connect to the database

In [6]:
# Point the driver at the loopback address and open a session on that cluster.
cluster = Cluster(contact_points=['127.0.0.1'])
session = cluster.connect()

#### Create a new keyspace

In [7]:
# Note: the keyspace name will be automatically converted to lowercase letters.
# The statement is assembled from adjacent literals; the run of blanks before WITH is
# kept so the text sent to the server is unchanged.
session.execute(
    "CREATE KEYSPACE IF NOT EXISTS sparkify "
    "                 WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1}"
)

<cassandra.cluster.ResultSet at 0x7f1abf4a7ad0>

In [8]:
# Select the 'sparkify' keyspace for this session; the statements in the cells below
# refer to tables by bare name, without a keyspace prefix.
session.set_keyspace('sparkify')

Now we need to create tables to run the following queries. **Remember, with Apache Cassandra you model the database tables on the queries you want to run.**

#### Query 1

Give me the artist, song title and song's length in the music app history that was heard during sessionId = 338, and itemInSession = 4.

In [9]:
# Create the table.
# session_id is the partition key and item_in_session the clustering column, which
# are exactly the two columns Query 1 filters on.
# (The run of blanks before PRIMARY KEY is kept so the statement text is unchanged.)
query = (
    "CREATE TABLE IF NOT EXISTS session_song "
    "(session_id int, item_in_session int, artist_name text, song_title text, song_length float, "
    "          PRIMARY KEY (session_id, item_in_session))"
)
session.execute(query)

<cassandra.cluster.ResultSet at 0x7f1abf432b10>

In [10]:
# Insert data.
# The statement is prepared once, outside the loop, so it is not rebuilt and
# re-parsed for every row; each row then only supplies its bound values.
insert_session_song = session.prepare(
    "INSERT INTO session_song "
    "(session_id, item_in_session, artist_name, song_title, song_length) "
    "VALUES (?, ?, ?, ?, ?)"
)

# DataFrame columns listed in the same order as the table columns above.
session_song_columns = ['sessionId', 'itemInSession', 'artist', 'song', 'length']

# itertuples(name=None) yields one plain tuple per row, ready to bind.
for params in df[session_song_columns].itertuples(index=False, name=None):
    session.execute(insert_session_song, params)

In [11]:
# Check result.
# Fetch only the three columns Query 1 asks for instead of every column in the table.
query = (
    "SELECT artist_name, song_title, song_length "
    "FROM session_song "
    "WHERE session_id = 338 AND item_in_session = 4"
)

rows = session.execute(query)
for row in rows:
    print(row.artist_name, row.song_title, row.song_length)

Faithless Music Matters (Mark Knight Dub) 495.30731201171875


#### Query 2

Give me only the following: name of artist, song (sorted by itemInSession) and user (first and last name) for userid = 10, sessionid = 182.

Because the songs must be returned sorted by `itemInSession`, it is used as the clustering column, while `sessionId` and `userId` together form the partition key.

In [12]:
# Create the table.
# (session_id, user_id) is the composite partition key; item_in_session is the
# clustering column.
# (The runs of blanks inside the statement are kept so its text is unchanged.)
query = (
    "CREATE TABLE IF NOT EXISTS session_user_song"
    "(session_id int, user_id int, item_in_session int, artist_name text, song_title text, "
    "          user_first_name text, user_last_name text, "
    "          PRIMARY KEY ((session_id, user_id), item_in_session))"
)
session.execute(query)

<cassandra.cluster.ResultSet at 0x7f1abf432c90>

In [13]:
# Insert the data.
# The statement is prepared once, outside the loop, so it is not rebuilt and
# re-parsed for every row; each row then only supplies its bound values.
insert_session_user_song = session.prepare(
    "INSERT INTO session_user_song "
    "(session_id, item_in_session, artist_name, song_title, "
    "user_first_name, user_last_name, user_id) "
    "VALUES (?, ?, ?, ?, ?, ?, ?)"
)

# DataFrame columns listed in the same order as the table columns above.
session_user_song_columns = [
    'sessionId', 'itemInSession', 'artist', 'song', 'firstName', 'lastName', 'userId',
]

# itertuples(name=None) yields one plain tuple per row, ready to bind.
for params in df[session_user_song_columns].itertuples(index=False, name=None):
    session.execute(insert_session_user_song, params)

In [14]:
# Check result.
# Fetch only the columns that are printed below rather than every column in the table.
query = (
    "SELECT item_in_session, artist_name, song_title, user_first_name, user_last_name "
    "FROM session_user_song "
    "WHERE session_id = 182 AND user_id = 10"
)

rows = session.execute(query)
for row in rows:
    print(row.item_in_session, row.artist_name, row.song_title, row.user_first_name, row.user_last_name)

0 Down To The Bone Keep On Keepin' On Sylvie Cruz
1 Three Drives Greece 2000 Sylvie Cruz
2 Sebastien Tellier Kilometer Sylvie Cruz
3 Lonnie Gordon Catch You Baby (Steve Pitron & Max Sanna Radio Edit) Sylvie Cruz


#### Query 3

Give me every user name (first and last) in my music app history who listened to the song 'All Hands Against His Own'.

In [15]:
# Create the table.
# song_title is the partition key (the column Query 3 filters on); user_id is the
# clustering column, so the table holds one row per (song, user) pair.
# (The run of blanks before PRIMARY KEY is kept so the statement text is unchanged.)
query = (
    "CREATE TABLE IF NOT EXISTS user_song "
    "(song_title text, user_first_name text, user_last_name text, user_id int, "
    "          PRIMARY KEY (song_title, user_id))"
)
session.execute(query)

<cassandra.cluster.ResultSet at 0x7f1ac9244110>

In [16]:
# Insert the data.
# The statement is prepared once, outside the loop, so it is not rebuilt and
# re-parsed for every row; each row then only supplies its bound values.
insert_user_song = session.prepare(
    "INSERT INTO user_song "
    "(song_title, user_first_name, user_last_name, user_id) "
    "VALUES (?, ?, ?, ?)"
)

# DataFrame columns listed in the same order as the table columns above.
user_song_columns = ['song', 'firstName', 'lastName', 'userId']

# itertuples(name=None) yields one plain tuple per row, ready to bind.
for params in df[user_song_columns].itertuples(index=False, name=None):
    session.execute(insert_user_song, params)

In [17]:
# Check result.
# Fetch only the first and last name, which is all Query 3 asks for.
query = (
    "SELECT user_first_name, user_last_name "
    "FROM user_song "
    "WHERE song_title = 'All Hands Against His Own'"
)

rows = session.execute(query)
for row in rows:
    print(row.user_first_name, row.user_last_name)

Jacqueline Lynch
Tegan Levine
Sara Johnson


#### Drop the tables before closing out the sessions

In [18]:
# IF EXISTS keeps this cleanup cell re-runnable: a table that was already dropped
# (or never created because an earlier cell was skipped) no longer raises an error.
session.execute("DROP TABLE IF EXISTS session_song")
session.execute("DROP TABLE IF EXISTS session_user_song")
session.execute("DROP TABLE IF EXISTS user_song")

<cassandra.cluster.ResultSet at 0x7f1abf4bb910>

#### Close the session and cluster connection

In [19]:
# Shut down the session first, then the cluster it was opened from.
for connection in (session, cluster):
    connection.shutdown()