In [1]:
import time

import numpy as np
import pandas as pd
import phoenixdb

from tqdm.notebook import tqdm

In [2]:
# Load the Amazon Prime catalogue, tag every row with its platform and
# namespace show_id so ids from different platforms stay distinguishable.
df_amzn = pd.read_csv("datasets/amazon_prime_titles.csv")
df_amzn["platform"] = "amazon_prime"
# regex=False: "s" is a literal, not a pattern. Being explicit avoids the
# version-dependent default of `regex` (and the FutureWarning pandas 1.x emits
# for single-character patterns).
df_amzn["show_id"] = df_amzn["show_id"].str.replace("s", "AMZN-", regex=False)

In [3]:
# Load the Disney+ catalogue, tag every row with its platform and namespace
# show_id so ids from different platforms stay distinguishable.
df_disney = pd.read_csv("datasets/disney_plus_titles.csv")
df_disney["platform"] = "disney"
# regex=False: "s" is a literal, not a pattern. Being explicit avoids the
# version-dependent default of `regex` (and the FutureWarning pandas 1.x emits
# for single-character patterns).
df_disney["show_id"] = df_disney["show_id"].str.replace("s", "DISNEY-", regex=False)

In [4]:
# Load the Hulu catalogue, tag every row with its platform and namespace
# show_id so ids from different platforms stay distinguishable.
df_hulu = pd.read_csv("datasets/hulu_titles.csv")
df_hulu["platform"] = "hulu"
# regex=False: "s" is a literal, not a pattern. Being explicit avoids the
# version-dependent default of `regex` (and the FutureWarning pandas 1.x emits
# for single-character patterns).
df_hulu["show_id"] = df_hulu["show_id"].str.replace("s", "HULU-", regex=False)

In [5]:
# Load the Netflix catalogue, tag every row with its platform and namespace
# show_id so ids from different platforms stay distinguishable.
df_nflx = pd.read_csv("datasets/netflix_titles.csv")
df_nflx["platform"] = "netflix"
# regex=False: "s" is a literal, not a pattern. Being explicit avoids the
# version-dependent default of `regex` (and the FutureWarning pandas 1.x emits
# for single-character patterns).
df_nflx["show_id"] = df_nflx["show_id"].str.replace("s", "NFLX-", regex=False)

In [6]:
# Stack the four platform frames into one, turn the comma-separated columns
# into lists, then emit one row per (cast member, country, genre) combination.
# NOTE: the following cell rebuilds `df` from the same four frames.
df = pd.concat(
    [df_amzn, df_disney, df_hulu, df_nflx],
    ignore_index=True,
)
for multi_valued_column in ("cast", "country", "listed_in"):
    df[multi_valued_column] = df[multi_valued_column].str.split(", ")

df = (
    df
    .explode(["cast"])
    .explode("country")
    .explode("listed_in")
)

In [7]:
# Build the unified, fully exploded title table that gets loaded into Phoenix.
df = pd.concat([df_amzn, df_disney, df_hulu, df_nflx], ignore_index=True)

# Parse "Month day, Year" strings; surrounding whitespace is stripped first so
# the strict format string matches.
df["date_added"] = df["date_added"].str.strip()
df["date_added"] = pd.to_datetime(df["date_added"], format="%B %d, %Y")

# Pull the number of minutes out of values such as "113 min". Rows whose
# duration does not match the pattern get a missing value.
# - raw string: "\d" in a normal string literal is an invalid escape sequence
# - expand=False: return a Series (one capture group) instead of a DataFrame
df["duration_min"] = df["duration"].str.extract(r"(\d+) min", expand=False)

# Swap NaN for None so missing values can be sent to the database as NULL.
# NOTE(review): the result of replace(np.nan, None) has differed between
# pandas releases - confirm on the pinned version.
df.replace(np.nan, None, inplace=True)

# Comma-separated columns -> lists -> one row per element.
df["cast"] = df["cast"].str.split(", ")
df["country"] = df["country"].str.split(", ")
df["listed_in"] = df["listed_in"].str.split(", ")

df = df.explode(["cast"]).explode("country").explode("listed_in")

# df.to_csv("datasets/union.csv")

In [8]:
# Shared Phoenix connection used by every cell below. autocommit is enabled,
# so no explicit commit calls are made anywhere in this notebook.
conn = phoenixdb.connect(
    "http://localhost:8765/",
    autocommit=True,
)

In [9]:
def create_table():
    """Create the ``titles`` table if it does not exist yet.

    The table has one row per exploded (title, actor, country, genre)
    combination, keyed by a synthetic integer ``id``.

    ``IF NOT EXISTS`` makes the call idempotent, so it can be left enabled
    when the notebook is re-run against a database that already has the table.
    The cursor is always closed, even when the statement fails.
    """
    query = """
        CREATE TABLE IF NOT EXISTS titles (
            id INTEGER PRIMARY KEY,
            show_id VARCHAR,
            type VARCHAR,
            title VARCHAR,
            director VARCHAR,
            actor VARCHAR,
            country VARCHAR,
            date_added DATE,
            release_year SMALLINT,
            rating VARCHAR,
            duration VARCHAR,
            duration_min SMALLINT,
            listed_in VARCHAR,
            description VARCHAR,
            platform VARCHAR
        )
    """

    cursor = conn.cursor()
    try:
        cursor.execute(query)
    finally:
        cursor.close()

# create_table()

In [10]:
# List the tables recorded in SYSTEM.CATALOG with TABLE_TYPE 'u'
# (presumably Phoenix's marker for user tables - confirm).
list_tables_query = "SELECT TABLE_NAME FROM SYSTEM.CATALOG WHERE TABLE_TYPE='u'"

cursor = conn.cursor()
cursor.execute(list_tables_query)
cursor.fetchall()

[['TITLES']]

In [11]:
def test_upsert():
    cursor = conn.cursor(cursor_factory=phoenixdb.cursor.DictCursor)

    query = """
        UPSERT INTO titles
        VALUES (1, 'AMZN-1', 'Movie', 'Title', 'Director', 'Actor', 'MY', '2022-03-01', 2022, '18+', '111 min', 111, 'Comedy', 'Description', 'amazon_prime')
    """
    cursor.execute(query)

    cursor.execute("SELECT * FROM titles")
    print(cursor.fetchall())

# test_upsert()

In [12]:
def test_clean_up():
    cursor = conn.cursor()
    cursor.execute("DELETE FROM titles WHERE id = 1")
    # cursor.execute("DELETE FROM titles")

    cursor.execute("SELECT * FROM titles")
    print(cursor.fetchall())

# test_clean_up()

In [13]:
def _to_db_value(value):
    """Return ``value`` in a form that is safe to bind as a query parameter.

    Missing markers (None, NaN, NaT, pd.NA) become None so they are stored as
    NULL, and NumPy scalars are unwrapped into plain Python scalars.
    """
    # is_scalar guards pd.isnull, which returns an array for list-likes.
    if value is None or (pd.api.types.is_scalar(value) and pd.isnull(value)):
        return None
    if isinstance(value, np.generic):
        return value.item()
    return value


def load_data(batch_size=1000):
    """Upsert every row of the global ``df`` into the ``titles`` table.

    Rows are sent with ``executemany`` in slices of at most ``batch_size``
    rows. The primary key is a running counter that starts at 1 on every call,
    so re-running the load overwrites the rows written by a previous run.

    Args:
        batch_size: Maximum number of rows per ``executemany`` call.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    query = "UPSERT INTO titles VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
    key = 1

    # Positional slicing replaces np.array_split, whose use on a DataFrame is
    # deprecated in recent pandas releases.
    for start in tqdm(range(0, len(df), batch_size)):
        batch = df.iloc[start:start + batch_size]
        tuples_batch = []

        for row in batch.itertuples(index=False):
            # DATE column: send a datetime.date, or NULL when the date is missing.
            date_added = _to_db_value(row.date_added)
            date = None if date_added is None else date_added.date()

            # duration_min comes out of a regex extract, so it is text or missing.
            duration_min = _to_db_value(row.duration_min)
            if duration_min is not None:
                duration_min = int(duration_min)

            release_year = _to_db_value(row.release_year)
            if release_year is not None:
                release_year = int(release_year)

            tuples_batch.append((
                key,
                _to_db_value(row.show_id),
                _to_db_value(row.type),
                _to_db_value(row.title),
                _to_db_value(row.director),
                _to_db_value(row.cast),
                _to_db_value(row.country),
                date,
                release_year,
                _to_db_value(row.rating),
                _to_db_value(row.duration),
                duration_min,
                _to_db_value(row.listed_in),
                _to_db_value(row.description),
                _to_db_value(row.platform),
            ))
            key += 1

        cursor = conn.cursor()
        try:
            cursor.executemany(query, tuples_batch)
        finally:
            cursor.close()

# load_data()

In [14]:
def movies_and_tv_shows():
    """Count distinct ``show_id`` values per ``type`` in the ``titles`` table.

    Returns:
        All result rows, as produced by a ``DictCursor``.
    """
    query = """
        SELECT type, COUNT(DISTINCT show_id) AS count
        FROM titles
        GROUP BY type
    """

    cursor = conn.cursor(cursor_factory=phoenixdb.cursor.DictCursor)
    try:
        cursor.execute(query)
        return cursor.fetchall()
    finally:
        # Release the cursor even when the query fails.
        cursor.close()

# Time the query with the monotonic performance counter: unlike the wall
# clock, it cannot jump when the system time is adjusted.
start = time.perf_counter_ns()
results = movies_and_tv_shows()
print("Execution time (ns):", time.perf_counter_ns() - start)

# Keep a CSV copy of the result and display it.
df_1 = pd.DataFrame(results)
df_1.to_csv("outputs/1.csv")
df_1

Execution time (ns): 1972184600


Unnamed: 0,TYPE,COUNT
0,Movie,16481
1,TV Show,6517


In [15]:
def movies_and_tv_shows_by_platform():
    """Count distinct ``show_id`` values per ``platform`` and ``type``.

    Returns:
        All result rows, as produced by a ``DictCursor``.
    """
    query = """
        SELECT platform, type, COUNT(DISTINCT show_id) AS count
        FROM titles
        GROUP BY platform, type
    """

    cursor = conn.cursor(cursor_factory=phoenixdb.cursor.DictCursor)
    try:
        cursor.execute(query)
        return cursor.fetchall()
    finally:
        # Release the cursor even when the query fails.
        cursor.close()

# Time the query with the monotonic performance counter: unlike the wall
# clock, it cannot jump when the system time is adjusted.
start = time.perf_counter_ns()
results = movies_and_tv_shows_by_platform()
print("Execution time (ns):", time.perf_counter_ns() - start)

# Keep a CSV copy of the result and display it.
df_2 = pd.DataFrame(results)
df_2.to_csv("outputs/2.csv")
df_2

Execution time (ns): 3410537700


Unnamed: 0,PLATFORM,TYPE,COUNT
0,amazon_prime,Movie,7814
1,amazon_prime,TV Show,1854
2,disney,Movie,1052
3,disney,TV Show,398
4,hulu,Movie,1484
5,hulu,TV Show,1589
6,netflix,Movie,6131
7,netflix,TV Show,2676


In [16]:
def movies_and_tv_shows_trend():
    """Count distinct ``show_id`` values per ``type`` and ``release_year``.

    Rows are ordered by type, then release year.

    Returns:
        All result rows, as produced by a ``DictCursor``.
    """
    query = """
        SELECT type, release_year, COUNT(DISTINCT show_id) AS count
        FROM titles
        GROUP BY type, release_year
        ORDER BY type, release_year
    """

    cursor = conn.cursor(cursor_factory=phoenixdb.cursor.DictCursor)
    try:
        cursor.execute(query)
        return cursor.fetchall()
    finally:
        # Release the cursor even when the query fails.
        cursor.close()

# Time the query with the monotonic performance counter: unlike the wall
# clock, it cannot jump when the system time is adjusted.
start = time.perf_counter_ns()
results = movies_and_tv_shows_trend()
print("Execution time (ns):", time.perf_counter_ns() - start)

# Keep a CSV copy of the result and display it.
df_3 = pd.DataFrame(results)
df_3.to_csv("outputs/3.csv")
df_3

Execution time (ns): 2754904900


Unnamed: 0,TYPE,RELEASE_YEAR,COUNT
0,Movie,1920,3
1,Movie,1922,2
2,Movie,1923,2
3,Movie,1924,1
4,Movie,1925,8
...,...,...,...
169,TV Show,2017,571
170,TV Show,2018,711
171,TV Show,2019,808
172,TV Show,2020,853


In [17]:
def movies_and_tv_shows_trend_by_platform():
    """Count distinct ``show_id`` values per ``platform``, ``type`` and ``release_year``.

    Rows are ordered by platform, then type, then release year.

    Returns:
        All result rows, as produced by a ``DictCursor``.
    """
    query = """
        SELECT platform, type, release_year, COUNT(DISTINCT show_id) AS count
        FROM titles
        GROUP BY platform, type, release_year
        ORDER BY platform, type, release_year
    """

    cursor = conn.cursor(cursor_factory=phoenixdb.cursor.DictCursor)
    try:
        cursor.execute(query)
        return cursor.fetchall()
    finally:
        # Release the cursor even when the query fails.
        cursor.close()

# Time the query with the monotonic performance counter: unlike the wall
# clock, it cannot jump when the system time is adjusted.
start = time.perf_counter_ns()
results = movies_and_tv_shows_trend_by_platform()
print("Execution time (ns):", time.perf_counter_ns() - start)

# Keep a CSV copy of the result and display it.
df_4 = pd.DataFrame(results)
df_4.to_csv("outputs/4.csv")
df_4

Execution time (ns): 3483366800


Unnamed: 0,PLATFORM,TYPE,RELEASE_YEAR,COUNT
0,amazon_prime,Movie,1920,3
1,amazon_prime,Movie,1922,2
2,amazon_prime,Movie,1923,1
3,amazon_prime,Movie,1924,1
4,amazon_prime,Movie,1925,8
...,...,...,...,...
531,netflix,TV Show,2017,265
532,netflix,TV Show,2018,380
533,netflix,TV Show,2019,397
534,netflix,TV Show,2020,436


In [18]:
def movies_and_tv_shows_by_rating():
    """Count distinct ``show_id`` values per ``rating``, most frequent first.

    NULL ratings are skipped, and so are ratings ending in ``min``, ``Season``
    or ``Seasons`` (presumably duration values that ended up in the rating
    column of the source data - confirm).

    Returns:
        All result rows, as produced by a ``DictCursor``.
    """
    query = """
        SELECT rating, COUNT(DISTINCT show_id) AS count
        FROM titles
        WHERE rating IS NOT NULL
            AND rating NOT LIKE '%min'
            AND rating NOT LIKE '%Season'
            AND rating NOT LIKE '%Seasons'
        GROUP BY rating
        ORDER BY count DESC
    """

    cursor = conn.cursor(cursor_factory=phoenixdb.cursor.DictCursor)
    try:
        cursor.execute(query)
        return cursor.fetchall()
    finally:
        # Release the cursor even when the query fails.
        cursor.close()

# Time the query with the monotonic performance counter: unlike the wall
# clock, it cannot jump when the system time is adjusted.
start = time.perf_counter_ns()
results = movies_and_tv_shows_by_rating()
print("Execution time (ns):", time.perf_counter_ns() - start)

# Keep a CSV copy of the result and display it.
df_5 = pd.DataFrame(results)
df_5.to_csv("outputs/5.csv")
df_5

Execution time (ns): 3686823700


Unnamed: 0,RATING,COUNT
0,TV-MA,3675
1,TV-14,3138
2,R,2154
3,13+,2117
4,TV-PG,1654
5,16+,1547
6,ALL,1268
7,18+,1243
8,PG-13,1112
9,PG,881


In [19]:
def movies_and_tv_shows_by_duration():
    """Count distinct ``show_id`` values per runtime bucket.

    Only rows with a non-NULL ``duration_min`` are considered. Buckets are
    0-30, 31-60, 61-120 and more than 120 minutes; both bounds of each closed
    range are inclusive. The CASE expression is repeated in GROUP BY rather
    than referenced through its alias.

    Returns:
        All result rows, as produced by a ``DictCursor``.
    """
    query = """
        SELECT
            CASE
                WHEN duration_min >= 0 AND duration_min <= 30 THEN 'Below 30 minutes'
                WHEN duration_min > 30 AND duration_min <= 60 THEN '30 - 60 minutes'
                WHEN duration_min > 60 AND duration_min <= 120 THEN '1 - 2 hours'
                WHEN duration_min > 120 then 'Above 2 hours'
            END AS duration,
            COUNT(DISTINCT show_id) AS count
        FROM titles
        WHERE duration_min IS NOT NULL
        GROUP BY
            CASE
                WHEN duration_min >= 0 AND duration_min <= 30 THEN 'Below 30 minutes'
                WHEN duration_min > 30 AND duration_min <= 60 THEN '30 - 60 minutes'
                WHEN duration_min > 60 AND duration_min <= 120 THEN '1 - 2 hours'
                WHEN duration_min > 120 then 'Above 2 hours'
            END
    """

    cursor = conn.cursor(cursor_factory=phoenixdb.cursor.DictCursor)
    try:
        cursor.execute(query)
        return cursor.fetchall()
    finally:
        # Release the cursor even when the query fails.
        cursor.close()

# Time the query with the monotonic performance counter: unlike the wall
# clock, it cannot jump when the system time is adjusted.
start = time.perf_counter_ns()
results = movies_and_tv_shows_by_duration()
print("Execution time (ns):", time.perf_counter_ns() - start)

# Keep a CSV copy of the result and display it.
df_6 = pd.DataFrame(results)
df_6.to_csv("outputs/6.csv")
df_6

Execution time (ns): 3546845400


Unnamed: 0,DURATION,COUNT
0,1 - 2 hours,11318
1,30 - 60 minutes,1468
2,Above 2 hours,2434
3,Below 30 minutes,779


In [20]:
def top_genres_by_platform_and_shows():
    """Return the ten most common ``listed_in`` values for each platform.

    One sub-query per platform ranks ``listed_in`` by the number of distinct
    ``show_id`` values and keeps the first ten; the four sub-queries are
    concatenated with UNION ALL in the order amazon_prime, disney, hulu,
    netflix.

    Returns:
        All result rows, as produced by a ``DictCursor``.
    """
    query = """
        SELECT * FROM (
            SELECT platform, listed_in, COUNT(DISTINCT show_id) AS count
            FROM titles
            WHERE platform = 'amazon_prime'
            GROUP BY platform, listed_in
            ORDER BY count DESC
            LIMIT 10
        )
        UNION ALL
        SELECT * FROM (
            SELECT platform, listed_in, COUNT(DISTINCT show_id) AS count
            FROM titles
            WHERE platform = 'disney'
            GROUP BY platform, listed_in
            ORDER BY count DESC
            LIMIT 10
        )
        UNION ALL
        SELECT * FROM (
            SELECT platform, listed_in, COUNT(DISTINCT show_id) AS count
            FROM titles
            WHERE platform = 'hulu'
            GROUP BY platform, listed_in
            ORDER BY count DESC
            LIMIT 10
        )
        UNION ALL
        SELECT * FROM (
            SELECT platform, listed_in, COUNT(DISTINCT show_id) AS count
            FROM titles
            WHERE platform = 'netflix'
            GROUP BY platform, listed_in
            ORDER BY count DESC
            LIMIT 10
        )
    """

    cursor = conn.cursor(cursor_factory=phoenixdb.cursor.DictCursor)
    try:
        cursor.execute(query)
        return cursor.fetchall()
    finally:
        # Release the cursor even when the query fails.
        cursor.close()

# Time the query with the monotonic performance counter: unlike the wall
# clock, it cannot jump when the system time is adjusted.
start = time.perf_counter_ns()
results = top_genres_by_platform_and_shows()
print("Execution time (ns):", time.perf_counter_ns() - start)

# Keep a CSV copy of the result and display it.
df_7 = pd.DataFrame(results)
df_7.to_csv("outputs/7.csv")
df_7

Execution time (ns): 15261970600


Unnamed: 0,PLATFORM,LISTED_IN,COUNT
0,amazon_prime,Drama,3687
1,amazon_prime,Comedy,2099
2,amazon_prime,Action,1657
3,amazon_prime,Suspense,1501
4,amazon_prime,Kids,1085
5,amazon_prime,Documentary,993
6,amazon_prime,Special Interest,980
7,amazon_prime,Horror,875
8,amazon_prime,Romance,674
9,amazon_prime,Animation,547


In [21]:
def top_countries_by_shows():
    """Return the ten countries with the most distinct ``show_id`` values.

    Rows with a NULL ``country`` are ignored.

    Returns:
        All result rows, as produced by a ``DictCursor``.
    """
    query = """
        SELECT country, COUNT(DISTINCT show_id) AS count
        FROM titles
        WHERE country IS NOT NULL
        GROUP BY country
        ORDER BY count DESC
        LIMIT 10
    """

    cursor = conn.cursor(cursor_factory=phoenixdb.cursor.DictCursor)
    try:
        cursor.execute(query)
        return cursor.fetchall()
    finally:
        # Release the cursor even when the query fails.
        cursor.close()

# Time the query with the monotonic performance counter: unlike the wall
# clock, it cannot jump when the system time is adjusted.
start = time.perf_counter_ns()
results = top_countries_by_shows()
print("Execution time (ns):", time.perf_counter_ns() - start)

# Keep a CSV copy of the result and display it.
df_8 = pd.DataFrame(results)
df_8.to_csv("outputs/8.csv")
df_8

Execution time (ns): 2787919100


Unnamed: 0,COUNTRY,COUNT
0,United States,6304
1,India,1299
2,United Kingdom,1172
3,Canada,645
4,Japan,615
5,France,475
6,Germany,293
7,South Korea,260
8,Spain,258
9,Australia,225


In [22]:
def top_directors_by_shows():
    """Return the ten ``director`` values with the most distinct ``show_id`` values.

    Rows with a NULL ``director`` are ignored. The column is grouped as
    stored, so a value naming several directors counts as one entry.

    Returns:
        All result rows, as produced by a ``DictCursor``.
    """
    query = """
        SELECT director, COUNT(DISTINCT show_id) AS count
        FROM titles
        WHERE director IS NOT NULL
        GROUP BY director
        ORDER BY count DESC
        LIMIT 10
    """

    cursor = conn.cursor(cursor_factory=phoenixdb.cursor.DictCursor)
    try:
        cursor.execute(query)
        return cursor.fetchall()
    finally:
        # Release the cursor even when the query fails.
        cursor.close()

# Time the query with the monotonic performance counter: unlike the wall
# clock, it cannot jump when the system time is adjusted.
start = time.perf_counter_ns()
results = top_directors_by_shows()
print("Execution time (ns):", time.perf_counter_ns() - start)

# Keep a CSV copy of the result and display it.
df_9 = pd.DataFrame(results)
df_9.to_csv("outputs/9.csv")
df_9

Execution time (ns): 2583367900


Unnamed: 0,DIRECTOR,COUNT
0,Mark Knight,113
1,Cannis Holder,61
2,Jay Chapman,46
3,Moonbug Entertainment,37
4,Arthur van Merwijk,30
5,Manny Rodriguez,27
6,Jay Karas,22
7,John English,20
8,Rajiv Chilaka,19
9,"Raúl Campos, Jan Suter",18


In [23]:
def top_actors_by_shows():
    """Return the ten actors with the most distinct ``show_id`` values.

    Rows with a NULL ``actor`` are ignored.

    Returns:
        All result rows, as produced by a ``DictCursor``.
    """
    query = """
        SELECT actor, COUNT(DISTINCT show_id) AS count
        FROM titles
        WHERE actor IS NOT NULL
        GROUP BY actor
        ORDER BY count DESC
        LIMIT 10
    """

    cursor = conn.cursor(cursor_factory=phoenixdb.cursor.DictCursor)
    try:
        cursor.execute(query)
        return cursor.fetchall()
    finally:
        # Release the cursor even when the query fails.
        cursor.close()

# Time the query with the monotonic performance counter: unlike the wall
# clock, it cannot jump when the system time is adjusted.
start = time.perf_counter_ns()
results = top_actors_by_shows()
print("Execution time (ns):", time.perf_counter_ns() - start)

# Keep a CSV copy of the result and display it.
df_10 = pd.DataFrame(results)
df_10.to_csv("outputs/10.csv")
df_10

Execution time (ns): 3313318200


Unnamed: 0,ACTOR,COUNT
0,Anupam Kher,60
1,Maggie Binkley,56
2,Amitabh Bachchan,47
3,Shah Rukh Khan,46
4,Jim Cummings,44
5,Nassar,43
6,Akshay Kumar,41
7,Paresh Rawal,39
8,Naseeruddin Shah,39
9,Danny Trejo,39
