import sys
sys.path.append("/Users/tamara/Documents/Projects/movie-chatbot")

In [2]:
import pandas as pd
from pathlib import Path

# Movies Titles

In [3]:
import polars as pl

path = Path("/Users/tamara/Documents/Projects/movie-chatbot/data/raw")

movies = pl.read_csv(path / "movies_context_*.csv").drop("")

In [4]:
movies.columns

['Title',
 'Release date',
 'Genre',
 'Runtime',
 'Language',
 'Link',
 'parse_ts',
 'TitleId',
 'SourceYear']

In [5]:
movies.shape

(610, 9)

In [6]:
def fix_shifted_data(movies: pl.DataFrame) -> pl.DataFrame:
    is_shifted = pl.col("Release date").str.replace(r'(\[\d+\])+', '').str.strip_chars().str.strptime(dtype=pl.Datetime, format='%B %d, %Y', strict=False).is_null()
    shifted_data = movies.filter(is_shifted).to_pandas()
    not_shifted_data = movies.filter(~is_shifted).to_pandas()

    shifted_data.iloc[:, 2:-3] = shifted_data.iloc[:, 1:-4]
    shifted_data.iloc[:, 1] = None

    return pl.from_pandas(pd.concat([
        shifted_data,
        not_shifted_data
    ]))


In [34]:
runtime_pattern = "(?:(\d+)\s?h)?(?:\s?(\d+)\s?min)?"

movies_clean = fix_shifted_data(
    movies.filter(pl.col("Title") != "Awaiting release")
).with_columns(
    # pl.col("Title").alias("title"),
    # pl.col("Title").alias("title"),
    
    pl.col("Release date").str.replace(r'(\[\d+\])+', '').str.strip_chars().str.strptime(dtype=pl.Datetime, format='%B %d, %Y', strict=False).alias("release_date"),
    pl.col("parse_ts").str.to_datetime().alias("parse_ts"),
    pl.col("SourceYear").str.replace("since_", "").cast(pl.Int32).alias("year"),
    pl.col('Title').str.replace(r'(\[\d+\])+', '').str.strip_chars(),
    (pl.col("Runtime").str.extract_groups(runtime_pattern).struct["1"].cast(pl.Int32) * 60 +
    pl.col("Runtime").str.extract_groups(runtime_pattern).struct["2"].cast(pl.Int32).fill_null(0)).alias("runtime_min"),
).sort(["year", "TitleId"]).with_columns(
    pl.col("release_date").forward_fill()
).with_columns(
    (pl.col('Title') + pl.lit(" is a ") + pl.col('Genre') + pl.lit(" Netflix movie in ") + pl.col('Language') +
      + pl.lit(" language that was released on ") + pl.col('release_date').dt.strftime("%Y-%m-%d")).alias('text'),
    pl.col("runtime_min").is_null().alias("is_runtime_null"),
    pl.col("runtime_min").fill_null(pl.median("runtime_min")),
)#.select(["Title", "Genre", "Language", "SourceYear", "TitleId", "year", "release_date", "runtime_min", "text", "parse_ts"])

In [35]:
movies_clean

Title,Release date,Genre,Runtime,Language,Link,parse_ts,TitleId,SourceYear,release_date,year,runtime_min,text,is_runtime_null
str,str,str,str,str,str,datetime[μs],i64,str,datetime[μs],i32,f64,str,bool
"""What Happened to Mr. Cha?""","""January 1, 2021""","""Comedy""","""1 h 42 min""","""Korean""","""/wiki/What_Happened_to_Mr._Cha…",2024-10-14 00:00:00,0,"""2021""",2021-01-01 00:00:00,2021,102.0,"""What Happened to Mr. Cha? is a…",false
"""Pieces of a Woman""","""January 7, 2021""","""Drama""","""2 h 6 min""","""English""","""/wiki/Pieces_of_a_Woman""",2024-10-14 00:00:00,1,"""2021""",2021-01-07 00:00:00,2021,126.0,"""Pieces of a Woman is a Drama N…",false
"""Stuck Apart""","""January 8, 2021""","""Drama""","""1 h 36 min""","""Turkish""","""/wiki/Stuck_Apart""",2024-10-14 00:00:00,2,"""2021""",2021-01-08 00:00:00,2021,96.0,"""Stuck Apart is a Drama Netflix…",false
"""The Heartbreak Club""","""January 14, 2021""","""Comedy drama""","""1 h 41 min""","""Indonesian""","""/wiki/The_Heartbreak_Club""",2024-10-14 00:00:00,3,"""2021""",2021-01-14 00:00:00,2021,101.0,"""The Heartbreak Club is a Comed…",false
"""Double Dad""","""January 15, 2021""","""Comedy drama""","""1 h 45 min""","""Portuguese""","""/wiki/Double_Dad""",2024-10-14 00:00:00,4,"""2021""",2021-01-15 00:00:00,2021,105.0,"""Double Dad is a Comedy drama N…",false
…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""That Christmas""","""December 4, 2024[21][17]""","""CG animation""","""1 h 32 min""","""English""","""/wiki/That_Christmas""",2024-10-14 00:00:00,124,"""since_2024""",2024-12-04 00:00:00,2024,92.0,"""That Christmas is a CG animati…",false
"""Carry-On""","""December 13, 2024[17]""","""Action thriller""","""1 h 58 min""","""English""","""/wiki/Carry-On""",2024-10-14 00:00:00,125,"""since_2024""",2024-12-13 00:00:00,2024,118.0,"""Carry-On is a Action thriller …",false
"""The Six Triple Eight""","""December 20, 2024[17]""","""Period drama""","""TBA""","""English""","""/wiki/The_Six_Triple_Eight""",2024-10-14 00:00:00,126,"""since_2024""",2024-12-20 00:00:00,2024,105.0,"""The Six Triple Eight is a Peri…",true
"""Back in Action""","""January 17, 2025[17][43]""","""Action comedy""","""TBA""","""English""","""/wiki/Back_in_Action""",2024-10-14 00:00:00,127,"""since_2024""",2025-01-17 00:00:00,2024,105.0,"""Back in Action is a Action com…",true


In [36]:
movies_clean.null_count()

Title,Release date,Genre,Runtime,Language,Link,parse_ts,TitleId,SourceYear,release_date,year,runtime_min,text,is_runtime_null
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,92,0,0,0,155,0,0,0,0,0,0,0,0


In [37]:
movies_clean.filter(pl.col("Title") == "The Platform 2")

Title,Release date,Genre,Runtime,Language,Link,parse_ts,TitleId,SourceYear,release_date,year,runtime_min,text,is_runtime_null
str,str,str,str,str,str,datetime[μs],i64,str,datetime[μs],i32,f64,str,bool
"""The Platform 2""",,"""Science fiction""","""1 h 40 min""","""Spanish""","""/wiki/The_Platform_2""",2024-10-14 00:00:00,99,"""since_2024""",2024-10-04 00:00:00,2024,100.0,"""The Platform 2 is a Science fi…",False


In [38]:
movies_clean.filter(pl.col("is_runtime_null"))

Title,Release date,Genre,Runtime,Language,Link,parse_ts,TitleId,SourceYear,release_date,year,runtime_min,text,is_runtime_null
str,str,str,str,str,str,datetime[μs],i64,str,datetime[μs],i32,f64,str,bool
"""Justice""","""October 16, 2024[4]""","""Crime drama""","""TBA""","""Polish""","""#cite_note-October_2024-2""",2024-10-14 00:00:00,104,"""since_2024""",2024-10-16 00:00:00,2024,105.0,"""Justice is a Crime drama Netfl…",true
"""Happiness Is""","""October 18, 2024""","""Comedy drama""","""TBA""","""English""","""#cite_note-8""",2024-10-14 00:00:00,107,"""since_2024""",2024-10-18 00:00:00,2024,105.0,"""Happiness Is is a Comedy drama…",true
"""The Man Who Loved UFOs""","""October 18, 2024[9]""","""Comedy drama""","""TBA""","""Spanish""","""/wiki/The_Man_Who_Loved_UFOs""",2024-10-14 00:00:00,108,"""since_2024""",2024-10-18 00:00:00,2024,105.0,"""The Man Who Loved UFOs is a Co…",true
"""Do Patti""","""October 25, 2024[13]""","""Mystery thriller""","""TBA""","""Hindi""","""/wiki/Do_Patti_(film)""",2024-10-14 00:00:00,110,"""since_2024""",2024-10-25 00:00:00,2024,105.0,"""Do Patti is a Mystery thriller…",true
"""Hijack '93""","""October 25, 2024""","""Historical thriller""","""TBA""","""English""","""#cite_note-15""",2024-10-14 00:00:00,112,"""since_2024""",2024-10-25 00:00:00,2024,105.0,"""Hijack '93 is a Historical thr…",true
…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""The Merry Gentlemen""","""November 20, 2024[17]""","""Romantic comedy""","""TBA""","""English""","""/wiki/The_Merry_Gentlemen""",2024-10-14 00:00:00,119,"""since_2024""",2024-11-20 00:00:00,2024,105.0,"""The Merry Gentlemen is a Roman…",true
"""Our Little Secret""","""November 27, 2024[17]""","""Romantic comedy""","""TBA""","""English""","""/wiki/Our_Little_Secret_(film)""",2024-10-14 00:00:00,123,"""since_2024""",2024-11-27 00:00:00,2024,105.0,"""Our Little Secret is a Romanti…",true
"""The Six Triple Eight""","""December 20, 2024[17]""","""Period drama""","""TBA""","""English""","""/wiki/The_Six_Triple_Eight""",2024-10-14 00:00:00,126,"""since_2024""",2024-12-20 00:00:00,2024,105.0,"""The Six Triple Eight is a Peri…",true
"""Back in Action""","""January 17, 2025[17][43]""","""Action comedy""","""TBA""","""English""","""/wiki/Back_in_Action""",2024-10-14 00:00:00,127,"""since_2024""",2025-01-17 00:00:00,2024,105.0,"""Back in Action is a Action com…",true


# Movies Data

In [39]:
import json 

movies_data_l = []
meta_cols = ["parse_ts", "TitleId", "SourceYear", "Title"]

years = ["2021", "2022", "2023", "since_2024"]
for year in years:
    with open(path / f"movies_data_{year}.json", "r") as f:
        movies_data = json.load(f)

    for movie_info in movies_data:
        for topic, topic_text in movie_info.items():
            if topic not in meta_cols:
                movies_data_l.append({
                    "parse_ts": movie_info["parse_ts"],
                    "TitleId": movie_info["TitleId"],
                    "SourceYear": movie_info["SourceYear"],
                    "topic": topic,
                    "text": topic_text
                })


movies_clean_cols = ["TitleId", "SourceYear", "Title", "Genre", "runtime_min", "Language", "release_date"]
movies_data_df = pl.DataFrame(movies_data_l).with_columns(
    pl.col("parse_ts").str.to_datetime(),
).join(movies_clean.select(movies_clean_cols), on=["TitleId", "SourceYear"], how="left")
            

In [116]:
def meta_text():
    return  (pl.col("topic") + " info for movie '" + pl.col("Title") + "' (released on: " + pl.col("release_date").dt.strftime("%Y-%m-%d") +
         ", Genre: " + pl.col("Genre") + ", Language: " + pl.col("Language") + ", Runtime: " + pl.col("runtime_min").cast(pl.Utf8) + " minutes)"
    )

movies_data_df.with_columns(
    (meta_text() + ": " + pl.col("text")).alias("meta_text")
)["meta_text"][0]

"General info for movie 'Pieces of a Woman' (released on: 2021-01-07, Genre: Drama, Language: English, Runtime: 126.0 minutes): Pieces of a Woman is a 2020 drama film directed by KornÃ©l MundruczÃ³ , from a screenplay by Kata WÃ©ber . The film stars Vanessa Kirby , Shia LaBeouf , Molly Parker , Sarah Snook , Iliza Shlesinger , Benny Safdie , Jimmie Fails , and Ellen Burstyn as the family and associates of Martha (Kirby) involved in her traumatic childbirth, baby loss, and a subsequent court case against the midwife, Eva (Parker), whom Martha's mother Elizabeth (Burstyn) blames for the baby's death. Martin Scorsese and Sam Levinson served as executive producers, and the film was scored by Howard Shore . An international co-production of the United States and Canada, the film is partly based on MundruczÃ³ and WÃ©ber's stage play of the same name and explores themes of grief and loss. It premiered on September 4, 2020, at the 77th Venice International Film Festival , where Kirby won the V

In [40]:
movies_data_df

parse_ts,TitleId,SourceYear,topic,text,Title,Genre,runtime_min,Language,release_date
datetime[μs],i64,str,str,str,str,str,f64,str,datetime[μs]
2024-10-14 00:00:00,1,"""2021""","""General""","""Pieces of a Woman is a 2020 dr…","""Pieces of a Woman""","""Drama""",126.0,"""English""",2021-01-07 00:00:00
2024-10-14 00:00:00,1,"""2021""","""Plot""","""Martha and Sean, a young Bosto…","""Pieces of a Woman""","""Drama""",126.0,"""English""",2021-01-07 00:00:00
2024-10-14 00:00:00,1,"""2021""","""Cast""","""Vanessa Kirby as Martha Weiss …","""Pieces of a Woman""","""Drama""",126.0,"""English""",2021-01-07 00:00:00
2024-10-14 00:00:00,1,"""2021""","""Production-Play""","""The play Pieces of a Woman was…","""Pieces of a Woman""","""Drama""",126.0,"""English""",2021-01-07 00:00:00
2024-10-14 00:00:00,1,"""2021""","""Production-Development and the…","""The film Pieces of a Woman was…","""Pieces of a Woman""","""Drama""",126.0,"""English""",2021-01-07 00:00:00
…,…,…,…,…,…,…,…,…,…
2024-10-14 00:00:00,127,"""since_2024""","""Production""","""In June 2022, it was reported …","""Back in Action""","""Action comedy""",105.0,"""English""",2025-01-17 00:00:00
2024-10-14 00:00:00,127,"""since_2024""","""Release""","""Back in Action is scheduled to…","""Back in Action""","""Action comedy""",105.0,"""English""",2025-01-17 00:00:00
2024-10-14 00:00:00,128,"""since_2024""","""General""","""The Witcher: Sirens of the Dee…","""The Witcher: Sirens of the Dee…","""Animation""",105.0,"""English""",2025-02-11 00:00:00
2024-10-14 00:00:00,128,"""since_2024""","""Cast""","""Doug Cockle as Geralt of Rivia…","""The Witcher: Sirens of the Dee…","""Animation""",105.0,"""English""",2025-02-11 00:00:00


In [41]:
movies_data_df.select(
    pl.col("text").str.len_chars().alias("text_chars"),
    pl.col("text").str.split(" ").list.len().alias("text_words"),
    pl.col("text").str.len_bytes().alias("text_bytes")
).describe()

statistic,text_chars,text_words,text_bytes
str,f64,f64,f64
"""count""",2081.0,2081.0,2081.0
"""null_count""",0.0,0.0,0.0
"""mean""",1011.20519,178.009611,1014.234983
"""std""",1296.089787,225.549487,1298.309312
"""min""",11.0,4.0,11.0
"""25%""",263.0,48.0,265.0
"""50%""",499.0,89.0,501.0
"""75%""",1178.0,206.0,1179.0
"""max""",15584.0,2687.0,15653.0


In [42]:
movies_data_df.to_pandas()["release_date"].astype("int64") // 10**9 

0       1609977
1       1609977
2       1609977
3       1609977
4       1609977
         ...   
2076    1737072
2077    1737072
2078    1739232
2079    1739232
2080    1739232
Name: release_date, Length: 2081, dtype: int64

In [43]:
movies_data_df.with_columns(
    (pl.col("release_date").cast(pl.Int64) // 10**9).alias("release_date_ts"),
)

parse_ts,TitleId,SourceYear,topic,text,Title,Genre,runtime_min,Language,release_date,release_date_ts
datetime[μs],i64,str,str,str,str,str,f64,str,datetime[μs],i64
2024-10-14 00:00:00,1,"""2021""","""General""","""Pieces of a Woman is a 2020 dr…","""Pieces of a Woman""","""Drama""",126.0,"""English""",2021-01-07 00:00:00,1609977
2024-10-14 00:00:00,1,"""2021""","""Plot""","""Martha and Sean, a young Bosto…","""Pieces of a Woman""","""Drama""",126.0,"""English""",2021-01-07 00:00:00,1609977
2024-10-14 00:00:00,1,"""2021""","""Cast""","""Vanessa Kirby as Martha Weiss …","""Pieces of a Woman""","""Drama""",126.0,"""English""",2021-01-07 00:00:00,1609977
2024-10-14 00:00:00,1,"""2021""","""Production-Play""","""The play Pieces of a Woman was…","""Pieces of a Woman""","""Drama""",126.0,"""English""",2021-01-07 00:00:00,1609977
2024-10-14 00:00:00,1,"""2021""","""Production-Development and the…","""The film Pieces of a Woman was…","""Pieces of a Woman""","""Drama""",126.0,"""English""",2021-01-07 00:00:00,1609977
…,…,…,…,…,…,…,…,…,…,…
2024-10-14 00:00:00,127,"""since_2024""","""Production""","""In June 2022, it was reported …","""Back in Action""","""Action comedy""",105.0,"""English""",2025-01-17 00:00:00,1737072
2024-10-14 00:00:00,127,"""since_2024""","""Release""","""Back in Action is scheduled to…","""Back in Action""","""Action comedy""",105.0,"""English""",2025-01-17 00:00:00,1737072
2024-10-14 00:00:00,128,"""since_2024""","""General""","""The Witcher: Sirens of the Dee…","""The Witcher: Sirens of the Dee…","""Animation""",105.0,"""English""",2025-02-11 00:00:00,1739232
2024-10-14 00:00:00,128,"""since_2024""","""Cast""","""Doug Cockle as Geralt of Rivia…","""The Witcher: Sirens of the Dee…","""Animation""",105.0,"""English""",2025-02-11 00:00:00,1739232


In [44]:
movies_data_df.null_count()

parse_ts,TitleId,SourceYear,topic,text,Title,Genre,runtime_min,Language,release_date
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,0,0


In [58]:
dlt = pd.Series([pd.Timestamp(2023, 2, 23)]).astype("int64")[0] // 1e9
dlt

# pd.Timestamp(1970, 1, 1) + pd.Timedelta(seconds=dlt)
dlt

1677110400.0

In [69]:
pl.Series("a", [1, 2, 3]).cast(pl.String)

a
str
"""1"""
"""2"""
"""3"""


In [95]:
import datetime as dt

pl.Series("s", [dt.datetime(2023, 2, 23), dt.datetime(2023, 2, 24)]).dt.timestamp("us")[0] // 1e6

1677110400.0

In [99]:
pd.Series([dt.datetime(2023, 2, 23), dt.datetime(2023, 2, 24)]).astype("int64")[0] // 1e9

1677110400.0

In [102]:
import pytz

dt.datetime(2023, 2, 23, tzinfo=pytz.UTC).timestamp()

1677110400.0

In [101]:
pd.Timestamp(2023, 2, 23).timestamp()

1677110400.0

In [45]:
dlt = 1677110400.0

pd.Timestamp(1970, 1, 1) + pd.Timedelta(seconds=dlt)


Timestamp('2023-02-23 00:00:00')