In [1]:
import re
import datetime
import pandas as pd
import numpy as np
# from tqdm import tqdm
import os
# import time
import ast

In [2]:
# Step the working directory up one level; the cells below read their inputs
# through paths relative to it ("./cache/...").
# NOTE: the target is relative, so every re-run of this cell moves up one more
# level. Restart the kernel before running the notebook again.
os.chdir("../")

In [3]:
def load_raw_data(file_path: str = None):
    """Read a scraped CSV and normalise its URL and list-valued columns.

    Processing steps:

    1. ``url`` is reduced to a bare identifier: the query string, the ``?``,
       every ``/`` and the path words ``title``, ``ratings`` and ``name`` are
       stripped.
    2. Every column whose name appears in ``list_cols`` is parsed cell by cell
       with ``ast.literal_eval``. Cells that are not strings (e.g. NaN) or that
       cannot be parsed are left exactly as read, so a single bad cell no
       longer prevents the rest of the column from being converted.
    3. Every column whose name appears in ``url_cols`` has the same kind of
       clean-up applied to each string inside its list cells. Cells that are
       not lists/tuples are left untouched.

    Args:
        file_path: Path of the CSV file; passed straight to ``pandas.read_csv``.
            The file must contain a ``url`` column.

    Returns:
        pandas.DataFrame: the loaded frame with the columns above normalised.

    Raises:
        KeyError: if the CSV has no ``url`` column.
    """
    df = pd.read_csv(file_path)
    # Raw strings keep ``\?`` and ``\/`` as regex escapes instead of (invalid)
    # Python string escapes.
    df["url"] = df["url"].str.replace(
        r"(?<=\?)(.*)|(\?)|(title)|(ratings)|(name)|(\/)", "", regex=True
    )
    # df.rename(columns={'id':'rank'}, inplace=True)

    list_cols = (
        "director",
        "directors",
        "stars",
        "star",
        "top_cast",
        "writers",
        "writer",
        "top_cast_url",
        "director_url",
        "directors_url",
        "writer_url",
        "writers_url",
        "stars_url",
        "star_url",
    )

    url_cols = (
        "top_cast_url",
        "director_url",
        "directors_url",
        "writer_url",
        "writers_url",
        "stars_url",
        "star_url",
    )

    # Compiled once; applied to every URL of every list cell below.
    person_url_noise = re.compile(r"(?<=\?)(.*)|(\?)|(name)|(\/)")

    def _parse_cell(value):
        # Only strings can hold a serialised Python literal.
        if not isinstance(value, str):
            return value
        try:
            return ast.literal_eval(value)
        except (ValueError, SyntaxError, TypeError):
            # Not a valid literal: keep the raw text rather than failing the column.
            return value

    def _clean_urls(value):
        # Iterating a plain string would split it into characters, and NaN is
        # not iterable at all, so only real sequences are cleaned.
        if not isinstance(value, (list, tuple)):
            return value
        return [
            person_url_noise.sub("", item) if isinstance(item, str) else item
            for item in value
        ]

    for col in df.columns:
        if col in list_cols:
            df[col] = df[col].apply(_parse_cell)

        if col in url_cols:
            df[col] = df[col].apply(_clean_urls)
    return df


def _get_stars_url(top_cast, top_cast_url, stars):
    return [v for k, v in zip(top_cast, top_cast_url) if k in stars]


def transform_writers(df):
    """Build the de-duplicated table of writers (``name``, ``url``).

    Names and URLs are paired row by row from ``df["writers"]`` and
    ``df["writers_url"]``, so a name is always matched with a URL from the same
    movie. Rows where either cell is not a list/tuple (for example NaN) are
    skipped; if the two lists of a row differ in length, the unpaired tail is
    dropped.

    Args:
        df: Frame holding list-valued ``writers`` and ``writers_url`` columns.

    Returns:
        pandas.DataFrame: unique (name, url) pairs, with any ``?`` and the query
        string after it removed from ``url``.
    """
    pairs = [
        (name, url)
        for names, urls in zip(df["writers"], df["writers_url"])
        if isinstance(names, (list, tuple)) and isinstance(urls, (list, tuple))
        for name, url in zip(names, urls)
    ]
    writers_df = pd.DataFrame(pairs, columns=["name", "url"])
    # Raw string: ``\?`` must reach the regex engine as an escaped question mark.
    writers_df["url"] = writers_df["url"].str.replace(
        r"(?<=\?)(.*)|(\?)", "", regex=True
    )

    return writers_df.drop_duplicates()


def transform_directors(df):
    """Build the de-duplicated table of directors (``name``, ``url``).

    Names and URLs are paired row by row from ``df["directors"]`` and
    ``df["directors_url"]``, so a name is always matched with a URL from the
    same movie. Rows where either cell is not a list/tuple (for example NaN)
    are skipped; if the two lists of a row differ in length, the unpaired tail
    is dropped.

    Args:
        df: Frame holding list-valued ``directors`` and ``directors_url`` columns.

    Returns:
        pandas.DataFrame: unique (name, url) pairs, with any ``?`` and the query
        string after it removed from ``url``.
    """
    pairs = [
        (name, url)
        for names, urls in zip(df["directors"], df["directors_url"])
        if isinstance(names, (list, tuple)) and isinstance(urls, (list, tuple))
        for name, url in zip(names, urls)
    ]
    directors_df = pd.DataFrame(pairs, columns=["name", "url"])
    # Raw string: ``\?`` must reach the regex engine as an escaped question mark.
    directors_df["url"] = directors_df["url"].str.replace(
        r"(?<=\?)(.*)|(\?)", "", regex=True
    )

    return directors_df.drop_duplicates()


def transform_actors(df):
    """Build the de-duplicated table of cast members (``name``, ``url``).

    Names and URLs are paired row by row from ``df["top_cast"]`` and
    ``df["top_cast_url"]``, so a name is always matched with a URL from the
    same movie. Rows where either cell is not a list/tuple (for example NaN)
    are skipped; if the two lists of a row differ in length, the unpaired tail
    is dropped.

    Args:
        df: Frame holding list-valued ``top_cast`` and ``top_cast_url`` columns.

    Returns:
        pandas.DataFrame: unique (name, url) pairs, with any ``?`` and the query
        string after it removed from ``url``.
    """
    pairs = [
        (name, url)
        for names, urls in zip(df["top_cast"], df["top_cast_url"])
        if isinstance(names, (list, tuple)) and isinstance(urls, (list, tuple))
        for name, url in zip(names, urls)
    ]
    actors_df = pd.DataFrame(pairs, columns=["name", "url"])
    # Raw string: ``\?`` must reach the regex engine as an escaped question mark.
    actors_df["url"] = actors_df["url"].str.replace(
        r"(?<=\?)(.*)|(\?)", "", regex=True
    )

    return actors_df.drop_duplicates()


def transform_stars(df):
    """Build the de-duplicated table of star actors (``name``, ``url``).

    A star's URL is looked up through the cast list: for every movie the pairs
    of ``top_cast`` / ``top_cast_url`` whose name appears in ``stars`` are kept.
    Name and URL are taken from the same pair, so they cannot drift apart when
    ``stars`` is ordered differently from ``top_cast`` or names a person who is
    missing from it (such a person has no URL and is left out).

    Side effect:
        ``df["stars_url"]`` is (over)written with the per-movie list of
        matching cast URLs, in ``top_cast`` order and without any clean-up.
        ``transform_actors_lookup`` reads that column.

    Args:
        df: Frame with list-valued ``top_cast``, ``top_cast_url`` and ``stars``
            columns. A row in which any of the three is not a list/tuple
            contributes no stars.

    Returns:
        pandas.DataFrame: unique (name, url) pairs, with any ``?`` and the query
        string after it removed from ``url``.
    """

    def _star_pairs(cast, cast_urls, star_names):
        # NaN / unparsed cells cannot be searched; treat them as "no stars".
        if not all(
            isinstance(cell, (list, tuple)) for cell in (cast, cast_urls, star_names)
        ):
            return []
        return [(name, url) for name, url in zip(cast, cast_urls) if name in star_names]

    pairs_per_movie = [
        _star_pairs(cast, cast_urls, star_names)
        for cast, cast_urls, star_names in zip(
            df["top_cast"], df["top_cast_url"], df["stars"]
        )
    ]

    # Explicit index keeps the new column aligned with df whatever its index is.
    df["stars_url"] = pd.Series(
        [[url for _, url in pairs] for pairs in pairs_per_movie],
        index=df.index,
        dtype=object,
    )

    stars_df = pd.DataFrame(
        [pair for pairs in pairs_per_movie for pair in pairs],
        columns=["name", "url"],
    )
    # Raw string: ``\?`` must reach the regex engine as an escaped question mark.
    stars_df["url"] = stars_df["url"].str.replace(r"(?<=\?)(.*)|(\?)", "", regex=True)

    return stars_df.drop_duplicates()


def transform_actors_lookup(df):
    """Build the movie-to-actor link table with an ``is_star`` flag.

    ``top_cast_url`` is exploded to one row per (movie, cast member); a row is
    flagged ``is_star`` when the same (movie, actor) pair also occurs in the
    exploded ``stars_url`` column.

    The star links are de-duplicated and stripped of missing URLs before the
    merge, so the result has exactly one row per exploded cast entry:
    repeated star links cannot multiply cast rows, and the NaN produced by
    exploding an empty list cannot match another NaN and be flagged as a star.

    Args:
        df: Frame with ``url``, ``top_cast_url`` and ``stars_url`` columns.

    Returns:
        pandas.DataFrame: columns ``movie_url``, ``actor_url`` and boolean
        ``is_star``.
    """
    cast_links = df[["url", "top_cast_url"]].explode("top_cast_url")
    cast_links.columns = ["movie_url", "actor_url"]

    star_links = df[["url", "stars_url"]].explode("stars_url")
    star_links.columns = ["movie_url", "actor_url"]
    star_links = (
        star_links.dropna(subset=["actor_url"]).drop_duplicates().assign(is_star=True)
    )

    actor_movie_df = cast_links.merge(
        star_links, how="left", on=["movie_url", "actor_url"]
    )
    # After the left merge ``is_star`` is True where a star link matched and
    # missing elsewhere; notna() yields a real bool column without relying on
    # fillna's deprecated object-dtype downcasting.
    actor_movie_df["is_star"] = actor_movie_df["is_star"].notna()

    return actor_movie_df


def transform_directors_lookup(df):
    """Return one (``movie_url``, ``director_url``) row per entry of ``directors_url``."""
    exploded = df[["url", "directors_url"]].explode("directors_url")
    return exploded.rename(
        columns={"url": "movie_url", "directors_url": "director_url"}
    )


def transform_writers_lookup(df):
    """Return one (``movie_url``, ``writer_url``) row per entry of ``writers_url``."""
    exploded = df[["url", "writers_url"]].explode("writers_url")
    return exploded.rename(columns={"url": "movie_url", "writers_url": "writer_url"})


def transform_genres_lookup(df):
    """Return one (``movie_url``, ``genre``) row per entry of ``genres``.

    NOTE(review): ``genres`` is not among the columns ``load_raw_data`` parses
    into lists; if the CSV stores it as text, each movie stays on a single row
    here. Confirm against the data.
    """
    exploded = df[["url", "genres"]].explode("genres")
    return exploded.rename(columns={"url": "movie_url", "genres": "genre"})

def transform_country_lookup(df):
    """Return one (``movie_url``, ``country``) row per entry of ``country``.

    NOTE(review): ``country`` is not among the columns ``load_raw_data`` parses
    into lists; if the CSV stores it as text, each movie stays on a single row
    here. Confirm against the data.
    """
    exploded = df[["url", "country"]].explode("country")
    return exploded.rename(columns={"url": "movie_url"})


def transform_language_lookup(df):
    """Return one (``movie_url``, ``language``) row per entry of ``language``.

    NOTE(review): ``language`` is not among the columns ``load_raw_data`` parses
    into lists; if the CSV stores it as text, each movie stays on a single row
    here. Confirm against the data.
    """
    exploded = df[["url", "language"]].explode("language")
    return exploded.rename(columns={"url": "movie_url"})


def transform_movies(df):
    """Return the scalar, movie-level columns of ``df`` in a fixed order.

    Raises:
        KeyError: if any of the selected columns is missing from ``df``.
    """
    identity_cols = ["rank", "url", "name"]
    audience_cols = [
        "popularity",
        "rating",
        "rating_count",
        "user_review_count",
        "critic_review_count",
    ]
    money_cols = ["budget", "revenue_usa", "revenue_usa_opening", "revenue_world"]
    schedule_cols = ["runtime", "opening_date", "release_date"]

    movie_cols = identity_cols + audience_cols + money_cols + schedule_cols
    return df[movie_cols]

def _check_role(person:str, role:list):
    if person.strip() in [star.strip() for star in role]:
        return True
    else:
        return False

In [4]:
# Movies table: read from the cache directory (relative to the working
# directory set above) and normalised by load_raw_data.
df = load_raw_data("./cache/movies.csv")

In [5]:
# Load both ratings CSVs through the same cleaning routine as the movies file.
rating_dist, rating_demo = (
    load_raw_data(csv_path)
    for csv_path in ('./cache/rating_dist.csv', './cache/rating_demo.csv')
)

In [6]:
# Derive the per-entity tables and the movie-to-entity link tables from the
# movies frame.
# Order matters: transform_stars writes df["stars_url"], and
# transform_actors_lookup reads that column, so keep `stars` before `actor_movie`.
writers = transform_writers(df)
directors = transform_directors(df)
actors = transform_actors(df)
stars = transform_stars(df)
actor_movie = transform_actors_lookup(df)
director_movie = transform_directors_lookup(df)
writer_movie = transform_writers_lookup(df)
genre_movie = transform_genres_lookup(df)
country_movie = transform_country_lookup(df)
language_movie = transform_language_lookup(df)
movies_final = transform_movies(df)


In [7]:
# Load the three filmography CSVs (actors, directors, writers) with the shared loader.
actor_filmo, director_filmo, writer_filmo = [
    load_raw_data(csv_path)
    for csv_path in (
        './cache/actor_filmo.csv',
        './cache/director_filmo.csv',
        './cache/writer_filmo.csv',
    )
]

In [8]:
# Attach each filmography row's actor name by looking the row's URL up in `actors`.
# A URL-keyed map keeps the result aligned with `actor_filmo` row for row. Assigning
# the output of a merge instead relies on index alignment, and a URL listed in
# `actors` under more than one name adds rows to the merge, shifting every later
# name onto the wrong filmography row. The first name seen for a URL is used.
actor_name_by_url = actors.drop_duplicates(subset='url').set_index('url')['name']
actor_filmo['actor_name'] = actor_filmo['url'].map(actor_name_by_url)

# A title counts as a starring credit when the actor's own name is in its `stars` list.
actor_filmo['is_star'] = actor_filmo.apply(lambda x: _check_role(x.actor_name, x.stars), axis=1)
# director_filmo['is_star'] = False
# writer_filmo['is_star'] = False

cols = ['name','year','cert','runtime','genres','rating','rating_count','revenue_world','url']

# NOTE: these rebinds drop the `stars` column read above, so this cell cannot be
# re-run on its own; re-run the filmography load cell first.
actor_filmo = actor_filmo[cols + ['is_star']]
director_filmo = director_filmo[cols]
writer_filmo = writer_filmo[cols]
# filmo = pd.concat(
#     [
#         actor_filmo[cols], director_filmo[cols], writer_filmo[cols]
#     ], axis=0
# )
# filmo.rename(columns={'url':'crew_url'}, inplace=True)

In [9]:
# Check the dtypes and non-null counts of the trimmed actor filmography frame.
actor_filmo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16805 entries, 0 to 16804
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           16805 non-null  object 
 1   year           16656 non-null  object 
 2   cert           7479 non-null   object 
 3   runtime        16380 non-null  float64
 4   genres         16687 non-null  object 
 5   rating         16307 non-null  float64
 6   rating_count   16307 non-null  float64
 7   revenue_world  8549 non-null   float64
 8   url            16805 non-null  object 
 9   is_star        16805 non-null  bool   
dtypes: bool(1), float64(4), object(5)
memory usage: 1.2+ MB
