In [36]:
import pandas as pd
import re

file_path = './datasets/movies_embedding.csv'
# Load the whole file; chain .head(100) onto read_csv to work on a sample.
df = pd.read_csv(file_path)

print(f'{len(df)} rows')

# Replace missing values with empty strings so the string handling applied
# to the 'date' column below never receives NaN.
df = df.fillna('')

#============================================
def reformat_date(date_str):
    """Normalise a ``MM/DD/YY`` date string, or return None if it is unusable.

    Args:
        date_str: Raw value taken from the 'date' column.

    Returns:
        The date re-rendered as zero-padded ``MM/DD/YY``, or None when the
        value is not a string, is blank, or does not parse in that format.
    """
    # Imported locally: this cell's own imports are only pandas and re, so
    # relying on a kernel-level ``datetime`` raises NameError on a fresh kernel.
    from datetime import datetime

    # Non-string cells (NaN, None, numbers) have no .strip(); treat as missing.
    if not isinstance(date_str, str) or date_str.strip() == '':
        return None
    try:
        date_obj = datetime.strptime(date_str, '%m/%d/%y')
    except ValueError:
        # Not a valid MM/DD/YY date.
        return None
    return date_obj.strftime('%m/%d/%y')

# Normalise every 'date' entry; values that cannot be parsed become None.
df['date'] = df['date'].apply(reformat_date)

# Keep only the rows whose date survived normalisation.
df = df.dropna(subset=['date'])
#============================================

def extract_year(title):
    """Return the first parenthesised four-digit number in ``title`` as an int.

    Returns None when the title contains no such ``(DDDD)`` group.
    """
    found = re.search(r'\((\d{4})\)', title)
    return int(found.group(1)) if found else None

# Build a temporary 'year' column from the parenthesised year in each title.
df['year'] = df['title'].apply(extract_year)

# A title without a year cannot be used to correct its date, so drop the row.
df = df.dropna(subset=['year'])

# Replace the year part of each date with the four-digit year from the title,
# keeping the leading "MM/DD/" (first six characters) of the existing date.
df['date'] = df.apply(
    lambda row: re.sub(
        r'\d{2}/\d{2}/\d{2,4}',
        f'{row["date"][:6]}{int(row["year"])}',
        row['date'],
    ),
    axis=1,
)

# The helper column is no longer needed.
df = df.drop(columns=['year'])



40179 rows


In [27]:
df.head(5)

Unnamed: 0,imdb_id,title,summary,date,genres,runtime,rating,votes,budget,revenue,language,adult,production,poster_link,metadata,metadata_vector
0,114709,Toy Story (1995),"Led by Woody, Andy's toys live happily in his ...",10/30/1995,"Animation, Adventure, Comedy",81.0,8.3,5415.0,30000000,373554033.0,en,False,Pixar Animation Studios,https://images-na.ssl-images-amazon.com/images...,The title of this film is 'Toy Story (1995)'. ...,"[-0.010617855004966259, 0.027610644698143005, ..."
1,113497,Jumanji (1995),When siblings Judy and Peter discover an encha...,12/15/1995,"Action, Adventure, Family",104.0,6.9,2413.0,65000000,262797249.0,en,False,"TriStar Pictures, Teitler Film, Interscope Com...",https://images-na.ssl-images-amazon.com/images...,The title of this film is 'Jumanji (1995)'. Th...,"[-0.004933763761073351, 0.09137537330389023, -..."
2,113228,Grumpier Old Men (1995),A family wedding reignites the ancient feud be...,12/22/1995,"Comedy, Romance",101.0,6.6,92.0,0,0.0,en,False,"Warner Bros., Lancaster Gate",https://images-na.ssl-images-amazon.com/images...,The title of this film is 'Grumpier Old Men (1...,"[-0.03661668300628662, 0.10619544982910156, -0..."
3,114885,Waiting to Exhale (1995),"Cheated on, mistreated and stepped on, the wom...",12/22/1995,"Comedy, Drama, Romance",127.0,5.7,34.0,16000000,81452156.0,en,False,Twentieth Century Fox Film Corporation,https://images-na.ssl-images-amazon.com/images...,The title of this film is 'Waiting to Exhale (...,"[-0.02201184816658497, 0.06293337047100067, -0..."
4,113041,Father of the Bride Part II (1995),Just when George Banks has recovered from his ...,02/10/1995,"Comedy, Family, Romance",106.0,5.9,173.0,0,76578911.0,en,False,"Sandollar Productions, Touchstone Pictures",https://images-na.ssl-images-amazon.com/images...,The title of this film is 'Father of the Bride...,"[0.011255434714257717, 0.05743606761097908, -0..."


In [37]:
from datetime import datetime, timezone

def convert_to_utc(date_str):
    """Render an ``MM/DD/YYYY`` date as a midnight ISO-8601 timestamp tagged UTC.

    Args:
        date_str: Date string with a four-digit year, e.g. ``'10/30/1995'``.

    Returns:
        A string such as ``'1995-10-30T00:00:00Z'``. If the input does not
        parse as a real calendar date, a diagnostic is printed and None is
        returned instead.
    """
    try:
        # strptime yields a naive datetime at 00:00:00; label it as UTC.
        midnight_utc = datetime.strptime(date_str, '%m/%d/%Y').replace(tzinfo=timezone.utc)
        return midnight_utc.strftime('%Y-%m-%dT%H:%M:%SZ')
    except ValueError:
        # Raised for impossible dates such as Feb 29 in a non-leap year.
        print("Error converting date:", date_str)
        return None

# Rewrite every date as a UTC timestamp string; failed conversions become None.
df['date'] = df['date'].apply(convert_to_utc)

# Discard the rows that convert_to_utc could not handle.
df = df.dropna(subset=['date'])

print(f'{len(df)} rows')
df.head(5)

Error converting date: 02/29/2011
Error converting date: 02/29/2013
Error converting date: 02/29/2011
Error converting date: 02/29/2015
39706 rows


Unnamed: 0,imdb_id,title,summary,date,genres,runtime,rating,votes,budget,revenue,language,adult,production,poster_link,metadata,metadata_vector
0,114709,Toy Story (1995),"Led by Woody, Andy's toys live happily in his ...",1995-10-30T00:00:00Z,"Animation, Adventure, Comedy",81.0,8.3,5415.0,30000000,373554033.0,en,False,Pixar Animation Studios,https://images-na.ssl-images-amazon.com/images...,The title of this film is 'Toy Story (1995)'. ...,"[-0.010617855004966259, 0.027610644698143005, ..."
1,113497,Jumanji (1995),When siblings Judy and Peter discover an encha...,1995-12-15T00:00:00Z,"Action, Adventure, Family",104.0,6.9,2413.0,65000000,262797249.0,en,False,"TriStar Pictures, Teitler Film, Interscope Com...",https://images-na.ssl-images-amazon.com/images...,The title of this film is 'Jumanji (1995)'. Th...,"[-0.004933763761073351, 0.09137537330389023, -..."
2,113228,Grumpier Old Men (1995),A family wedding reignites the ancient feud be...,1995-12-22T00:00:00Z,"Comedy, Romance",101.0,6.6,92.0,0,0.0,en,False,"Warner Bros., Lancaster Gate",https://images-na.ssl-images-amazon.com/images...,The title of this film is 'Grumpier Old Men (1...,"[-0.03661668300628662, 0.10619544982910156, -0..."
3,114885,Waiting to Exhale (1995),"Cheated on, mistreated and stepped on, the wom...",1995-12-22T00:00:00Z,"Comedy, Drama, Romance",127.0,5.7,34.0,16000000,81452156.0,en,False,Twentieth Century Fox Film Corporation,https://images-na.ssl-images-amazon.com/images...,The title of this film is 'Waiting to Exhale (...,"[-0.02201184816658497, 0.06293337047100067, -0..."
4,113041,Father of the Bride Part II (1995),Just when George Banks has recovered from his ...,1995-02-10T00:00:00Z,"Comedy, Family, Romance",106.0,5.9,173.0,0,76578911.0,en,False,"Sandollar Productions, Touchstone Pictures",https://images-na.ssl-images-amazon.com/images...,The title of this film is 'Father of the Bride...,"[0.011255434714257717, 0.05743606761097908, -0..."


In [40]:
import json
from datetime import datetime
import locale

def format_time(minutes_float):
    """Format a duration given in minutes as ``'<H>h <M>m'`` or ``'<M>m'``.

    Args:
        minutes_float: Number of minutes, or an empty string for "missing".

    Returns:
        None for an empty string. Otherwise the value is truncated with
        int() and rendered with an hours part only when hours > 0.
    """
    if minutes_float == '':
        return None

    hours, minutes = divmod(int(minutes_float), 60)
    return f"{hours}h {minutes}m" if hours > 0 else f"{minutes}m"

def format_as_dollars(number):
    """Format a value as a dollar amount, e.g. ``30000000`` -> ``'$30,000,000.00'``.

    The result no longer depends on the host locale. The previous
    implementation reset the process-wide locale on every call, produced
    different separators on different machines, and raised ValueError under
    the C/POSIX locale, where currency formatting is unavailable.

    Args:
        number: Anything int() accepts; fractional parts are truncated.

    Returns:
        The amount with a leading ``$``, comma thousands separators and a
        fixed ``.00`` suffix. Negative amounts carry the minus sign before
        the ``$``.
    """
    amount = int(number)
    sign = '-' if amount < 0 else ''
    # Group the integer itself (not a float) so very large amounts stay exact.
    return f"{sign}${abs(amount):,}.00"

def get_country_name(abbreviation):
    """Translate a two-letter code into an English language name.

    Despite the function's name, the lookup table holds language names
    (it is applied to the 'language' column). Codes that are not in the
    table map to an empty string.
    """
    language_by_code = {
        'en': 'English',
        'fr': 'French',
        'zh': 'Chinese',
        'it': 'Italian',
        'fa': 'Persian',
        'nl': 'Dutch',
        'de': 'German',
        'cn': 'Chinese',
        'ar': 'Arabic',
        'es': 'Spanish',
        'ru': 'Russian',
        'sv': 'Swedish',
        'ja': 'Japanese',
        'ko': 'Korean',
        'sr': 'Serbian',
        'bn': 'Bengali',
        'he': 'Hebrew',
        'pt': 'Portuguese',
        'wo': 'Wolof',
        'ro': 'Romanian',
        'hu': 'Hungarian',
        'cy': 'Welsh',
        'vi': 'Vietnamese',
    }
    if abbreviation in language_by_code:
        return language_by_code[abbreviation]
    return ''

def string_to_array(str):
    """Split a comma-separated string into a list of whitespace-trimmed items.

    An empty string yields ``['']`` (one empty item), not an empty list.
    The parameter name shadows the built-in ``str`` inside this function.
    """
    return [piece.strip() for piece in str.split(',')]

# Human-readable formatting is disabled; the columns stay numeric instead.
# df['runtime'] = df['runtime'].apply(format_time)
# df['budget'] = df['budget'].apply(format_as_dollars)
# df['revenue'] = df['revenue'].apply(format_as_dollars)

# Runtime: anything non-numeric is coerced to NaN and then stored as 0.
df['runtime'] = pd.to_numeric(df['runtime'], errors='coerce').fillna(0).astype(int)
for money_col in ('budget', 'revenue'):
    df[money_col] = df[money_col].astype(int)

# Language codes become language names; comma-separated text becomes lists.
df['language'] = df['language'].apply(get_country_name)
for list_col in ('genres', 'production'):
    df[list_col] = df[list_col].apply(string_to_array)
df['votes'] = df['votes'].astype(int)
df.head(5)

Unnamed: 0,imdb_id,title,summary,date,genres,runtime,rating,votes,budget,revenue,language,adult,production,poster_link,metadata,metadata_vector
0,114709,Toy Story (1995),"Led by Woody, Andy's toys live happily in his ...",1995-10-30T00:00:00Z,"[Animation, Adventure, Comedy]",81,8.3,5415,30000000,373554033,English,False,[Pixar Animation Studios],https://images-na.ssl-images-amazon.com/images...,The title of this film is 'Toy Story (1995)'. ...,"[-0.010617855004966259, 0.027610644698143005, ..."
1,113497,Jumanji (1995),When siblings Judy and Peter discover an encha...,1995-12-15T00:00:00Z,"[Action, Adventure, Family]",104,6.9,2413,65000000,262797249,English,False,"[TriStar Pictures, Teitler Film, Interscope Co...",https://images-na.ssl-images-amazon.com/images...,The title of this film is 'Jumanji (1995)'. Th...,"[-0.004933763761073351, 0.09137537330389023, -..."
2,113228,Grumpier Old Men (1995),A family wedding reignites the ancient feud be...,1995-12-22T00:00:00Z,"[Comedy, Romance]",101,6.6,92,0,0,English,False,"[Warner Bros., Lancaster Gate]",https://images-na.ssl-images-amazon.com/images...,The title of this film is 'Grumpier Old Men (1...,"[-0.03661668300628662, 0.10619544982910156, -0..."
3,114885,Waiting to Exhale (1995),"Cheated on, mistreated and stepped on, the wom...",1995-12-22T00:00:00Z,"[Comedy, Drama, Romance]",127,5.7,34,16000000,81452156,English,False,[Twentieth Century Fox Film Corporation],https://images-na.ssl-images-amazon.com/images...,The title of this film is 'Waiting to Exhale (...,"[-0.02201184816658497, 0.06293337047100067, -0..."
4,113041,Father of the Bride Part II (1995),Just when George Banks has recovered from his ...,1995-02-10T00:00:00Z,"[Comedy, Family, Romance]",106,5.9,173,0,76578911,English,False,"[Sandollar Productions, Touchstone Pictures]",https://images-na.ssl-images-amazon.com/images...,The title of this film is 'Father of the Bride...,"[0.011255434714257717, 0.05743606761097908, -0..."


In [41]:
print(f'{len(df)} rows')

39706 rows


In [42]:
# Output file name; it has no directory part, so the file is written relative
# to the current working directory.
csv_file_name = "movies_with_embedding.csv"

# Write the cleaned frame to CSV without the DataFrame index.
# NOTE(review): the frame displayed above still has an 'Unnamed: 0' column,
# which will be exported as well — confirm that is wanted.
# NOTE(review): 'genres' and 'production' hold Python lists at this point, so
# they are presumably written as their string representation — verify that
# whatever reads this file expects that.
df.to_csv(csv_file_name, index=False)