In [1]:
# Number of leading entries kept from each movie's comma-separated `actors`
# string; the feature loop later in this notebook also divides its per-movie
# actor sums by this value.
TOP_K = 5

In [2]:
## Base features
# Builds one row per actor that appears (within the first TOP_K billed actors)
# in the filtered movie set, carrying aggregates of that actor's history as
# found in the additional-movies file, and writes the result to ./base.tsv.
import pandas as pd

mdf = pd.read_csv("../Common/filtered_final_movies_5.tsv", sep="\t")
amdf = pd.read_csv("../Common/additional_movies.tsv", sep="\t")


def _keep_top_k_actors(actors, k):
    """Truncate each comma-separated actor string to its first `k` entries.

    Uses vectorised string ops so a missing `actors` value stays missing
    instead of raising AttributeError as `x.split` would on a float NaN.
    """
    return actors.str.split(',').str[:k].str.join(',')


mdf['actors'] = _keep_top_k_actors(mdf['actors'], TOP_K)
amdf['actors'] = _keep_top_k_actors(amdf['actors'], TOP_K)

# Every distinct actor id referenced by the filtered movies.
mdl = mdf['actors'].str.split(',').explode().dropna().unique().tolist()
ddf = pd.DataFrame({'nconst': mdl})

# One row per (additional movie, actor) pair.
mmdf = (
    amdf[['tconst', 'actors', 'startYear', 'worldwide', 'profit']]
    .assign(actors=lambda frame: frame['actors'].str.split(','))
    .explode('actors')
    .reset_index(drop=True)
    .rename(columns={'actors': 'nconst'})
)

actor_summary = mmdf.groupby('nconst').agg(
    base_total_gross=('worldwide', 'sum'),
    base_year=('startYear', 'min'),
    base_num_movies=('tconst', 'nunique'),
    base_total_profit=('profit', 'sum'),
).reset_index()

# Left join keeps actors with no history in the additional movies; the join
# key is spelled out so an accidental shared column can never change it.
merged = ddf.merge(actor_summary, on='nconst', how='left')

# Actors without history get zero totals and the sentinel year -1, which the
# feature-calculation cell replaces with the year of the actor's first movie.
fill_values = {
    'base_num_movies': 0,
    'base_total_gross': 0,
    'base_year': -1,
    'base_total_profit': 0,
}
merged = merged.fillna(fill_values).astype({col: int for col in fill_values})

# Number of movies with a known, non-zero worldwide gross. `NaN != 0` is True,
# so missing values must be excluded explicitly or they would be counted as
# movies with revenue.
mmdf['has_revenue'] = mmdf['worldwide'].notna() & (mmdf['worldwide'] != 0)
revenue_counts = mmdf.groupby('nconst')['has_revenue'].sum()
merged['base_nbmovies_revenue'] = merged['nconst'].map(revenue_counts).fillna(0).astype(int)

# Number of movies with a known profit value.
mmdf['has_profit'] = mmdf['profit'].notna()
profit_counts = mmdf.groupby('nconst')['has_profit'].sum()
merged['base_nbmovies_profit'] = merged['nconst'].map(profit_counts).fillna(0).astype(int)

merged.to_csv("./base.tsv", sep='\t', index=False)

In [3]:
## Features calculation
# Walks the filtered movies in release order and, for each movie, records the
# average prior track record of its first TOP_K actors (movie count, tenure,
# gross, profit). Each actor's running totals are updated only AFTER the
# movie's features are computed, so a movie never sees its own outcome.
import pandas as pd
from tqdm import tqdm

mdf = pd.read_csv("../Common/filtered_final_movies_5.tsv", sep='\t')
adf = pd.read_csv("base.tsv", sep='\t')

# Keep only the first TOP_K actors of every movie (missing values pass through).
mdf['actors'] = mdf['actors'].str.split(',').str[:TOP_K].str.join(',')

## Loading base features into a dict
# nconst -> running state, seeded from the base features and mutated in the loop.
actor_dict = adf.set_index('nconst')[['base_year', 'base_num_movies','base_total_gross','base_total_profit','base_nbmovies_revenue', 'base_nbmovies_profit']] \
                .rename(columns={'base_num_movies': 'curr_num_movies',
                                 'base_total_gross' : 'curr_total_gross',
                                 'base_total_profit': 'curr_total_profit',
                                 'base_nbmovies_revenue': 'curr_nbmovies_revenue',
                                 'base_nbmovies_profit': 'curr_nbmovies_profit'}) \
                .to_dict(orient='index')

## Movie df sorted by release_date
# NOTE(review): assumes `release_date` sorts chronologically as stored
# (e.g. ISO-formatted strings) - confirm against the input file.
mdf = mdf.sort_values(by='release_date')
mdf['actors_avg_nb_movies'] = 0.0
mdf['actors_avg_tenure'] = 0.0
mdf['actors_avg_total_gross'] = 0.0
mdf['actors_avg_avg_gross'] = 0.0
# Profit features stay missing unless at least one actor has profit history;
# float NaN keeps these columns numeric instead of object dtype.
mdf['actors_avg_total_profit'] = float('nan')
mdf['actors_avg_avg_profit'] = float('nan')

for index, row in tqdm(mdf.iterrows(), total=len(mdf)):
    year = row["startYear"]
    gross = row["worldwide"]
    budget = row["budget"]
    actors = row["actors"].split(',') if isinstance(row["actors"], str) else []

    sum_movies = 0
    sum_tenure = 0
    sum_total_gross = 0
    sum_avg_gross = 0
    # None means "no actor of this movie has any profit history yet".
    sum_total_profit = None
    sum_avg_profit = None

    for actor in actors:
        # Actors absent from base.tsv start from scratch with this movie's year.
        data = actor_dict.setdefault(
            actor,
            {'base_year': year, 'curr_num_movies': 0, 'curr_total_gross': 0,
             'curr_total_profit': 0, 'curr_nbmovies_revenue': 0, 'curr_nbmovies_profit': 0},
        )

        # -1 is the sentinel written by the base-features cell for actors
        # without earlier movies: their career starts with this movie.
        if data['base_year'] == -1:
            data['base_year'] = year

        # --- read the actor's state as it was BEFORE this movie ---
        sum_movies += data['curr_num_movies']
        sum_tenure += year - data['base_year']
        if data['curr_nbmovies_revenue'] != 0:
            sum_total_gross += data['curr_total_gross']
            sum_avg_gross += data['curr_total_gross'] / data['curr_nbmovies_revenue']
        if data['curr_nbmovies_profit'] != 0:
            if sum_total_profit is None:
                sum_total_profit = 0
                sum_avg_profit = 0
            sum_total_profit += data['curr_total_profit']
            sum_avg_profit += data['curr_total_profit'] / data['curr_nbmovies_profit']

        # --- then fold this movie into the actor's running state ---
        data['curr_num_movies'] += 1
        has_gross = pd.notna(gross)
        # Same definition as base_nbmovies_revenue (known, non-zero gross);
        # skipping NaN also stops one missing value from turning the actor's
        # running totals into NaN for every later movie.
        if has_gross and gross != 0:
            data['curr_nbmovies_revenue'] += 1
            data['curr_total_gross'] += gross
        if has_gross and pd.notna(budget):
            data['curr_total_profit'] += gross - budget
            data['curr_nbmovies_profit'] += 1

    # NOTE(review): sums are divided by TOP_K even when a movie lists fewer
    # than TOP_K actors - confirm that is the intended definition of "avg".
    mdf.at[index, 'actors_avg_nb_movies'] = sum_movies / TOP_K
    mdf.at[index, 'actors_avg_tenure'] = sum_tenure / TOP_K
    mdf.at[index, 'actors_avg_total_gross'] = sum_total_gross / TOP_K
    mdf.at[index, 'actors_avg_avg_gross'] = sum_avg_gross / TOP_K
    if sum_total_profit is not None:
        mdf.at[index, 'actors_avg_total_profit'] = sum_total_profit / TOP_K
        mdf.at[index, 'actors_avg_avg_profit'] = sum_avg_profit / TOP_K

# Overwrites the input file in place, now sorted by release_date and with the
# actors_avg_* columns added.
mdf.to_csv("../Common/filtered_final_movies_5.tsv", sep='\t', index=False)

12718it [00:01, 6883.53it/s]
