In [2]:
import pandas as pd
import glob
import os
import csv

def load_split_csv(file_prefix, folder='data'):
    
    search_pattern = os.path.join(folder, f"{file_prefix}*")
    files = sorted(glob.glob(search_pattern))
    
    if not files:
        print(search_pattern)
        return pd.DataFrame()
    
    
    df_list = []
    for f in files:
        try:
            # quoting=csv.QUOTE_NONE 
           
            df = pd.read_csv(
                f, 
                engine='python', 
                quoting=csv.QUOTE_NONE, 
                on_bad_lines='skip', 
                encoding='utf-8'
            )
            df_list.append(df)
        except Exception as e:
            print(f"Error reading {f}: {e}")
        

           
    
    if not df_list:
        return pd.DataFrame()
        
    return pd.concat(df_list, ignore_index=True)

df_movie = load_split_csv('movie_part_', folder='../data')
df_people = load_split_csv('people_temp_part_', folder='../data')
df_tmdb = load_split_csv('tmdb_full_part_', folder='.')

print(f"Movie rows: {len(df_movie)}")
print(f"People rows: {len(df_people)}")
print(f"TMDB rows: {len(df_tmdb)}")

Movie rows: 79088
People rows: 2981595
TMDB rows: 64715


In [3]:


os.listdir('../data')


['movie.csv',
 'movie_part_aa',
 'movie_part_ab',
 'names_temp.csv',
 'people_temp.csv',
 'people_temp_part_aa',
 'people_temp_part_ab']

In [4]:
import gdown

url_data_base_complémentaire = "https://drive.google.com/uc?id=1VB5_gl1fnyBDzcIOXZ5vUSbCY68VZN1v"

gdown.download(
    url_data_base_complémentaire,
    "tmdb_full.csv",
    quiet=False
)

df_tmdb = pd.read_csv("tmdb_full.csv", low_memory=False)


Downloading...
From (original): https://drive.google.com/uc?id=1VB5_gl1fnyBDzcIOXZ5vUSbCY68VZN1v
From (redirected): https://drive.google.com/uc?id=1VB5_gl1fnyBDzcIOXZ5vUSbCY68VZN1v&confirm=t&uuid=aef69eff-8b62-495e-aa91-16951cfd3cff
To: c:\Users\Zilya\Git\recommandation-films-creuse\notebooks\tmdb_full.csv
100%|██████████| 157M/157M [00:40<00:00, 3.83MB/s] 


In [49]:
df_tmdb.shape


(309572, 25)

In [5]:
df_tmdb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 309572 entries, 0 to 309571
Data columns (total 25 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   adult                         309572 non-null  bool   
 1   backdrop_path                 151760 non-null  object 
 2   budget                        309572 non-null  int64  
 3   genres                        309572 non-null  object 
 4   homepage                      44262 non-null   object 
 5   id                            309572 non-null  int64  
 6   imdb_id                       309572 non-null  object 
 7   original_language             309572 non-null  object 
 8   original_title                309572 non-null  object 
 9   overview                      282512 non-null  object 
 10  popularity                    309572 non-null  float64
 11  poster_path                   264159 non-null  object 
 12  production_countries          309572 non-nul

In [51]:
df_tmdb['genres'].head()

0                        ['Comedy']
1                     ['Adventure']
2              ['Drama', 'Romance']
3      ['Drama', 'Comedy', 'Crime']
4    ['Drama', 'Comedy', 'Romance']
Name: genres, dtype: object

In [52]:
df_tmdb = df_tmdb[df_tmdb['genres'].notna()] # Filtrer les lignes où 'genres' n'est pas NaN
df_tmdb['genres'].head()


0                        ['Comedy']
1                     ['Adventure']
2              ['Drama', 'Romance']
3      ['Drama', 'Comedy', 'Crime']
4    ['Drama', 'Comedy', 'Romance']
Name: genres, dtype: object

In [53]:
df_tmdb['overview'].isna().sum()
#df_tmdb = df_tmdb[df_tmdb['overview'].notna()]  # Filtrer les lignes où 'overview' n'est pas NaN

np.int64(27060)

In [54]:
df_tmdb['poster_path'].isna().sum()


np.int64(45413)

In [55]:
df_tmdb = df_tmdb[df_tmdb['poster_path'].notna()]
df_tmdb.shape


(264159, 25)

In [6]:
'vote_average' in df_tmdb.columns
df_tmdb = df_tmdb[df_tmdb['vote_average'] >= 5.0]
df_tmdb.shape

(169102, 25)

In [9]:
df_tmdb['year'] = pd.to_datetime(
    df_tmdb['release_date'],
    errors='coerce'
).dt.year
df_tmdb['year'].head()

0    1938.0
2    1962.0
3    1988.0
4    1986.0
5    1995.0
Name: year, dtype: float64

In [10]:
df_tmdb_year_pop = df_tmdb[['imdb_id', 'year', 'popularity']]


In [None]:
df_ratings = pd.read_csv(
    url_title_ratings,
    sep='\t'
)

df_ratings.head()
df_final = df_movie.merge(
    df_ratings,
    on='imdb_id',
    how='left'
)

In [14]:
df_final = df_tmdb[
    [
        'title',
        'genres',
        'overview',
        'vote_average',
        'runtime',
        'poster_path',
        'imdb_id'
    ]
].copy()

df_final = df_final.rename(columns={
    'vote_average': 'rating'
})
df_final = df_final.merge(
    df_tmdb_year_pop,
    on='imdb_id',
    how='left'
)

df_final.head()


Unnamed: 0,title,genres,overview,rating,runtime,poster_path,imdb_id,year,popularity
0,Blondie,['Comedy'],Blondie and Dagwood are about to celebrate the...,7.214,70,/zBiHKhXklvTFwj4M1uEUcQGAVJ.jpg,tt0029927,1938.0,2.852
1,Love at Twenty,"['Drama', 'Romance']",Love at Twenty unites five directors from five...,6.7,110,/aup2QCYCsyEeQfpboXy0f4uj8aE.jpg,tt0055747,1962.0,3.77
2,Ariel,"['Drama', 'Comedy', 'Crime']",Taisto Kasurinen is a Finnish coal miner whose...,7.046,73,/ojDg0PGvs6R9xYFodRct2kdI6wC.jpg,tt0094675,1988.0,9.214
3,Shadows in Paradise,"['Drama', 'Comedy', 'Romance']","An episode in the life of Nikander, a garbage ...",7.182,76,/nj01hspawPof0mJmlgfjuLyJuRN.jpg,tt0092149,1986.0,6.282
4,Four Rooms,"['Crime', 'Comedy']",It's Ted the Bellhop's first night on the job....,5.758,98,/75aHn1NOYXh4M7L5shoeQ6NGykP.jpg,tt0113101,1995.0,18.734


In [16]:
url_title_ratings = "https://datasets.imdbws.com/title.ratings.tsv.gz"
df_ratings = pd.read_csv(
    url_title_ratings,
    sep='\t'
)

df_ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2188
1,tt0000002,5.5,308
2,tt0000003,6.5,2289
3,tt0000004,5.1,196
4,tt0000005,6.2,3020


In [18]:
df_final = df_final.merge(
    df_ratings.rename(columns={'tconst': 'imdb_id'}),
    on='imdb_id',
    how='left'
)
df_final.head()

Unnamed: 0,title,genres,overview,rating,runtime,poster_path,imdb_id,year,popularity,numVotes
0,Blondie,['Comedy'],Blondie and Dagwood are about to celebrate the...,7.214,70,/zBiHKhXklvTFwj4M1uEUcQGAVJ.jpg,tt0029927,1938.0,2.852,927.0
1,Love at Twenty,"['Drama', 'Romance']",Love at Twenty unites five directors from five...,6.7,110,/aup2QCYCsyEeQfpboXy0f4uj8aE.jpg,tt0055747,1962.0,3.77,2556.0
2,Ariel,"['Drama', 'Comedy', 'Crime']",Taisto Kasurinen is a Finnish coal miner whose...,7.046,73,/ojDg0PGvs6R9xYFodRct2kdI6wC.jpg,tt0094675,1988.0,9.214,9585.0
3,Shadows in Paradise,"['Drama', 'Comedy', 'Romance']","An episode in the life of Nikander, a garbage ...",7.182,76,/nj01hspawPof0mJmlgfjuLyJuRN.jpg,tt0092149,1986.0,6.282,8422.0
4,Four Rooms,"['Crime', 'Comedy']",It's Ted the Bellhop's first night on the job....,5.758,98,/75aHn1NOYXh4M7L5shoeQ6NGykP.jpg,tt0113101,1995.0,18.734,116168.0


In [17]:
df_ratings = df_ratings[['tconst', 'numVotes']]
df_ratings.head()

Unnamed: 0,tconst,numVotes
0,tt0000001,2188
1,tt0000002,308
2,tt0000003,2289
3,tt0000004,196
4,tt0000005,3020


In [19]:
df_final.isna().sum()


title              0
genres             0
overview        9384
rating             0
runtime            0
poster_path    12416
imdb_id            0
year             572
popularity         0
numVotes        4964
dtype: int64

In [20]:
df_final = df_final.dropna(subset=['year', 'numVotes'])


In [22]:
df_final = df_final[df_final['year'] >= 1960]
df_final['overview'] = df_final['overview'].fillna('')
df_final['poster_path'] = df_final['poster_path'].fillna('')
df_final.head()


Unnamed: 0,title,genres,overview,rating,runtime,poster_path,imdb_id,year,popularity,numVotes
1,Love at Twenty,"['Drama', 'Romance']",Love at Twenty unites five directors from five...,6.7,110,/aup2QCYCsyEeQfpboXy0f4uj8aE.jpg,tt0055747,1962.0,3.77,2556.0
2,Ariel,"['Drama', 'Comedy', 'Crime']",Taisto Kasurinen is a Finnish coal miner whose...,7.046,73,/ojDg0PGvs6R9xYFodRct2kdI6wC.jpg,tt0094675,1988.0,9.214,9585.0
3,Shadows in Paradise,"['Drama', 'Comedy', 'Romance']","An episode in the life of Nikander, a garbage ...",7.182,76,/nj01hspawPof0mJmlgfjuLyJuRN.jpg,tt0092149,1986.0,6.282,8422.0
4,Four Rooms,"['Crime', 'Comedy']",It's Ted the Bellhop's first night on the job....,5.758,98,/75aHn1NOYXh4M7L5shoeQ6NGykP.jpg,tt0113101,1995.0,18.734,116168.0
5,Judgment Night,"['Action', 'Crime', 'Thriller']","While racing to a boxing match, Frank, Mike, J...",6.6,109,/3rvvpS9YPM5HB2f4HYiNiJVtdam.jpg,tt0107286,1993.0,10.797,20751.0


In [23]:
df_final.isna().sum()


title          0
genres         0
overview       0
rating         0
runtime        0
poster_path    0
imdb_id        0
year           0
popularity     0
numVotes       0
dtype: int64

In [24]:
df_final.to_csv('../data/movie.csv', index=False)
df_tmdb = pd.read_csv("tmdb_full.csv", low_memory=False)

In [25]:
import pandas as pd

df_check = pd.read_csv('../data/movie.csv')
df_check.head()


Unnamed: 0,title,genres,overview,rating,runtime,poster_path,imdb_id,year,popularity,numVotes
0,Love at Twenty,"['Drama', 'Romance']",Love at Twenty unites five directors from five...,6.7,110,/aup2QCYCsyEeQfpboXy0f4uj8aE.jpg,tt0055747,1962.0,3.77,2556.0
1,Ariel,"['Drama', 'Comedy', 'Crime']",Taisto Kasurinen is a Finnish coal miner whose...,7.046,73,/ojDg0PGvs6R9xYFodRct2kdI6wC.jpg,tt0094675,1988.0,9.214,9585.0
2,Shadows in Paradise,"['Drama', 'Comedy', 'Romance']","An episode in the life of Nikander, a garbage ...",7.182,76,/nj01hspawPof0mJmlgfjuLyJuRN.jpg,tt0092149,1986.0,6.282,8422.0
3,Four Rooms,"['Crime', 'Comedy']",It's Ted the Bellhop's first night on the job....,5.758,98,/75aHn1NOYXh4M7L5shoeQ6NGykP.jpg,tt0113101,1995.0,18.734,116168.0
4,Judgment Night,"['Action', 'Crime', 'Thriller']","While racing to a boxing match, Frank, Mike, J...",6.6,109,/3rvvpS9YPM5HB2f4HYiNiJVtdam.jpg,tt0107286,1993.0,10.797,20751.0


In [26]:
df_check.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142087 entries, 0 to 142086
Data columns (total 10 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   title        142087 non-null  object 
 1   genres       142087 non-null  object 
 2   overview     134964 non-null  object 
 3   rating       142087 non-null  float64
 4   runtime      142087 non-null  int64  
 5   poster_path  132945 non-null  object 
 6   imdb_id      142087 non-null  object 
 7   year         142087 non-null  float64
 8   popularity   142087 non-null  float64
 9   numVotes     142087 non-null  float64
dtypes: float64(4), int64(1), object(5)
memory usage: 10.8+ MB


In [27]:
#Actors, producer, runtime are missing in the final dataset
import pandas as pd
# Load title.principals.tsv.gz in chunks
#En Python, le découpage de listes en morceaux (chunks) peut être réalisé via des fonctions personnalisées ou des outils comme itertools.zip_longest. 
# La fonction chunks permet de diviser une liste en sous-listes de taille fixe, utile pour traiter des données par groupes. 
# Cette technique est également appliquée dans des frameworks comme LangChain pour le traitement de texte, 
# où un grand document est segmenté en morceaux plus gérables pour l'analyse ou l'embedding.

url_principals = "https://datasets.imdbws.com/title.principals.tsv.gz"

chunks = pd.read_csv(
    url_principals,
    sep='\t',
    compression='gzip',
    chunksize=500_000
)
#df_principals = pd.concat(chunk[chunk['titleId'].isin(df_check['imdb_id'])] for chunk in chunks)
#tconst (ID film)
#nconst (ID people)


In [28]:
# Get the first chunk
first_chunk = next(chunks)
first_chunk.head()


Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0000001,1,nm1588970,self,\N,"[""Self""]"
1,tt0000001,2,nm0005690,director,\N,\N
2,tt0000001,3,nm0005690,producer,producer,\N
3,tt0000001,4,nm0374658,cinematographer,director of photography,\N
4,tt0000002,1,nm0721526,director,\N,\N


In [29]:
# Filter for actors, actresses, producers, directors
for chunk in chunks:
    people_chunk = chunk[
        chunk['category'].isin([
            'actor',
            'actress',
            'producer',
            'director'
        ])
    ]
    print(people_chunk['category'].value_counts())
    break


category
actor       192867
actress      89487
director     30354
producer     25014
Name: count, dtype: int64


In [30]:
# Further filter to only include titles in our tmdb dataset
for chunk in chunks:
    people_chunk = chunk[
        chunk['category'].isin([
            'actor',
            'actress',
            'producer',
            'director'
        ])
    ]

    people_chunk = people_chunk[
        people_chunk['tconst'].isin(df_tmdb['imdb_id'])
    ]

    print(people_chunk.shape)
    break


(227472, 6)


In [31]:
# Reload chunks iterator
chunks = pd.read_csv(
    url_principals,
    sep='\t',
    compression='gzip',
    chunksize=500_000
)




In [32]:
# Write filtered people to CSV
people_file = '../data/people_temp.csv'

import os
# Remove file if it exists
if os.path.exists(people_file):
    os.remove(people_file)

first_write = True

for chunk in chunks:

    people_chunk = chunk[
        chunk['category'].isin([
            'actor',
            'actress',
            'producer',
            'director'
        ])
    ]

    people_chunk = people_chunk[
        people_chunk['tconst'].isin(df_tmdb['imdb_id'])
    ]

    if not people_chunk.empty:
        people_chunk.to_csv(
            people_file,
            mode='w' if first_write else 'a',
            index=False,
            header=first_write
        )
        first_write = False





In [33]:
# Load the filtered people data
df_people = pd.read_csv('../data/people_temp.csv')
df_people.shape


(3000618, 6)

In [34]:
import pandas as pd
# Load the filtered people data

df_people = pd.read_csv('../data/people_temp.csv', usecols=['nconst'])
needed_nconst = set(df_people['nconst'].unique())

len(needed_nconst)


915621

In [35]:
url_names = "https://datasets.imdbws.com/name.basics.tsv.gz"
#Load name.basics.tsv.gz in chunks
names_chunks = pd.read_csv(
    url_names,
    sep='\t',
    compression='gzip',
    chunksize=500_000,
    usecols=['nconst', 'primaryName']
)


In [36]:
names_file = '../data/names_temp.csv'

import os
# Remove file if it exists
if os.path.exists(names_file):
    os.remove(names_file)

first_write = True

for chunk in names_chunks:

    names_chunk = chunk[
        chunk['nconst'].isin(needed_nconst)
    ]

    if not names_chunk.empty:
        names_chunk.to_csv(
            names_file,
            mode='w' if first_write else 'a',
            index=False,
            header=first_write
        )
        first_write = False


In [37]:
df_names = pd.read_csv('../data/names_temp.csv')
df_names.shape


(915602, 2)

In [38]:
# merge names with people: nconst, primaryName
df_people = pd.read_csv('../data/people_temp.csv')

df_people = df_people.merge(
    df_names,
    on='nconst',
    how='left'
)


In [39]:
df_people[['tconst', 'category', 'primaryName']].head()


Unnamed: 0,tconst,category,primaryName
0,tt0000001,director,William K.L. Dickson
1,tt0000001,producer,William K.L. Dickson
2,tt0000002,director,Émile Reynaud
3,tt0000003,director,Émile Reynaud
4,tt0000003,producer,Julien Pappé


In [40]:
df_people_small = df_people[['tconst', 'category', 'primaryName']]
df_people_small.head()


Unnamed: 0,tconst,category,primaryName
0,tt0000001,director,William K.L. Dickson
1,tt0000001,producer,William K.L. Dickson
2,tt0000002,director,Émile Reynaud
3,tt0000003,director,Émile Reynaud
4,tt0000003,producer,Julien Pappé


In [41]:
# Filter only actors and actresses
df_actors = df_people_small[
    (df_people_small['category'] == 'actor') |
    (df_people_small['category'] == 'actress')
]
df_actors.head()


Unnamed: 0,tconst,category,primaryName
7,tt0000005,actor,Charles Kayser
8,tt0000005,actor,John Ott
10,tt0000007,actor,James J. Corbett
11,tt0000007,actor,Peter Courtney
16,tt0000008,actor,Fred Ott


In [42]:
# Group actors by tconst
df_actors_grouped = (
    df_actors
    .dropna(subset=['primaryName'])
    .groupby('tconst')['primaryName']
    .apply(lambda x: ', '.join(pd.unique(x.astype(str))))
    .reset_index()
    .rename(columns={'primaryName': 'actors'})
)


In [43]:
df_actors_grouped.head()

Unnamed: 0,tconst,actors
0,tt0000005,"Charles Kayser, John Ott"
1,tt0000007,"James J. Corbett, Peter Courtney"
2,tt0000008,Fred Ott
3,tt0000011,Grunato
4,tt0000014,"François Clerc, Benoît Duval"


In [44]:
df_actors.shape
#df_people_small.head()

(2277773, 3)

In [45]:
#df_producers 
df_producers = df_people_small[
    (df_people_small['category'] == 'producer')]
df_producers.shape


(396099, 3)

In [46]:
df_producers['primaryName'].head()
# Merge producers with actors

1     William K.L. Dickson
4             Julien Pappé
5            Émile Reynaud
9         Thomas A. Edison
14    William K.L. Dickson
Name: primaryName, dtype: object

In [47]:
df_producers = df_producers.dropna(subset=['primaryName'])
df_producers_grouped = (
    df_producers
    .groupby('tconst')['primaryName']
    .apply(lambda x: ', '.join(pd.unique(x.astype(str))))
    .reset_index()
    .rename(columns={'primaryName': 'producers'})
)

In [48]:
df_producers_grouped.head()


Unnamed: 0,tconst,producers
0,tt0000001,William K.L. Dickson
1,tt0000003,"Julien Pappé, Émile Reynaud"
2,tt0000005,Thomas A. Edison
3,tt0000007,"William K.L. Dickson, Thomas A. Edison"
4,tt0000008,William K.L. Dickson


In [51]:
print(df_tmdb.columns.tolist())

['adult', 'backdrop_path', 'budget', 'genres', 'homepage', 'id', 'imdb_id', 'original_language', 'original_title', 'overview', 'popularity', 'poster_path', 'production_countries', 'release_date', 'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title', 'video', 'vote_average', 'vote_count', 'production_companies_name', 'production_companies_country']


In [None]:
# 1. Snachala sozdadim nuzhnye kolonki v df_tmdb, esli ikh net
if 'release_date' in df_tmdb.columns:
    # Preobrazuem v datu i dostayom god
    df_tmdb['year'] = pd.to_datetime(df_tmdb['release_date'], errors='coerce').dt.year

if 'vote_count' in df_tmdb.columns and 'numVotes' not in df_tmdb.columns:
    # Pereimenovyvaem ili kopiruem vote_count v numVotes
    df_tmdb['numVotes'] = df_tmdb['vote_count']

# 2. Teper' vash kod dolzhen srabotat' (ubedites', chto imena v spiske sovpadayut)
cols_to_keep = [
    'imdb_id', 'title', 'genres', 'overview', 'vote_average', 
    'runtime', 'poster_path', 'numVotes', 'year', 'popularity'
]

df_final = df_tmdb[cols_to_keep].copy()

df_final = df_final.rename(columns={'vote_average': 'rating'})

In [50]:
df_final = df_tmdb[[
    'imdb_id',
    'title',
    'genres',
    'overview',
    'vote_average',
    'runtime',
    'poster_path',
    'numVotes',
    'year',
    'popularity'
]].copy()

df_final = df_final.rename(columns={
    'vote_average': 'rating'
})


KeyError: "['numVotes', 'year'] not in index"

In [None]:
df_final.head()

Unnamed: 0,imdb_id,title,genres,overview,rating,runtime,poster_path
0,tt0029927,Blondie,['Comedy'],Blondie and Dagwood are about to celebrate the...,7.214,70,/zBiHKhXklvTFwj4M1uEUcQGAVJ.jpg
1,tt0011436,"Peter Voss, Thief of Millions",['Adventure'],,0.0,420,/6xUbUCvndklbGVYiljHr34NTxSl.jpg
2,tt0055747,Love at Twenty,"['Drama', 'Romance']",Love at Twenty unites five directors from five...,6.7,110,/aup2QCYCsyEeQfpboXy0f4uj8aE.jpg
3,tt0094675,Ariel,"['Drama', 'Comedy', 'Crime']",Taisto Kasurinen is a Finnish coal miner whose...,7.046,73,/ojDg0PGvs6R9xYFodRct2kdI6wC.jpg
4,tt0092149,Shadows in Paradise,"['Drama', 'Comedy', 'Romance']","An episode in the life of Nikander, a garbage ...",7.182,76,/nj01hspawPof0mJmlgfjuLyJuRN.jpg


In [None]:
#final cleaning
df_final[['rating', 'overview']].isna().sum()


rating          0
overview    27060
dtype: int64

In [89]:
df_final['overview'] = df_final['overview'].fillna('')
df_final['rating'] = df_final['rating'].fillna(0.0)

In [90]:
df_final = df_final[df_final['rating'] >= 1.0]
df_final.shape

(218148, 7)

In [91]:
#reset index
df_final = df_final.reset_index(drop=True)
df_final.head()

Unnamed: 0,imdb_id,title,genres,overview,rating,runtime,poster_path
0,tt0029927,Blondie,['Comedy'],Blondie and Dagwood are about to celebrate the...,7.214,70,/zBiHKhXklvTFwj4M1uEUcQGAVJ.jpg
1,tt0055747,Love at Twenty,"['Drama', 'Romance']",Love at Twenty unites five directors from five...,6.7,110,/aup2QCYCsyEeQfpboXy0f4uj8aE.jpg
2,tt0094675,Ariel,"['Drama', 'Comedy', 'Crime']",Taisto Kasurinen is a Finnish coal miner whose...,7.046,73,/ojDg0PGvs6R9xYFodRct2kdI6wC.jpg
3,tt0092149,Shadows in Paradise,"['Drama', 'Comedy', 'Romance']","An episode in the life of Nikander, a garbage ...",7.182,76,/nj01hspawPof0mJmlgfjuLyJuRN.jpg
4,tt0113101,Four Rooms,"['Crime', 'Comedy']",It's Ted the Bellhop's first night on the job....,5.758,98,/75aHn1NOYXh4M7L5shoeQ6NGykP.jpg


In [92]:
df_final[['rating', 'overview']].isna().sum()


rating      0
overview    0
dtype: int64

In [95]:
df_actors_grouped = (
    df_actors
    .groupby('tconst')['primaryName']
    .apply(lambda x: ', '.join(x.astype(str).unique()))
    .reset_index()
    .rename(columns={
        'tconst': 'imdb_id',
        'primaryName': 'actors'
    })
)
df_final = df_final.merge(
    df_actors_grouped,
    on='imdb_id',
    how='left'
)




In [96]:
df_final['actors'] = df_final['actors'].fillna('')
df_final.head()

Unnamed: 0,imdb_id,title,genres,overview,rating,runtime,poster_path,actors
0,tt0029927,Blondie,['Comedy'],Blondie and Dagwood are about to celebrate the...,7.214,70,/zBiHKhXklvTFwj4M1uEUcQGAVJ.jpg,"Penny Singleton, Arthur Lake, Larry Simms, Dai..."
1,tt0055747,Love at Twenty,"['Drama', 'Romance']",Love at Twenty unites five directors from five...,6.7,110,/aup2QCYCsyEeQfpboXy0f4uj8aE.jpg,"Jean-Pierre Léaud, Marie-France Pisier, Eleono..."
2,tt0094675,Ariel,"['Drama', 'Comedy', 'Crime']",Taisto Kasurinen is a Finnish coal miner whose...,7.046,73,/ojDg0PGvs6R9xYFodRct2kdI6wC.jpg,"Turo Pajala, Susanna Haavisto, Matti Pellonpää..."
3,tt0092149,Shadows in Paradise,"['Drama', 'Comedy', 'Romance']","An episode in the life of Nikander, a garbage ...",7.182,76,/nj01hspawPof0mJmlgfjuLyJuRN.jpg,"Matti Pellonpää, Kati Outinen, Sakari Kuosmane..."
4,tt0113101,Four Rooms,"['Crime', 'Comedy']",It's Ted the Bellhop's first night on the job....,5.758,98,/75aHn1NOYXh4M7L5shoeQ6NGykP.jpg,"Tim Roth, Antonio Banderas, Sammi Davis, Amand..."


In [97]:
#add producers
df_final = df_final.merge(
    df_producers_grouped.rename(columns={'tconst': 'imdb_id'}),
    on='imdb_id',
    how='left'
)
df_final.head()

Unnamed: 0,imdb_id,title,genres,overview,rating,runtime,poster_path,actors,producers
0,tt0029927,Blondie,['Comedy'],Blondie and Dagwood are about to celebrate the...,7.214,70,/zBiHKhXklvTFwj4M1uEUcQGAVJ.jpg,"Penny Singleton, Arthur Lake, Larry Simms, Dai...",
1,tt0055747,Love at Twenty,"['Drama', 'Romance']",Love at Twenty unites five directors from five...,6.7,110,/aup2QCYCsyEeQfpboXy0f4uj8aE.jpg,"Jean-Pierre Léaud, Marie-France Pisier, Eleono...",Pierre Roustang
2,tt0094675,Ariel,"['Drama', 'Comedy', 'Crime']",Taisto Kasurinen is a Finnish coal miner whose...,7.046,73,/ojDg0PGvs6R9xYFodRct2kdI6wC.jpg,"Turo Pajala, Susanna Haavisto, Matti Pellonpää...",Aki Kaurismäki
3,tt0092149,Shadows in Paradise,"['Drama', 'Comedy', 'Romance']","An episode in the life of Nikander, a garbage ...",7.182,76,/nj01hspawPof0mJmlgfjuLyJuRN.jpg,"Matti Pellonpää, Kati Outinen, Sakari Kuosmane...",Mika Kaurismäki
4,tt0113101,Four Rooms,"['Crime', 'Comedy']",It's Ted the Bellhop's first night on the job....,5.758,98,/75aHn1NOYXh4M7L5shoeQ6NGykP.jpg,"Tim Roth, Antonio Banderas, Sammi Davis, Amand...",Lawrence Bender


In [98]:
df_final['producers'] = df_final['producers'].fillna('')
df_final.isna().sum()


imdb_id            0
title              0
genres             0
overview           0
rating             0
runtime            0
poster_path    17772
actors             0
producers          0
dtype: int64

In [99]:
df_final['poster_path'] = df_final['poster_path'].fillna('')
df_final.isna().sum()

imdb_id        0
title          0
genres         0
overview       0
rating         0
runtime        0
poster_path    0
actors         0
producers      0
dtype: int64

In [100]:
df_final.to_csv('../data/movie.csv', index=False)
df_final.head()

Unnamed: 0,imdb_id,title,genres,overview,rating,runtime,poster_path,actors,producers
0,tt0029927,Blondie,['Comedy'],Blondie and Dagwood are about to celebrate the...,7.214,70,/zBiHKhXklvTFwj4M1uEUcQGAVJ.jpg,"Penny Singleton, Arthur Lake, Larry Simms, Dai...",
1,tt0055747,Love at Twenty,"['Drama', 'Romance']",Love at Twenty unites five directors from five...,6.7,110,/aup2QCYCsyEeQfpboXy0f4uj8aE.jpg,"Jean-Pierre Léaud, Marie-France Pisier, Eleono...",Pierre Roustang
2,tt0094675,Ariel,"['Drama', 'Comedy', 'Crime']",Taisto Kasurinen is a Finnish coal miner whose...,7.046,73,/ojDg0PGvs6R9xYFodRct2kdI6wC.jpg,"Turo Pajala, Susanna Haavisto, Matti Pellonpää...",Aki Kaurismäki
3,tt0092149,Shadows in Paradise,"['Drama', 'Comedy', 'Romance']","An episode in the life of Nikander, a garbage ...",7.182,76,/nj01hspawPof0mJmlgfjuLyJuRN.jpg,"Matti Pellonpää, Kati Outinen, Sakari Kuosmane...",Mika Kaurismäki
4,tt0113101,Four Rooms,"['Crime', 'Comedy']",It's Ted the Bellhop's first night on the job....,5.758,98,/75aHn1NOYXh4M7L5shoeQ6NGykP.jpg,"Tim Roth, Antonio Banderas, Sammi Davis, Amand...",Lawrence Bender


In [101]:
df_check = pd.read_csv('../data/movie.csv')
df_check.isna().sum()
df_check.shape


(218148, 9)