In [79]:
import pandas as pd
import glob
import os
import csv

def load_split_csv(file_prefix, folder='data'):
    
    search_pattern = os.path.join(folder, f"{file_prefix}*")
    files = sorted(glob.glob(search_pattern))
    
    if not files:
        print(search_pattern)
        return pd.DataFrame()
    
    
    df_list = []
    for f in files:
        try:
            # quoting=csv.QUOTE_NONE 
           
            df = pd.read_csv(
                f, 
                engine='python', 
                quoting=csv.QUOTE_NONE, 
                on_bad_lines='skip', 
                encoding='utf-8'
            )
            df_list.append(df)
        except Exception as e:
            print(f"Error reading {f}: {e}")
        

           
    
    if not df_list:
        return pd.DataFrame()
        
    return pd.concat(df_list, ignore_index=True)

df_movie = load_split_csv('movie_part_', folder='../data')
df_people = load_split_csv('people_temp_part_', folder='../data')
df_tmdb = load_split_csv('tmdb_full_part_', folder='.')

print(f"Movie rows: {len(df_movie)}")
print(f"People rows: {len(df_people)}")
print(f"TMDB rows: {len(df_tmdb)}")

Movie rows: 79088
People rows: 2981595
TMDB rows: 64715


In [83]:


os.listdir('../data')


['df_final_creuse.csv',
 'df_ml_ready.csv',
 'movie.csv',
 'movie_part_aa',
 'movie_part_ab',
 'names_temp.csv',
 'people_temp.csv',
 'people_temp_part_aa',
 'people_temp_part_ab']

In [84]:
import gdown

url_data_base_complémentaire = "https://drive.google.com/uc?id=1VB5_gl1fnyBDzcIOXZ5vUSbCY68VZN1v"

gdown.download(
    url_data_base_complémentaire,
    "tmdb_full.csv",
    quiet=False
)

df_tmdb = pd.read_csv("tmdb_full.csv", low_memory=False)


Downloading...
From (original): https://drive.google.com/uc?id=1VB5_gl1fnyBDzcIOXZ5vUSbCY68VZN1v
From (redirected): https://drive.google.com/uc?id=1VB5_gl1fnyBDzcIOXZ5vUSbCY68VZN1v&confirm=t&uuid=9dc2a6e1-5f0f-485d-a8e6-1f343e9fd693
To: c:\Users\Zilya\Git\recommandation-films-creuse\notebooks\tmdb_full.csv
100%|██████████| 157M/157M [00:39<00:00, 3.97MB/s] 


In [85]:
df_tmdb.shape


(309572, 25)

In [86]:
df_tmdb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 309572 entries, 0 to 309571
Data columns (total 25 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   adult                         309572 non-null  bool   
 1   backdrop_path                 151760 non-null  object 
 2   budget                        309572 non-null  int64  
 3   genres                        309572 non-null  object 
 4   homepage                      44262 non-null   object 
 5   id                            309572 non-null  int64  
 6   imdb_id                       309572 non-null  object 
 7   original_language             309572 non-null  object 
 8   original_title                309572 non-null  object 
 9   overview                      282512 non-null  object 
 10  popularity                    309572 non-null  float64
 11  poster_path                   264159 non-null  object 
 12  production_countries          309572 non-nul

In [87]:
df_tmdb['genres'].head()

0                        ['Comedy']
1                     ['Adventure']
2              ['Drama', 'Romance']
3      ['Drama', 'Comedy', 'Crime']
4    ['Drama', 'Comedy', 'Romance']
Name: genres, dtype: object

In [88]:
df_tmdb = df_tmdb[df_tmdb['genres'].notna()] # Filtrer les lignes où 'genres' n'est pas NaN
df_tmdb['genres'].head()


0                        ['Comedy']
1                     ['Adventure']
2              ['Drama', 'Romance']
3      ['Drama', 'Comedy', 'Crime']
4    ['Drama', 'Comedy', 'Romance']
Name: genres, dtype: object

In [89]:
df_tmdb['overview'].isna().sum()
#df_tmdb = df_tmdb[df_tmdb['overview'].notna()]  # Filtrer les lignes où 'overview' n'est pas NaN

np.int64(27060)

In [90]:
df_tmdb['poster_path'].isna().sum()


np.int64(45413)

In [91]:
df_tmdb = df_tmdb[df_tmdb['poster_path'].notna()]
df_tmdb.shape


(264159, 25)

In [92]:
'vote_average' in df_tmdb.columns
df_tmdb = df_tmdb[df_tmdb['vote_average'] >= 5.0]
df_tmdb.shape

(156686, 25)

In [93]:
df_tmdb['year'] = pd.to_datetime(
    df_tmdb['release_date'],
    errors='coerce'
).dt.year
df_tmdb['year'].head()

0    1938.0
2    1962.0
3    1988.0
4    1986.0
5    1995.0
Name: year, dtype: float64

In [94]:
df_tmdb_year_pop = df_tmdb[['imdb_id', 'year', 'popularity']]


In [95]:
url_title_ratings = "https://datasets.imdbws.com/title.ratings.tsv.gz"


df_ratings = pd.read_csv(
    url_title_ratings,
    sep='\t',
    compression='gzip' 
)


df_final = df_movie.merge(
    df_ratings.rename(columns={'tconst': 'imdb_id', 'averageRating': 'rating'}), 
    on='imdb_id',
    how='left'
)


df_final.head()


Unnamed: 0,imdb_id,title,genres,overview,rating_x,runtime,poster_path,actors,producers,Chung,Chien-yu Fan,"Ren-wei Liu""","""Kang-shing Chen","Chin-sheng Yeh""",rating_y,numVotes
0,Larry Simms,Daisy,Ann Doran,Dorothy Moore,Gene Lockhart,Jonathan Hale,Gordon Oliver,"Danny Mummert""",,,,,,,,
1,7.708,80,/x7Sz339F2oC8mBf0DHCQpKizXaL.jpg,,"""Ulrich Gehmacher","Timo Novotny""",,,,,,,,,,
2,Marc Meyer,,,,,,,,,,,,,,,
3,"""Robert August",Michael Hynson,Bruce Brown,Terence Bullen,"Roy Crump""","""Robert Bagley","Bruce Brown""",,,,,,,,,
4,icebound immensities of the South Pole as he ...,over a period of several months,they pursue a mutual sexual passion whose ine...,5.709,69,/91O7z0vo7MiNWd5xD2BoivwbQsb.jpg,"""Kieran O'Brien","Margo Stilley""",Andrew Eaton,,,,,,,


In [96]:
df_final = df_tmdb[
    [
        'title',
        'genres',
        'overview',
        'vote_average',
        'runtime',
        'poster_path',
        'imdb_id'
    ]
].copy()

df_final = df_final.rename(columns={
    'vote_average': 'rating'
})
df_final = df_final.merge(
    df_tmdb_year_pop,
    on='imdb_id',
    how='left'
)

df_final.head()


Unnamed: 0,title,genres,overview,rating,runtime,poster_path,imdb_id,year,popularity
0,Blondie,['Comedy'],Blondie and Dagwood are about to celebrate the...,7.214,70,/zBiHKhXklvTFwj4M1uEUcQGAVJ.jpg,tt0029927,1938.0,2.852
1,Love at Twenty,"['Drama', 'Romance']",Love at Twenty unites five directors from five...,6.7,110,/aup2QCYCsyEeQfpboXy0f4uj8aE.jpg,tt0055747,1962.0,3.77
2,Ariel,"['Drama', 'Comedy', 'Crime']",Taisto Kasurinen is a Finnish coal miner whose...,7.046,73,/ojDg0PGvs6R9xYFodRct2kdI6wC.jpg,tt0094675,1988.0,9.214
3,Shadows in Paradise,"['Drama', 'Comedy', 'Romance']","An episode in the life of Nikander, a garbage ...",7.182,76,/nj01hspawPof0mJmlgfjuLyJuRN.jpg,tt0092149,1986.0,6.282
4,Four Rooms,"['Crime', 'Comedy']",It's Ted the Bellhop's first night on the job....,5.758,98,/75aHn1NOYXh4M7L5shoeQ6NGykP.jpg,tt0113101,1995.0,18.734


In [97]:
url_title_ratings = "https://datasets.imdbws.com/title.ratings.tsv.gz"
df_ratings = pd.read_csv(
    url_title_ratings,
    sep='\t'
)

df_ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2190
1,tt0000002,5.5,309
2,tt0000003,6.5,2290
3,tt0000004,5.1,196
4,tt0000005,6.2,3023


In [98]:
df_final = df_final.merge(
    df_ratings.rename(columns={'tconst': 'imdb_id'}),
    on='imdb_id',
    how='left'
)
df_final.head()

Unnamed: 0,title,genres,overview,rating,runtime,poster_path,imdb_id,year,popularity,averageRating,numVotes
0,Blondie,['Comedy'],Blondie and Dagwood are about to celebrate the...,7.214,70,/zBiHKhXklvTFwj4M1uEUcQGAVJ.jpg,tt0029927,1938.0,2.852,6.9,929.0
1,Love at Twenty,"['Drama', 'Romance']",Love at Twenty unites five directors from five...,6.7,110,/aup2QCYCsyEeQfpboXy0f4uj8aE.jpg,tt0055747,1962.0,3.77,7.1,2559.0
2,Ariel,"['Drama', 'Comedy', 'Crime']",Taisto Kasurinen is a Finnish coal miner whose...,7.046,73,/ojDg0PGvs6R9xYFodRct2kdI6wC.jpg,tt0094675,1988.0,9.214,7.4,9601.0
3,Shadows in Paradise,"['Drama', 'Comedy', 'Romance']","An episode in the life of Nikander, a garbage ...",7.182,76,/nj01hspawPof0mJmlgfjuLyJuRN.jpg,tt0092149,1986.0,6.282,7.4,8445.0
4,Four Rooms,"['Crime', 'Comedy']",It's Ted the Bellhop's first night on the job....,5.758,98,/75aHn1NOYXh4M7L5shoeQ6NGykP.jpg,tt0113101,1995.0,18.734,6.7,116269.0


In [99]:
df_ratings = df_ratings[['tconst', 'numVotes']]
df_ratings.head()

Unnamed: 0,tconst,numVotes
0,tt0000001,2190
1,tt0000002,309
2,tt0000003,2290
3,tt0000004,196
4,tt0000005,3023


In [100]:
df_final.isna().sum()


title               0
genres              0
overview         7931
rating              0
runtime             0
poster_path         0
imdb_id             0
year              310
popularity          0
averageRating    3744
numVotes         3744
dtype: int64

In [101]:
df_final = df_final.dropna(subset=['year', 'numVotes'])


In [102]:
df_final = df_final[df_final['year'] >= 1960]
df_final['overview'] = df_final['overview'].fillna('')
df_final['poster_path'] = df_final['poster_path'].fillna('')
df_final.head()


Unnamed: 0,title,genres,overview,rating,runtime,poster_path,imdb_id,year,popularity,averageRating,numVotes
1,Love at Twenty,"['Drama', 'Romance']",Love at Twenty unites five directors from five...,6.7,110,/aup2QCYCsyEeQfpboXy0f4uj8aE.jpg,tt0055747,1962.0,3.77,7.1,2559.0
2,Ariel,"['Drama', 'Comedy', 'Crime']",Taisto Kasurinen is a Finnish coal miner whose...,7.046,73,/ojDg0PGvs6R9xYFodRct2kdI6wC.jpg,tt0094675,1988.0,9.214,7.4,9601.0
3,Shadows in Paradise,"['Drama', 'Comedy', 'Romance']","An episode in the life of Nikander, a garbage ...",7.182,76,/nj01hspawPof0mJmlgfjuLyJuRN.jpg,tt0092149,1986.0,6.282,7.4,8445.0
4,Four Rooms,"['Crime', 'Comedy']",It's Ted the Bellhop's first night on the job....,5.758,98,/75aHn1NOYXh4M7L5shoeQ6NGykP.jpg,tt0113101,1995.0,18.734,6.7,116269.0
5,Judgment Night,"['Action', 'Crime', 'Thriller']","While racing to a boxing match, Frank, Mike, J...",6.6,109,/3rvvpS9YPM5HB2f4HYiNiJVtdam.jpg,tt0107286,1993.0,10.797,6.6,20804.0


In [103]:
df_final.isna().sum()


title            0
genres           0
overview         0
rating           0
runtime          0
poster_path      0
imdb_id          0
year             0
popularity       0
averageRating    0
numVotes         0
dtype: int64

In [104]:
df_final.to_csv('../data/movie.csv', index=False)
df_tmdb = pd.read_csv("tmdb_full.csv", low_memory=False)

In [105]:
import pandas as pd

df_check = pd.read_csv('../data/movie.csv')
df_check.head()


Unnamed: 0,title,genres,overview,rating,runtime,poster_path,imdb_id,year,popularity,averageRating,numVotes
0,Love at Twenty,"['Drama', 'Romance']",Love at Twenty unites five directors from five...,6.7,110,/aup2QCYCsyEeQfpboXy0f4uj8aE.jpg,tt0055747,1962.0,3.77,7.1,2559.0
1,Ariel,"['Drama', 'Comedy', 'Crime']",Taisto Kasurinen is a Finnish coal miner whose...,7.046,73,/ojDg0PGvs6R9xYFodRct2kdI6wC.jpg,tt0094675,1988.0,9.214,7.4,9601.0
2,Shadows in Paradise,"['Drama', 'Comedy', 'Romance']","An episode in the life of Nikander, a garbage ...",7.182,76,/nj01hspawPof0mJmlgfjuLyJuRN.jpg,tt0092149,1986.0,6.282,7.4,8445.0
3,Four Rooms,"['Crime', 'Comedy']",It's Ted the Bellhop's first night on the job....,5.758,98,/75aHn1NOYXh4M7L5shoeQ6NGykP.jpg,tt0113101,1995.0,18.734,6.7,116269.0
4,Judgment Night,"['Action', 'Crime', 'Thriller']","While racing to a boxing match, Frank, Mike, J...",6.6,109,/3rvvpS9YPM5HB2f4HYiNiJVtdam.jpg,tt0107286,1993.0,10.797,6.6,20804.0


In [106]:
df_check.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132948 entries, 0 to 132947
Data columns (total 11 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   title          132948 non-null  object 
 1   genres         132948 non-null  object 
 2   overview       126751 non-null  object 
 3   rating         132948 non-null  float64
 4   runtime        132948 non-null  int64  
 5   poster_path    132948 non-null  object 
 6   imdb_id        132948 non-null  object 
 7   year           132948 non-null  float64
 8   popularity     132948 non-null  float64
 9   averageRating  132948 non-null  float64
 10  numVotes       132948 non-null  float64
dtypes: float64(5), int64(1), object(5)
memory usage: 11.2+ MB


In [107]:
#Actors, producer, runtime are missing in the final dataset
import pandas as pd
# Load title.principals.tsv.gz in chunks
#En Python, le découpage de listes en morceaux (chunks) peut être réalisé via des fonctions personnalisées ou des outils comme itertools.zip_longest. 
# La fonction chunks permet de diviser une liste en sous-listes de taille fixe, utile pour traiter des données par groupes. 
# Cette technique est également appliquée dans des frameworks comme LangChain pour le traitement de texte, 
# où un grand document est segmenté en morceaux plus gérables pour l'analyse ou l'embedding.

url_principals = "https://datasets.imdbws.com/title.principals.tsv.gz"

chunks = pd.read_csv(
    url_principals,
    sep='\t',
    compression='gzip',
    chunksize=500_000
)
#df_principals = pd.concat(chunk[chunk['titleId'].isin(df_check['imdb_id'])] for chunk in chunks)
#tconst (ID film)
#nconst (ID people)


In [108]:
# Get the first chunk
first_chunk = next(chunks)
first_chunk.head()


Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0000001,1,nm1588970,self,\N,"[""Self""]"
1,tt0000001,2,nm0005690,director,\N,\N
2,tt0000001,3,nm0005690,producer,producer,\N
3,tt0000001,4,nm0374658,cinematographer,director of photography,\N
4,tt0000002,1,nm0721526,director,\N,\N


In [109]:
# Filter for actors, actresses, producers, directors
for chunk in chunks:
    people_chunk = chunk[
        chunk['category'].isin([
            'actor',
            'actress',
            'producer',
            'director'
        ])
    ]
    print(people_chunk['category'].value_counts())
    break


category
actor       192880
actress      89468
director     30354
producer     25012
Name: count, dtype: int64


In [110]:
# Further filter to only include titles in our tmdb dataset
for chunk in chunks:
    people_chunk = chunk[
        chunk['category'].isin([
            'actor',
            'actress',
            'producer',
            'director'
        ])
    ]

    people_chunk = people_chunk[
        people_chunk['tconst'].isin(df_tmdb['imdb_id'])
    ]

    print(people_chunk.shape)
    break


(227456, 6)


In [111]:
# Reload chunks iterator
chunks = pd.read_csv(
    url_principals,
    sep='\t',
    compression='gzip',
    chunksize=500_000
)




In [112]:
# Write filtered people to CSV
people_file = '../data/people_temp.csv'

import os
# Remove file if it exists
if os.path.exists(people_file):
    os.remove(people_file)

first_write = True

for chunk in chunks:

    people_chunk = chunk[
        chunk['category'].isin([
            'actor',
            'actress',
            'producer',
            'director'
        ])
    ]

    people_chunk = people_chunk[
        people_chunk['tconst'].isin(df_tmdb['imdb_id'])
    ]

    if not people_chunk.empty:
        people_chunk.to_csv(
            people_file,
            mode='w' if first_write else 'a',
            index=False,
            header=first_write
        )
        first_write = False





In [113]:
# Load the filtered people data
df_people = pd.read_csv('../data/people_temp.csv')
df_people.shape


(3000691, 6)

In [114]:
import pandas as pd
# Load the filtered people data

df_people = pd.read_csv('../data/people_temp.csv', usecols=['nconst'])
needed_nconst = set(df_people['nconst'].unique())

len(needed_nconst)


915715

In [115]:
url_names = "https://datasets.imdbws.com/name.basics.tsv.gz"
#Load name.basics.tsv.gz in chunks
names_chunks = pd.read_csv(
    url_names,
    sep='\t',
    compression='gzip',
    chunksize=500_000,
    usecols=['nconst', 'primaryName']
)


In [116]:
names_file = '../data/names_temp.csv'

import os
# Remove file if it exists
if os.path.exists(names_file):
    os.remove(names_file)

first_write = True

for chunk in names_chunks:

    names_chunk = chunk[
        chunk['nconst'].isin(needed_nconst)
    ]

    if not names_chunk.empty:
        names_chunk.to_csv(
            names_file,
            mode='w' if first_write else 'a',
            index=False,
            header=first_write
        )
        first_write = False


In [117]:
df_names = pd.read_csv('../data/names_temp.csv')
df_names.shape


(915693, 2)

In [118]:
# merge names with people: nconst, primaryName
df_people = pd.read_csv('../data/people_temp.csv')

df_people = df_people.merge(
    df_names,
    on='nconst',
    how='left'
)


In [119]:
df_people[['tconst', 'category', 'primaryName']].head()


Unnamed: 0,tconst,category,primaryName
0,tt0000001,director,William K.L. Dickson
1,tt0000001,producer,William K.L. Dickson
2,tt0000002,director,Émile Reynaud
3,tt0000003,director,Émile Reynaud
4,tt0000003,producer,Julien Pappé


In [120]:
df_people_small = df_people[['tconst', 'category', 'primaryName']]
df_people_small.head()


Unnamed: 0,tconst,category,primaryName
0,tt0000001,director,William K.L. Dickson
1,tt0000001,producer,William K.L. Dickson
2,tt0000002,director,Émile Reynaud
3,tt0000003,director,Émile Reynaud
4,tt0000003,producer,Julien Pappé


In [121]:
# Filter only actors and actresses
df_actors = df_people_small[
    (df_people_small['category'] == 'actor') |
    (df_people_small['category'] == 'actress')
]
df_actors.head()


Unnamed: 0,tconst,category,primaryName
7,tt0000005,actor,Charles Kayser
8,tt0000005,actor,John Ott
10,tt0000007,actor,James J. Corbett
11,tt0000007,actor,Peter Courtney
16,tt0000008,actor,Fred Ott


In [122]:
# Group actors by tconst
df_actors_grouped = (
    df_actors
    .dropna(subset=['primaryName'])
    .groupby('tconst')['primaryName']
    .apply(lambda x: ', '.join(pd.unique(x.astype(str))))
    .reset_index()
    .rename(columns={'primaryName': 'actors'})
)


In [123]:
df_actors_grouped.head()

Unnamed: 0,tconst,actors
0,tt0000005,"Charles Kayser, John Ott"
1,tt0000007,"James J. Corbett, Peter Courtney"
2,tt0000008,Fred Ott
3,tt0000011,Grunato
4,tt0000014,"François Clerc, Benoît Duval"


In [124]:
df_actors.shape
#df_people_small.head()

(2277780, 3)

In [125]:
#df_producers 
df_producers = df_people_small[
    (df_people_small['category'] == 'producer')]
df_producers.shape


(396159, 3)

In [126]:
df_producers['primaryName'].head()
# Merge producers with actors

1     William K.L. Dickson
4             Julien Pappé
5            Émile Reynaud
9         Thomas A. Edison
14    William K.L. Dickson
Name: primaryName, dtype: object

In [127]:
df_producers = df_producers.dropna(subset=['primaryName'])
df_producers_grouped = (
    df_producers
    .groupby('tconst')['primaryName']
    .apply(lambda x: ', '.join(pd.unique(x.astype(str))))
    .reset_index()
    .rename(columns={'primaryName': 'producers'})
)

In [128]:
df_producers_grouped.head()


Unnamed: 0,tconst,producers
0,tt0000001,William K.L. Dickson
1,tt0000003,"Julien Pappé, Émile Reynaud"
2,tt0000005,Thomas A. Edison
3,tt0000007,"William K.L. Dickson, Thomas A. Edison"
4,tt0000008,William K.L. Dickson


In [129]:
print(df_tmdb.columns.tolist())

['adult', 'backdrop_path', 'budget', 'genres', 'homepage', 'id', 'imdb_id', 'original_language', 'original_title', 'overview', 'popularity', 'poster_path', 'production_countries', 'release_date', 'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title', 'video', 'vote_average', 'vote_count', 'production_companies_name', 'production_companies_country']


In [130]:

if 'release_date' in df_tmdb.columns:
   
    df_tmdb['year'] = pd.to_datetime(df_tmdb['release_date'], errors='coerce').dt.year

if 'vote_count' in df_tmdb.columns and 'numVotes' not in df_tmdb.columns:
     
    df_tmdb['numVotes'] = df_tmdb['vote_count']


cols_to_keep = [
    'imdb_id', 'title', 'genres', 'overview', 'vote_average', 
    'runtime', 'poster_path', 'numVotes', 'year', 'popularity'
]

df_final = df_tmdb[cols_to_keep].copy()

df_final = df_final.rename(columns={'vote_average': 'rating'})

In [131]:
df_final = df_tmdb[[
    'imdb_id',
    'title',
    'genres',
    'overview',
    'vote_average',
    'runtime',
    'poster_path',
    'popularity'
]].copy()

df_final = df_final.rename(columns={
    'vote_average': 'rating'
})


In [132]:
df_final.columns
df_final.shape



(309572, 8)

In [133]:
df_final['year'] = pd.to_datetime(
    df_tmdb['release_date'],
    errors='coerce'
).dt.year


In [134]:
df_final['year'].isna().sum()


np.int64(8233)

In [135]:
import pandas as pd

url_ratings = "https://datasets.imdbws.com/title.ratings.tsv.gz"

df_ratings = pd.read_csv(
    url_ratings,
    sep='\t',
    compression='gzip'
)

df_ratings.head()


Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2190
1,tt0000002,5.5,309
2,tt0000003,6.5,2290
3,tt0000004,5.1,196
4,tt0000005,6.2,3023


In [136]:
df_final = df_final.merge(
    df_ratings[['tconst', 'numVotes']],
    left_on='imdb_id',
    right_on='tconst',
    how='left'
)

df_final = df_final.drop(columns='tconst')


In [137]:
df_final[['rating', 'numVotes']].head()
df_final['numVotes'].isna().sum()


np.int64(35765)

In [138]:
df_final = df_final[df_final['numVotes'] >= 50]
df_final.shape


(198165, 10)

In [139]:
df_final.head()

Unnamed: 0,imdb_id,title,genres,overview,rating,runtime,poster_path,popularity,year,numVotes
0,tt0029927,Blondie,['Comedy'],Blondie and Dagwood are about to celebrate the...,7.214,70,/zBiHKhXklvTFwj4M1uEUcQGAVJ.jpg,2.852,1938.0,929.0
2,tt0055747,Love at Twenty,"['Drama', 'Romance']",Love at Twenty unites five directors from five...,6.7,110,/aup2QCYCsyEeQfpboXy0f4uj8aE.jpg,3.77,1962.0,2559.0
3,tt0094675,Ariel,"['Drama', 'Comedy', 'Crime']",Taisto Kasurinen is a Finnish coal miner whose...,7.046,73,/ojDg0PGvs6R9xYFodRct2kdI6wC.jpg,9.214,1988.0,9601.0
4,tt0092149,Shadows in Paradise,"['Drama', 'Comedy', 'Romance']","An episode in the life of Nikander, a garbage ...",7.182,76,/nj01hspawPof0mJmlgfjuLyJuRN.jpg,6.282,1986.0,8445.0
5,tt0113101,Four Rooms,"['Crime', 'Comedy']",It's Ted the Bellhop's first night on the job....,5.758,98,/75aHn1NOYXh4M7L5shoeQ6NGykP.jpg,18.734,1995.0,116269.0


In [140]:
#final cleaning
df_final[['rating', 'overview']].isna().sum()


rating         0
overview    7109
dtype: int64

In [141]:
df_final['overview'] = df_final['overview'].fillna('')
df_final['rating'] = df_final['rating'].fillna(0.0)

In [142]:
df_final = df_final[df_final['rating'] >= 1.0]
df_final.shape

(177556, 10)

In [143]:
#reset index
df_final = df_final.reset_index(drop=True)
df_final.head()

Unnamed: 0,imdb_id,title,genres,overview,rating,runtime,poster_path,popularity,year,numVotes
0,tt0029927,Blondie,['Comedy'],Blondie and Dagwood are about to celebrate the...,7.214,70,/zBiHKhXklvTFwj4M1uEUcQGAVJ.jpg,2.852,1938.0,929.0
1,tt0055747,Love at Twenty,"['Drama', 'Romance']",Love at Twenty unites five directors from five...,6.7,110,/aup2QCYCsyEeQfpboXy0f4uj8aE.jpg,3.77,1962.0,2559.0
2,tt0094675,Ariel,"['Drama', 'Comedy', 'Crime']",Taisto Kasurinen is a Finnish coal miner whose...,7.046,73,/ojDg0PGvs6R9xYFodRct2kdI6wC.jpg,9.214,1988.0,9601.0
3,tt0092149,Shadows in Paradise,"['Drama', 'Comedy', 'Romance']","An episode in the life of Nikander, a garbage ...",7.182,76,/nj01hspawPof0mJmlgfjuLyJuRN.jpg,6.282,1986.0,8445.0
4,tt0113101,Four Rooms,"['Crime', 'Comedy']",It's Ted the Bellhop's first night on the job....,5.758,98,/75aHn1NOYXh4M7L5shoeQ6NGykP.jpg,18.734,1995.0,116269.0


In [144]:
df_final[['rating', 'overview']].isna().sum()


rating      0
overview    0
dtype: int64

In [145]:
df_actors_grouped = (
    df_actors
    .groupby('tconst')['primaryName']
    .apply(lambda x: ', '.join(x.astype(str).unique()))
    .reset_index()
    .rename(columns={
        'tconst': 'imdb_id',
        'primaryName': 'actors'
    })
)
df_final = df_final.merge(
    df_actors_grouped,
    on='imdb_id',
    how='left'
)




In [146]:
df_final['actors'] = df_final['actors'].fillna('')
df_final.head()

Unnamed: 0,imdb_id,title,genres,overview,rating,runtime,poster_path,popularity,year,numVotes,actors
0,tt0029927,Blondie,['Comedy'],Blondie and Dagwood are about to celebrate the...,7.214,70,/zBiHKhXklvTFwj4M1uEUcQGAVJ.jpg,2.852,1938.0,929.0,"Penny Singleton, Arthur Lake, Larry Simms, Dai..."
1,tt0055747,Love at Twenty,"['Drama', 'Romance']",Love at Twenty unites five directors from five...,6.7,110,/aup2QCYCsyEeQfpboXy0f4uj8aE.jpg,3.77,1962.0,2559.0,"Jean-Pierre Léaud, Marie-France Pisier, Eleono..."
2,tt0094675,Ariel,"['Drama', 'Comedy', 'Crime']",Taisto Kasurinen is a Finnish coal miner whose...,7.046,73,/ojDg0PGvs6R9xYFodRct2kdI6wC.jpg,9.214,1988.0,9601.0,"Turo Pajala, Susanna Haavisto, Matti Pellonpää..."
3,tt0092149,Shadows in Paradise,"['Drama', 'Comedy', 'Romance']","An episode in the life of Nikander, a garbage ...",7.182,76,/nj01hspawPof0mJmlgfjuLyJuRN.jpg,6.282,1986.0,8445.0,"Matti Pellonpää, Kati Outinen, Sakari Kuosmane..."
4,tt0113101,Four Rooms,"['Crime', 'Comedy']",It's Ted the Bellhop's first night on the job....,5.758,98,/75aHn1NOYXh4M7L5shoeQ6NGykP.jpg,18.734,1995.0,116269.0,"Tim Roth, Antonio Banderas, Sammi Davis, Amand..."


In [147]:
#add producers
df_final = df_final.merge(
    df_producers_grouped.rename(columns={'tconst': 'imdb_id'}),
    on='imdb_id',
    how='left'
)
df_final.head()

Unnamed: 0,imdb_id,title,genres,overview,rating,runtime,poster_path,popularity,year,numVotes,actors,producers
0,tt0029927,Blondie,['Comedy'],Blondie and Dagwood are about to celebrate the...,7.214,70,/zBiHKhXklvTFwj4M1uEUcQGAVJ.jpg,2.852,1938.0,929.0,"Penny Singleton, Arthur Lake, Larry Simms, Dai...",
1,tt0055747,Love at Twenty,"['Drama', 'Romance']",Love at Twenty unites five directors from five...,6.7,110,/aup2QCYCsyEeQfpboXy0f4uj8aE.jpg,3.77,1962.0,2559.0,"Jean-Pierre Léaud, Marie-France Pisier, Eleono...",Pierre Roustang
2,tt0094675,Ariel,"['Drama', 'Comedy', 'Crime']",Taisto Kasurinen is a Finnish coal miner whose...,7.046,73,/ojDg0PGvs6R9xYFodRct2kdI6wC.jpg,9.214,1988.0,9601.0,"Turo Pajala, Susanna Haavisto, Matti Pellonpää...",Aki Kaurismäki
3,tt0092149,Shadows in Paradise,"['Drama', 'Comedy', 'Romance']","An episode in the life of Nikander, a garbage ...",7.182,76,/nj01hspawPof0mJmlgfjuLyJuRN.jpg,6.282,1986.0,8445.0,"Matti Pellonpää, Kati Outinen, Sakari Kuosmane...",Mika Kaurismäki
4,tt0113101,Four Rooms,"['Crime', 'Comedy']",It's Ted the Bellhop's first night on the job....,5.758,98,/75aHn1NOYXh4M7L5shoeQ6NGykP.jpg,18.734,1995.0,116269.0,"Tim Roth, Antonio Banderas, Sammi Davis, Amand...",Lawrence Bender


In [148]:
df_final['producers'] = df_final['producers'].fillna('')
df_final.isna().sum()


imdb_id           0
title             0
genres            0
overview          0
rating            0
runtime           0
poster_path    9443
popularity        0
year            136
numVotes          0
actors            0
producers         0
dtype: int64

In [149]:
df_final['poster_path'] = df_final['poster_path'].fillna('')
df_final['year'] = df_final['year'].fillna('')
df_final.isna().sum()

imdb_id        0
title          0
genres         0
overview       0
rating         0
runtime        0
poster_path    0
popularity     0
year           0
numVotes       0
actors         0
producers      0
dtype: int64

In [150]:
df_final.to_csv('../data/movie.csv', index=False)
df_final.head()

Unnamed: 0,imdb_id,title,genres,overview,rating,runtime,poster_path,popularity,year,numVotes,actors,producers
0,tt0029927,Blondie,['Comedy'],Blondie and Dagwood are about to celebrate the...,7.214,70,/zBiHKhXklvTFwj4M1uEUcQGAVJ.jpg,2.852,1938.0,929.0,"Penny Singleton, Arthur Lake, Larry Simms, Dai...",
1,tt0055747,Love at Twenty,"['Drama', 'Romance']",Love at Twenty unites five directors from five...,6.7,110,/aup2QCYCsyEeQfpboXy0f4uj8aE.jpg,3.77,1962.0,2559.0,"Jean-Pierre Léaud, Marie-France Pisier, Eleono...",Pierre Roustang
2,tt0094675,Ariel,"['Drama', 'Comedy', 'Crime']",Taisto Kasurinen is a Finnish coal miner whose...,7.046,73,/ojDg0PGvs6R9xYFodRct2kdI6wC.jpg,9.214,1988.0,9601.0,"Turo Pajala, Susanna Haavisto, Matti Pellonpää...",Aki Kaurismäki
3,tt0092149,Shadows in Paradise,"['Drama', 'Comedy', 'Romance']","An episode in the life of Nikander, a garbage ...",7.182,76,/nj01hspawPof0mJmlgfjuLyJuRN.jpg,6.282,1986.0,8445.0,"Matti Pellonpää, Kati Outinen, Sakari Kuosmane...",Mika Kaurismäki
4,tt0113101,Four Rooms,"['Crime', 'Comedy']",It's Ted the Bellhop's first night on the job....,5.758,98,/75aHn1NOYXh4M7L5shoeQ6NGykP.jpg,18.734,1995.0,116269.0,"Tim Roth, Antonio Banderas, Sammi Davis, Amand...",Lawrence Bender


In [151]:
df_check = pd.read_csv('../data/movie.csv')
df_check.isna().sum()
df_check.shape


(177556, 12)

In [None]:
df_final.to_csv("../data/df_ml_ready.csv", index=False)

NameError: name 'df_final' is not defined


KeyboardInterrupt



In [None]:


import requests
import time


TMDB_API_KEY = "secrets["TMDB_API_KEY"]"

def get_french_overview(imdb_id):
    """Функция для получения описания на французском через API"""
    try:
        # recherche TMDB ID et IMDb ID
        url_find = f"https://api.themoviedb.org/3/find/{imdb_id}?api_key={TMDB_API_KEY}&external_source=imdb_id"
        res = requests.get(url_find).json()
        
        movie_results = res.get('movie_results', [])
        if not movie_results:
            return None
        
        tmdb_id = movie_results[0]['id']
        
        
        url_detail = f"https://api.themoviedb.org/3/movie/{tmdb_id}?api_key={TMDB_API_KEY}&language=fr-FR"
        movie_data = requests.get(url_detail).json()
        
        return movie_data.get('overview')
    except:
        return None




#example popularity
df_to_translate = df_final.sort_values(by='popularity', ascending=False).head(1000)

for idx, row in df_to_translate.iterrows():
    fr_desc = get_french_overview(row['imdb_id'])
    if fr_desc and len(fr_desc) > 10: 
        df_final.at[idx, 'overview'] = fr_desc
    
    # api (40 demandes en 10 sec)
    time.sleep(0.1) 

print("traduction terminée!")


traduction terminée!


In [6]:
# filtration
df_ml_ready = df_final[
    (df_final['rating'] >= 6.0) & 
    (df_final['numVotes'] >= 1000)
].copy()

# les donnees pour STREAMLIT

# acteurs si nessasaire
if 'actors' not in df_ml_ready.columns and 'df_actors_grouped' in locals():
    df_ml_ready = df_ml_ready.merge(df_actors_grouped, on='imdb_id', how='left')

# les posters
def make_poster_url(path):
    if pd.isna(path) or path == "" or str(path) == "nan":
        return "https://via.placeholder.com/300x450?text=No+Poster"
    if str(path).startswith('http'):
        return path
    return f"https://image.tmdb.org/t/p/w500{path}"

df_ml_ready['poster_url'] = df_ml_ready['poster_path'].apply(make_poster_url)

# text pour ML (KNN)
df_ml_ready['genres_text'] = df_ml_ready['genres'].astype(str).str.replace(r"[\[\]']", "", regex=True)

# remplacer NAN
df_ml_ready['overview'] = df_ml_ready['overview'].fillna("Description bientôt disponible.")
df_ml_ready['actors'] = df_ml_ready['actors'].fillna("Casting non disponible")

# sevgarde
df_ml_ready.to_csv('../data/df_ml_ready.csv', index=False)
df_ml_ready.to_csv('../data/movie.csv', index=False)

print(f"✅ sevgarder {len(df_ml_ready)} films.")
print("posters, acteurs, genres prettes pour ML.")

NameError: name 'df_final' is not defined

In [5]:
df_ml_ready.info()

NameError: name 'df_ml_ready' is not defined

In [7]:
print(df_ml_ready[['title', 'imdb_id']].head())

NameError: name 'df_ml_ready' is not defined

In [163]:
# Проверим, есть ли хоть одно длинное описание (французские обычно длиннее "No description")
print(df_ml_ready['overview'].str.len().describe())

# Посмотрим на первые 5 строк, где должен быть перевод
print(df_ml_ready.sort_values(by='popularity', ascending=False)[['title', 'overview']].head())

count    32467.000000
mean       288.857640
std        172.239963
min          0.000000
25%        157.000000
50%        244.000000
75%        382.000000
max       1002.000000
Name: overview, dtype: float64
                                    title  \
23537                 The Pope's Exorcist   
30260   Ant-Man and the Wasp: Quantumania   
43941         The Super Mario Bros. Movie   
100982           Avatar: The Way of Water   
18663                             Ghosted   

                                                 overview  
23537   Inspiré des véritables archives du Père Gabrie...  
30260   Tout va pour le mieux : Scott a écrit un livre...  
43941   Alors qu’ils tentent de réparer une canalisati...  
100982  Une dizaine d'années s'est écoulée depuis les ...  
18663   Cole, un agriculteur sans histoire, tombe éper...  
