In [3]:
pip install gdown


Note: you may need to restart the kernel to use updated packages.


In [4]:
from datetime import datetime
from datetime import timedelta
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import gdown

In [5]:
# ID du fichier Google Drive
file_id_tmdb = '1SSeKdAQQqU6QbWwKfiKNcqNEqHxbwGm3'
url_tmdb = f'https://drive.google.com/uc?id={file_id_tmdb}'

# Chemin de sauvegarde du fichier téléchargé
output = 'tmdb.csv'

# Télécharger le fichier
gdown.download(url_tmdb, output, quiet=False)

# Lire le fichier CSV avec pandas
try:
    df_tmdb = pd.read_csv(output, delimiter=',', on_bad_lines='skip', engine='python')
    if df_tmdb.empty:
        print("Le fichier est vide.")
    else:
        # Afficher les premières lignes du DataFrame
        display(df_tmdb.head())
except pd.errors.EmptyDataError:
    print("Aucune donnée à analyser dans le fichier.")
except Exception as e:
    print(f"Une erreur est survenue : {e}")


Downloading...
From (original): https://drive.google.com/uc?id=1SSeKdAQQqU6QbWwKfiKNcqNEqHxbwGm3
From (redirected): https://drive.google.com/uc?id=1SSeKdAQQqU6QbWwKfiKNcqNEqHxbwGm3&confirm=t&uuid=33a51c19-30d9-41ae-af25-d32dcde7d975
To: c:\Users\Win10\Desktop\projet 2 bdd\tmdb.csv
100%|██████████| 157M/157M [00:15<00:00, 10.3MB/s] 


Unnamed: 0,adult,backdrop_path,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,production_companies_name,production_companies_country
0,False,/dvQj1GBZAZirz1skEEZyWH2ZqQP.jpg,0,['Comedy'],,3924,tt0029927,en,Blondie,Blondie and Dagwood are about to celebrate the...,...,70,['en'],Released,The favorite comic strip of millions at last o...,Blondie,False,7.214,7,['Columbia Pictures'],['US']
1,False,,0,['Adventure'],,6124,tt0011436,de,Der Mann ohne Namen,,...,420,[],Released,,"Peter Voss, Thief of Millions",False,0.0,0,[],[]
2,False,/uJlc4aNPF3Y8yAqahJTKBwgwPVW.jpg,0,"['Drama', 'Romance']",,8773,tt0055747,fr,L'Amour à vingt ans,Love at Twenty unites five directors from five...,...,110,"['it', 'ja', 'pl', 'fr', 'de']",Released,The Intimate Secrets of Young Lovers,Love at Twenty,False,6.7,41,"['Ulysse Productions', 'Unitec Films', 'Cinese...","['', 'NZ', 'IT', 'JP', 'DE', 'PL', '']"
3,False,/hQ4pYsIbP22TMXOUdSfC2mjWrO0.jpg,0,"['Drama', 'Comedy', 'Crime']",,2,tt0094675,fi,Ariel,Taisto Kasurinen is a Finnish coal miner whose...,...,73,['fi'],Released,,Ariel,False,7.046,248,['Villealfa Filmproductions'],['FI']
4,False,/l94l89eMmFKh7na2a1u5q67VgNx.jpg,0,"['Drama', 'Comedy', 'Romance']",,3,tt0092149,fi,Varjoja paratiisissa,"An episode in the life of Nikander, a garbage ...",...,76,['en'],Released,,Shadows in Paradise,False,7.182,269,['Villealfa Filmproductions'],['FI']


In [6]:
df_tmdb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 309572 entries, 0 to 309571
Data columns (total 25 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   adult                         309572 non-null  bool   
 1   backdrop_path                 151760 non-null  object 
 2   budget                        309572 non-null  int64  
 3   genres                        309572 non-null  object 
 4   homepage                      44262 non-null   object 
 5   id                            309572 non-null  int64  
 6   imdb_id                       309572 non-null  object 
 7   original_language             309572 non-null  object 
 8   original_title                309572 non-null  object 
 9   overview                      282512 non-null  object 
 10  popularity                    309572 non-null  float64
 11  poster_path                   264159 non-null  object 
 12  production_countries          309572 non-nul

In [7]:
# Vérifier un échantillon du contenu de la colonne 'genres'
print(df_tmdb['genres'].head())

0                        ['Comedy']
1                     ['Adventure']
2              ['Drama', 'Romance']
3      ['Drama', 'Comedy', 'Crime']
4    ['Drama', 'Comedy', 'Romance']
Name: genres, dtype: object


In [8]:
# Convertir chaque élément de la colonne 'genres' en une chaîne de caractères si nécessaire
# Supposons que chaque élément est une chaîne de caractères ou une liste de chaînes de caractères
# Vous pouvez ajuster la logique de conversion en fonction du contenu réel

def convert_genres(genres):
    if isinstance(genres, list):
        return ' '.join(genres)
    elif isinstance(genres, str):
        return genres
    else:
        return str(genres)

df_tmdb['genres'] = df_tmdb['genres'].apply(convert_genres)
# Vérifier que la conversion a réussi
print(df_tmdb['genres'].head())

0                        ['Comedy']
1                     ['Adventure']
2              ['Drama', 'Romance']
3      ['Drama', 'Comedy', 'Crime']
4    ['Drama', 'Comedy', 'Romance']
Name: genres, dtype: object


In [9]:
# Filtrer les lignes où 'genres' contient 'Adventure' ou 'Comedy'
df_tmdb_filtered = df_tmdb[df_tmdb['genres'].str.contains('Adventure|Comedy', na=False)]

# Afficher les 15 premières lignes du DataFrame filtré
display(df_tmdb_filtered.head(2))

Unnamed: 0,adult,backdrop_path,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,production_companies_name,production_companies_country
0,False,/dvQj1GBZAZirz1skEEZyWH2ZqQP.jpg,0,['Comedy'],,3924,tt0029927,en,Blondie,Blondie and Dagwood are about to celebrate the...,...,70,['en'],Released,The favorite comic strip of millions at last o...,Blondie,False,7.214,7,['Columbia Pictures'],['US']
1,False,,0,['Adventure'],,6124,tt0011436,de,Der Mann ohne Namen,,...,420,[],Released,,"Peter Voss, Thief of Millions",False,0.0,0,[],[]


In [10]:
# Filtrer les lignes où 'spoken_languages' contient 'fr'
df_tmdb_fr = df_tmdb_filtered[df_tmdb_filtered['spoken_languages'].str.contains('fr', na=False)]
display(df_tmdb_fr)

Unnamed: 0,adult,backdrop_path,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,production_companies_name,production_companies_country
44,False,/ELsTifJ2lu4vsMhoHeZ5EnncHw.jpg,70000000,"['Comedy', 'Fantasy', 'Science Fiction']",https://www.warnerbros.com/movies/mars-attacks/,75,tt0116996,en,Mars Attacks!,A fleet of Martian spacecraft surrounds the wo...,...,106,"['en', 'fr']",Released,Nice planet. We'll take it!,Mars Attacks!,False,6.394,4864,"['Tim Burton Productions', 'Warner Bros. Pictu...","['', 'US']"
77,False,/2eBL3wXDZqemW8SJ1RNxqtx6bgJ.jpg,0,"['Comedy', 'Drama', 'Mystery']",,109,tt0111507,fr,Trois couleurs : Blanc,Polish immigrant Karol Karol finds himself out...,...,92,"['fr', 'hu', 'pl']",Released,,Three Colors: White,False,7.482,989,"['Le Studio Canal+', 'France 3 Cinéma', 'Eurim...","['FR', 'FR', 'FR', 'US']"
102,False,/oT9diLGD1cgPdvkxqHIJcuSXKUT.jpg,14600000,"['Romance', 'Fantasy', 'Drama', 'Comedy']",,137,tt0107048,en,Groundhog Day,"A narcissistic TV weatherman, along with his a...",...,101,"['en', 'fr', 'it']",Released,He’s having the worst day of his life… Over an...,Groundhog Day,False,7.603,6963,['Columbia Pictures'],['US']
130,False,/9B3NnLM7Uct4eSB5Th9aqVQa8uT.jpg,0,"['Drama', 'Romance', 'Comedy']",,166,tt0082100,fr,La Boum,A thirteen-year-old French girl deals with mov...,...,110,['fr'],Released,,The Party,False,6.605,1019,"['Productions Marcel Dassault', 'Gaumont', 'Ga...","['FR', 'FR', 'FR']"
135,False,/3oNDTUxv2lLWMqt5R12A0vM5dBn.jpg,0,"['Comedy', 'Romance', 'Drama']",,171,tt0083686,fr,La Boum 2,A young French teenage girl after moving to a ...,...,109,['fr'],Released,,The Party 2,False,6.454,570,"['Productions Marcel Dassault', 'Gaumont', 'Ga...","['FR', 'FR', 'FR']"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
308128,False,,0,"['Drama', 'Comedy']",,225255,tt0468834,lb,Perl oder Pica,"October 1962, the height of the Cuban missile ...",...,90,"['fr', 'lb']",Released,,Little Secrets,False,5.300,9,"['Red Lion', 'Amour Fou Filmproduktion']",
308260,False,/z9kgYGIZbp4vI4LwOrMM4kCe1Je.jpg,0,"['Comedy', 'TV Movie']",,226288,tt0313313,fr,Le Bourgeois gentilhomme,Le Bourgeois gentilhomme satirizes attempts at...,...,130,['fr'],Released,,Would-Be Gentleman,False,0.000,0,['Office de Radiodiffusion Télévision Française'],
308324,False,,0,['Comedy'],,226855,tt0277844,fr,Max en convalescence,"To reassure his fans, Max Linder has been film...",...,11,['fr'],Released,,Max Is Convalescent,False,6.200,8,['Pathé Frères'],
308413,False,,0,"['Comedy', 'TV Movie']",,227460,tt0303166,fr,Ubu Roi,Based on Alfredy Jarrry's 1896 play about a gr...,...,95,['fr'],Released,,Ubu Roi,False,5.100,5,['Office de Radiodiffusion Télévision Française'],


In [11]:
# Obtenir les valeurs uniques dans la colonne 'original_language'
fr_en = df_tmdb_fr['original_language'].unique()
print(fr_en)

['en' 'fr' 'it' 'de' 'el' 'sv' 'cs' 'nl' 'ar' 'es' 'ro' 'ja' 'cn' 'sr'
 'bo' 'hi' 'da' 'pl' 'ru' 'he' 'is' 'tr' 'et' 'pt' 'eu' 'fi' 'uk' 'fa'
 'hu' 'ca' 'zh' 'no' 'lb' 'gd' 'ko' 'th' 'hy' 'ka' 'wo' 'cr' 'fo' 'ml'
 'sh' 'xx' 'id']


In [12]:
# Obtenir les valeurs uniques dans la colonne 'status'
stat = df_tmdb_fr['status'].unique()
print(stat)

['Released' 'Planned' 'In Production' 'Post Production']


In [13]:
# Filtrer les lignes où 'status' contient 'Released'
df_tmdb_fr = df_tmdb_fr[df_tmdb_fr['status'].str.contains('Released', na=False)]
display(df_tmdb_fr)

Unnamed: 0,adult,backdrop_path,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,production_companies_name,production_companies_country
44,False,/ELsTifJ2lu4vsMhoHeZ5EnncHw.jpg,70000000,"['Comedy', 'Fantasy', 'Science Fiction']",https://www.warnerbros.com/movies/mars-attacks/,75,tt0116996,en,Mars Attacks!,A fleet of Martian spacecraft surrounds the wo...,...,106,"['en', 'fr']",Released,Nice planet. We'll take it!,Mars Attacks!,False,6.394,4864,"['Tim Burton Productions', 'Warner Bros. Pictu...","['', 'US']"
77,False,/2eBL3wXDZqemW8SJ1RNxqtx6bgJ.jpg,0,"['Comedy', 'Drama', 'Mystery']",,109,tt0111507,fr,Trois couleurs : Blanc,Polish immigrant Karol Karol finds himself out...,...,92,"['fr', 'hu', 'pl']",Released,,Three Colors: White,False,7.482,989,"['Le Studio Canal+', 'France 3 Cinéma', 'Eurim...","['FR', 'FR', 'FR', 'US']"
102,False,/oT9diLGD1cgPdvkxqHIJcuSXKUT.jpg,14600000,"['Romance', 'Fantasy', 'Drama', 'Comedy']",,137,tt0107048,en,Groundhog Day,"A narcissistic TV weatherman, along with his a...",...,101,"['en', 'fr', 'it']",Released,He’s having the worst day of his life… Over an...,Groundhog Day,False,7.603,6963,['Columbia Pictures'],['US']
130,False,/9B3NnLM7Uct4eSB5Th9aqVQa8uT.jpg,0,"['Drama', 'Romance', 'Comedy']",,166,tt0082100,fr,La Boum,A thirteen-year-old French girl deals with mov...,...,110,['fr'],Released,,The Party,False,6.605,1019,"['Productions Marcel Dassault', 'Gaumont', 'Ga...","['FR', 'FR', 'FR']"
135,False,/3oNDTUxv2lLWMqt5R12A0vM5dBn.jpg,0,"['Comedy', 'Romance', 'Drama']",,171,tt0083686,fr,La Boum 2,A young French teenage girl after moving to a ...,...,109,['fr'],Released,,The Party 2,False,6.454,570,"['Productions Marcel Dassault', 'Gaumont', 'Ga...","['FR', 'FR', 'FR']"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
308128,False,,0,"['Drama', 'Comedy']",,225255,tt0468834,lb,Perl oder Pica,"October 1962, the height of the Cuban missile ...",...,90,"['fr', 'lb']",Released,,Little Secrets,False,5.300,9,"['Red Lion', 'Amour Fou Filmproduktion']",
308260,False,/z9kgYGIZbp4vI4LwOrMM4kCe1Je.jpg,0,"['Comedy', 'TV Movie']",,226288,tt0313313,fr,Le Bourgeois gentilhomme,Le Bourgeois gentilhomme satirizes attempts at...,...,130,['fr'],Released,,Would-Be Gentleman,False,0.000,0,['Office de Radiodiffusion Télévision Française'],
308324,False,,0,['Comedy'],,226855,tt0277844,fr,Max en convalescence,"To reassure his fans, Max Linder has been film...",...,11,['fr'],Released,,Max Is Convalescent,False,6.200,8,['Pathé Frères'],
308413,False,,0,"['Comedy', 'TV Movie']",,227460,tt0303166,fr,Ubu Roi,Based on Alfredy Jarrry's 1896 play about a gr...,...,95,['fr'],Released,,Ubu Roi,False,5.100,5,['Office de Radiodiffusion Télévision Française'],


In [14]:
#supprimer les colonnes 'budget', 'id', 'vote_count' et 'revenue'
tmdb_1= df_tmdb_fr.drop(columns=['budget', 'id', 'vote_count', 'revenue'])
display(tmdb_1.head(2))

Unnamed: 0,adult,backdrop_path,genres,homepage,imdb_id,original_language,original_title,overview,popularity,poster_path,...,release_date,runtime,spoken_languages,status,tagline,title,video,vote_average,production_companies_name,production_companies_country
44,False,/ELsTifJ2lu4vsMhoHeZ5EnncHw.jpg,"['Comedy', 'Fantasy', 'Science Fiction']",https://www.warnerbros.com/movies/mars-attacks/,tt0116996,en,Mars Attacks!,A fleet of Martian spacecraft surrounds the wo...,21.051,/hll4O5vSAfnZDb6JbnP06GPtz7b.jpg,...,1996-12-12,106,"['en', 'fr']",Released,Nice planet. We'll take it!,Mars Attacks!,False,6.394,"['Tim Burton Productions', 'Warner Bros. Pictu...","['', 'US']"
77,False,/2eBL3wXDZqemW8SJ1RNxqtx6bgJ.jpg,"['Comedy', 'Drama', 'Mystery']",,tt0111507,fr,Trois couleurs : Blanc,Polish immigrant Karol Karol finds himself out...,13.473,/fdIet3NSa27gobMbaUml66oCQNT.jpg,...,1994-01-26,92,"['fr', 'hu', 'pl']",Released,,Three Colors: White,False,7.482,"['Le Studio Canal+', 'France 3 Cinéma', 'Eurim...","['FR', 'FR', 'FR', 'US']"


In [15]:
tmdb_1.head(2)

Unnamed: 0,adult,backdrop_path,genres,homepage,imdb_id,original_language,original_title,overview,popularity,poster_path,...,release_date,runtime,spoken_languages,status,tagline,title,video,vote_average,production_companies_name,production_companies_country
44,False,/ELsTifJ2lu4vsMhoHeZ5EnncHw.jpg,"['Comedy', 'Fantasy', 'Science Fiction']",https://www.warnerbros.com/movies/mars-attacks/,tt0116996,en,Mars Attacks!,A fleet of Martian spacecraft surrounds the wo...,21.051,/hll4O5vSAfnZDb6JbnP06GPtz7b.jpg,...,1996-12-12,106,"['en', 'fr']",Released,Nice planet. We'll take it!,Mars Attacks!,False,6.394,"['Tim Burton Productions', 'Warner Bros. Pictu...","['', 'US']"
77,False,/2eBL3wXDZqemW8SJ1RNxqtx6bgJ.jpg,"['Comedy', 'Drama', 'Mystery']",,tt0111507,fr,Trois couleurs : Blanc,Polish immigrant Karol Karol finds himself out...,13.473,/fdIet3NSa27gobMbaUml66oCQNT.jpg,...,1994-01-26,92,"['fr', 'hu', 'pl']",Released,,Three Colors: White,False,7.482,"['Le Studio Canal+', 'France 3 Cinéma', 'Eurim...","['FR', 'FR', 'FR', 'US']"


In [16]:
# Convertir 'release_date' en format datetime (si ce n'est pas déjà fait)
tmdb_1['release_date'] = pd.to_datetime(tmdb_1['release_date'], errors='coerce')

# Extraire l'année de la colonne 'release_date'
tmdb_1['release_year'] = tmdb_1['release_date'].dt.year

# Afficher les premières lignes pour vérifier le résultat
print(tmdb_1[['release_date', 'release_year']].head())


    release_date  release_year
44    1996-12-12        1996.0
77    1994-01-26        1994.0
102   1993-02-11        1993.0
130   1980-12-17        1980.0
135   1982-12-08        1982.0


In [17]:
tmdb_1.head(2)

Unnamed: 0,adult,backdrop_path,genres,homepage,imdb_id,original_language,original_title,overview,popularity,poster_path,...,runtime,spoken_languages,status,tagline,title,video,vote_average,production_companies_name,production_companies_country,release_year
44,False,/ELsTifJ2lu4vsMhoHeZ5EnncHw.jpg,"['Comedy', 'Fantasy', 'Science Fiction']",https://www.warnerbros.com/movies/mars-attacks/,tt0116996,en,Mars Attacks!,A fleet of Martian spacecraft surrounds the wo...,21.051,/hll4O5vSAfnZDb6JbnP06GPtz7b.jpg,...,106,"['en', 'fr']",Released,Nice planet. We'll take it!,Mars Attacks!,False,6.394,"['Tim Burton Productions', 'Warner Bros. Pictu...","['', 'US']",1996.0
77,False,/2eBL3wXDZqemW8SJ1RNxqtx6bgJ.jpg,"['Comedy', 'Drama', 'Mystery']",,tt0111507,fr,Trois couleurs : Blanc,Polish immigrant Karol Karol finds himself out...,13.473,/fdIet3NSa27gobMbaUml66oCQNT.jpg,...,92,"['fr', 'hu', 'pl']",Released,,Three Colors: White,False,7.482,"['Le Studio Canal+', 'France 3 Cinéma', 'Eurim...","['FR', 'FR', 'FR', 'US']",1994.0


In [18]:
# Filtrer les lignes où 'release_year' est supérieur ou égal à 1990
tmdb_90 = tmdb_1[tmdb_1['release_year'] >= 1990]

tmdb_90.shape


(4080, 22)

In [19]:
tmdb_90.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4080 entries, 44 to 308128
Data columns (total 22 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   adult                         4080 non-null   bool          
 1   backdrop_path                 2993 non-null   object        
 2   genres                        4080 non-null   object        
 3   homepage                      509 non-null    object        
 4   imdb_id                       4080 non-null   object        
 5   original_language             4080 non-null   object        
 6   original_title                4080 non-null   object        
 7   overview                      3661 non-null   object        
 8   popularity                    4080 non-null   float64       
 9   poster_path                   3930 non-null   object        
 10  production_countries          4080 non-null   object        
 11  release_date                  40

In [20]:
# ID du fichier Google Drive
file_id_table = '1BgXeRjXAJyZrAMY0oP3Y5pJ0xIW1clzf'

# URL pour télécharger le fichier
url_table = f'https://drive.google.com/uc?id={file_id_table}'

# Chemin de sauvegarde du fichier téléchargé
output = 'table_principale.csv'

# Télécharger le fichier
gdown.download(url_table, output, quiet=False)

# Lire le fichier CSV avec pandas
try:
    df_table = pd.read_csv(output, delimiter=',', on_bad_lines='skip', engine='python')
    if df_table.empty:
        print("Le fichier est vide.")
    else:
        # Afficher les premières lignes du DataFrame
        display(df_table.head())
except pd.errors.EmptyDataError:
    print("Aucune donnée à analyser dans le fichier.")
except Exception as e:
    print(f"Une erreur est survenue : {e}")


Downloading...
From: https://drive.google.com/uc?id=1BgXeRjXAJyZrAMY0oP3Y5pJ0xIW1clzf
To: c:\Users\Win10\Desktop\projet 2 bdd\table_principale.csv
100%|██████████| 1.29M/1.29M [00:00<00:00, 5.14MB/s]


Unnamed: 0,tconst,averageRating,numVotes,ordering,nconst,category,title,language,types,startYear,runtimeMinutes,genres,primaryName,birthYear
0,tt0139645,6.400675,54558.048108,1.0,nm0000103,actress,Life in the Fast Lane,en,imdbDisplay,1998.0,92.0,Comedy,Fairuza Balk,1974.0
1,tt0119304,5.1,9061.0,1.0,nm0000106,actress,Home Fries,en,imdbDisplay,1998.0,91.0,"Comedy,Drama,Romance",Drew Barrymore,1975.0
2,tt0120523,6.400675,54558.048108,1.0,nm0000106,actress,Wishful Thinking,en,imdbDisplay,1997.0,93.0,"Comedy,Romance",Drew Barrymore,1975.0
3,tt0151738,6.0,99378.0,1.0,nm0000106,actress,Never Been Kissed,en,imdbDisplay,1999.0,107.0,"Comedy,Drama,Romance",Drew Barrymore,1975.0
4,tt0200027,6.5,30914.0,1.0,nm0000106,actress,Riding in Cars with Boys,en,imdbDisplay,2001.0,132.0,"Biography,Comedy,Drama",Drew Barrymore,1975.0


In [21]:
# Renommer les colonnes
tmdb_90.rename(columns={'imdb_id': 'tconst', 'title': 'title_imdb'}, inplace=True)

# Afficher les premières lignes pour vérifier le changement
display(tmdb_90.head())


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmdb_90.rename(columns={'imdb_id': 'tconst', 'title': 'title_imdb'}, inplace=True)


Unnamed: 0,adult,backdrop_path,genres,homepage,tconst,original_language,original_title,overview,popularity,poster_path,...,runtime,spoken_languages,status,tagline,title_imdb,video,vote_average,production_companies_name,production_companies_country,release_year
44,False,/ELsTifJ2lu4vsMhoHeZ5EnncHw.jpg,"['Comedy', 'Fantasy', 'Science Fiction']",https://www.warnerbros.com/movies/mars-attacks/,tt0116996,en,Mars Attacks!,A fleet of Martian spacecraft surrounds the wo...,21.051,/hll4O5vSAfnZDb6JbnP06GPtz7b.jpg,...,106,"['en', 'fr']",Released,Nice planet. We'll take it!,Mars Attacks!,False,6.394,"['Tim Burton Productions', 'Warner Bros. Pictu...","['', 'US']",1996.0
77,False,/2eBL3wXDZqemW8SJ1RNxqtx6bgJ.jpg,"['Comedy', 'Drama', 'Mystery']",,tt0111507,fr,Trois couleurs : Blanc,Polish immigrant Karol Karol finds himself out...,13.473,/fdIet3NSa27gobMbaUml66oCQNT.jpg,...,92,"['fr', 'hu', 'pl']",Released,,Three Colors: White,False,7.482,"['Le Studio Canal+', 'France 3 Cinéma', 'Eurim...","['FR', 'FR', 'FR', 'US']",1994.0
102,False,/oT9diLGD1cgPdvkxqHIJcuSXKUT.jpg,"['Romance', 'Fantasy', 'Drama', 'Comedy']",,tt0107048,en,Groundhog Day,"A narcissistic TV weatherman, along with his a...",21.795,/qIwyDf5k0eNsrfQjd0HCkeraNGD.jpg,...,101,"['en', 'fr', 'it']",Released,He’s having the worst day of his life… Over an...,Groundhog Day,False,7.603,['Columbia Pictures'],['US'],1993.0
155,False,/9Y9K6LeLrMeofOvX7hZW36Aj3OG.jpg,"['Comedy', 'Romance']",https://www.miramax.com/movie/amelie/,tt0211915,fr,Le Fabuleux Destin d'Amélie Poulain,"At a tiny Parisian café, the adorable yet pain...",26.104,/oTKduWL2tpIKEmkAqF4mFEAWAsv.jpg,...,122,"['fr', 'ru']",Released,One person can change your life forever.,Amélie,False,7.902,"['France 3 Cinéma', 'Claudie Ossard Production...","['FR', '', '', '', '', '', 'FR', 'DE', 'FR', '...",2001.0
251,False,/bHePzkyRcMhnab2qZbhj1bCElnf.jpg,"['Comedy', 'Drama', 'Fantasy']",http://www.lasciencedesreves-lefilm.com/accuei...,tt0354899,fr,La Science des rêves,A man entranced by his dreams and imagination ...,8.741,/1qCq228LsNtUseCnNE7Naw6NBUz.jpg,...,105,"['en', 'fr', 'es']",Released,Close your eyes. Open your heart.,The Science of Sleep,False,7.052,"['Partizan Films', 'Gaumont', 'France 3 Cinéma...","['', 'FR', 'FR', 'FR', 'IT', 'FR']",2006.0


In [22]:
# Fusionner les deux DataFrames sur la colonne 'tconst'
merged_df = pd.merge(tmdb_90, df_table, on='tconst', how='outer')

# Afficher les premières lignes pour vérifier la fusion
display(merged_df.head())


Unnamed: 0,adult,backdrop_path,genres_x,homepage,tconst,original_language,original_title,overview,popularity,poster_path,...,nconst,category,title,language,types,startYear,runtimeMinutes,genres_y,primaryName,birthYear
0,False,/hfeiSfWYujh6MKhtGTXyK3DD4nN.jpg,"['Romance', 'Fantasy', 'Comedy']",,tt0035423,en,Kate & Leopold,When her scientist ex-boyfriend discovers a po...,15.77,/mUvikzKJJSg9khrVdxK8kg3TMHA.jpg,...,,,,,,,,,,
1,,,,,tt0096222,,,,,,...,nm0510903,actor,To Miss with Love,en,imdbDisplay,1992.0,90.0,"Comedy,Drama,Romance",Jimmy Lin,1974.0
2,False,/6tRdTNdcmdBlTGyJ924ISyJDABA.jpg,"['Comedy', 'Drama', 'Romance']",,tt0097106,fr,Conte de printemps,The story of an introverted young girl just r...,9.163,/bP2qpuzQPBBBJa4kOhvLvOOiASq.jpg,...,,,,,,,,,,
3,False,/uEPLeurEn2dp54f5owtMC3PeiPN.jpg,"['Crime', 'Action', 'Adventure']",,tt0097318,en,Eye of the Widow,A new international terrorist group attack the...,0.742,/1Ela9pkCRCPugwasp8hdR1BpfXb.jpg,...,,,,,,,,,,
4,False,/inHgpiOAddL4Y53EvHf9B5lf2Sc.jpg,"['Drama', 'Adventure']",,tt0097462,fr,Guerriers et Captives,"Patagonia, Argentina, 1880s. During the Conque...",0.96,/ytYs98qqVmiY45cxkqNgJbj8uDL.jpg,...,,,,,,,,,,


In [23]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12379 entries, 0 to 12378
Data columns (total 35 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   adult                         4080 non-null   object        
 1   backdrop_path                 2993 non-null   object        
 2   genres_x                      4080 non-null   object        
 3   homepage                      509 non-null    object        
 4   tconst                        12379 non-null  object        
 5   original_language             4080 non-null   object        
 6   original_title                4080 non-null   object        
 7   overview                      3661 non-null   object        
 8   popularity                    4080 non-null   float64       
 9   poster_path                   3930 non-null   object        
 10  production_countries          4080 non-null   object        
 11  release_date                

In [24]:
merged_df.columns

Index(['adult', 'backdrop_path', 'genres_x', 'homepage', 'tconst',
       'original_language', 'original_title', 'overview', 'popularity',
       'poster_path', 'production_countries', 'release_date', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title_imdb', 'video',
       'vote_average', 'production_companies_name',
       'production_companies_country', 'release_year', 'averageRating',
       'numVotes', 'ordering', 'nconst', 'category', 'title', 'language',
       'types', 'startYear', 'runtimeMinutes', 'genres_y', 'primaryName',
       'birthYear'],
      dtype='object')

In [25]:
# Combiner les valeurs des deux colonnes 'genre' en une seule
merged_df['genres'] = merged_df['genres_x'].fillna('') + ',' + merged_df['genres_y'].fillna('')

In [26]:
# Supprimer les colonnes supplémentaires 'genre_tmdb' et 'genre_table'
merged_df = merged_df.drop(columns=['genres_x', 'genres_y'])

# Afficher les premières lignes pour vérifier la fusion
display(merged_df.head())

Unnamed: 0,adult,backdrop_path,homepage,tconst,original_language,original_title,overview,popularity,poster_path,production_countries,...,nconst,category,title,language,types,startYear,runtimeMinutes,primaryName,birthYear,genres
0,False,/hfeiSfWYujh6MKhtGTXyK3DD4nN.jpg,,tt0035423,en,Kate & Leopold,When her scientist ex-boyfriend discovers a po...,15.77,/mUvikzKJJSg9khrVdxK8kg3TMHA.jpg,['US'],...,,,,,,,,,,"['Romance', 'Fantasy', 'Comedy'],"
1,,,,tt0096222,,,,,,,...,nm0510903,actor,To Miss with Love,en,imdbDisplay,1992.0,90.0,Jimmy Lin,1974.0,",Comedy,Drama,Romance"
2,False,/6tRdTNdcmdBlTGyJ924ISyJDABA.jpg,,tt0097106,fr,Conte de printemps,The story of an introverted young girl just r...,9.163,/bP2qpuzQPBBBJa4kOhvLvOOiASq.jpg,['FR'],...,,,,,,,,,,"['Comedy', 'Drama', 'Romance'],"
3,False,/uEPLeurEn2dp54f5owtMC3PeiPN.jpg,,tt0097318,en,Eye of the Widow,A new international terrorist group attack the...,0.742,/1Ela9pkCRCPugwasp8hdR1BpfXb.jpg,"['ES', 'FR']",...,,,,,,,,,,"['Crime', 'Action', 'Adventure'],"
4,False,/inHgpiOAddL4Y53EvHf9B5lf2Sc.jpg,,tt0097462,fr,Guerriers et Captives,"Patagonia, Argentina, 1880s. During the Conque...",0.96,/ytYs98qqVmiY45cxkqNgJbj8uDL.jpg,"['AR', 'FR', 'CH']",...,,,,,,,,,,"['Drama', 'Adventure'],"


In [27]:
merged_df.shape

(12379, 34)

In [28]:
# Trouver les NaN dans tout le DataFrame
nan_counts = merged_df.isna().sum()

# Afficher le nombre de NaN par colonne
print(nan_counts)


adult                            8299
backdrop_path                    9386
homepage                        11870
tconst                              0
original_language                8299
original_title                   8299
overview                         8718
popularity                       8299
poster_path                      8449
production_countries             8299
release_date                     8299
runtime                          8299
spoken_languages                 8299
status                           8299
tagline                         11715
title_imdb                       8299
video                            8299
vote_average                     8299
production_companies_name        8299
production_companies_country    10169
release_year                     8299
averageRating                    3462
numVotes                         3462
ordering                         3462
nconst                           3462
category                         3462
title       

Supprimer les colonnes 'homepage', 'tagline', 'production_companies_country' car nombre de NaN > 10000

In [29]:
# Supprimer les colonnes spécifiées
merged_df = merged_df.drop(columns=['homepage', 'tagline', 'production_companies_country'])

In [47]:
# Combiner les colonnes 'title_imdb' et 'title' en une seule colonne
#merged_df['titles'] = merged_df['title_imdb'].fillna('') + ' - ' + merged_df['title'].fillna('')

# Supprimer les colonnes 'title_imdb' et 'title'
#merged_df = merged_df.drop(columns=['title_imdb', 'title'])

# Afficher les premières lignes pour vérifier
display(merged_df.head())


Unnamed: 0,adult,backdrop_path,tconst,overview,popularity,poster_path,production_countries,status,production_companies_name,averageRating,ordering,category,primaryName,birthYear,genres,titles,languages_combined,votes,runtime,start_year
0,False,/hfeiSfWYujh6MKhtGTXyK3DD4nN.jpg,tt0035423,When her scientist ex-boyfriend discovers a po...,15.77,/mUvikzKJJSg9khrVdxK8kg3TMHA.jpg,['US'],Released,"['Konrad Pictures', 'Miramax']",,,,,,"['Romance', 'Fantasy', 'Comedy'],",Kate & Leopold -,"['en', 'fr', 'it'],",6.326 /,/ 118.0,2001.0 -
1,,,tt0096222,,,,,,,6.400675,1.0,actor,Jimmy Lin,1974.0,",Comedy,Drama,Romance",- To Miss with Love,", en",/ 54558.04810794954,90.0 /,- 1992.0
2,False,/6tRdTNdcmdBlTGyJ924ISyJDABA.jpg,tt0097106,The story of an introverted young girl just r...,9.163,/bP2qpuzQPBBBJa4kOhvLvOOiASq.jpg,['FR'],Released,"['Les Films du Losange', 'Investimage', 'Compa...",,,,,,"['Comedy', 'Drama', 'Romance'],",A Tale of Springtime -,"['fr'],",6.9 /,/ 103.0,1990.0 -
3,False,/uEPLeurEn2dp54f5owtMC3PeiPN.jpg,tt0097318,A new international terrorist group attack the...,0.742,/1Ela9pkCRCPugwasp8hdR1BpfXb.jpg,"['ES', 'FR']",Released,"['New Vision Films', 'Eridan Films']",,,,,,"['Crime', 'Action', 'Adventure'],",Eye of the Widow -,"['ar', 'en', 'fr', 'es'],",3.0 /,/ 95.0,1991.0 -
4,False,/inHgpiOAddL4Y53EvHf9B5lf2Sc.jpg,tt0097462,"Patagonia, Argentina, 1880s. During the Conque...",0.96,/ytYs98qqVmiY45cxkqNgJbj8uDL.jpg,"['AR', 'FR', 'CH']",Released,"['INCAA', 'JEMPSA', 'TSR', 'Les Films du phare...",,,,,,"['Drama', 'Adventure'],",Warriors and Prisoners -,"['fr', 'es'],",0.0 /,/ 99.0,1990.0 -


In [33]:
merged_df.columns

Index(['adult', 'backdrop_path', 'tconst', 'original_language',
       'original_title', 'overview', 'popularity', 'poster_path',
       'production_countries', 'release_date', 'runtime', 'spoken_languages',
       'status', 'video', 'vote_average', 'production_companies_name',
       'release_year', 'averageRating', 'numVotes', 'ordering', 'nconst',
       'category', 'language', 'types', 'startYear', 'runtimeMinutes',
       'primaryName', 'birthYear', 'genres', 'titles'],
      dtype='object')

In [34]:
# Trouver les NaN dans tout le DataFrame
nan_counts = merged_df.isna().sum()

# Afficher le nombre de NaN par colonne
print(nan_counts)


adult                        8299
backdrop_path                9386
tconst                          0
original_language            8299
original_title               8299
overview                     8718
popularity                   8299
poster_path                  8449
production_countries         8299
release_date                 8299
runtime                      8299
spoken_languages             8299
status                       8299
video                        8299
vote_average                 8299
production_companies_name    8299
release_year                 8299
averageRating                3462
numVotes                     3462
ordering                     3462
nconst                       3462
category                     3462
language                     3462
types                        3462
startYear                    3462
runtimeMinutes               3462
primaryName                  3462
birthYear                    3462
genres                          0
titles        

In [35]:
# Combiner les colonnes 'spoken_languages' et 'language' en une seule colonne
merged_df['languages_combined'] = merged_df['spoken_languages'].fillna('') + ', ' + merged_df['language'].fillna('')

# Supprimer les colonnes 'spoken_languages' et 'language'
merged_df = merged_df.drop(columns=['spoken_languages', 'language'])

# Afficher les premières lignes pour vérifier
display(merged_df.head())


Unnamed: 0,adult,backdrop_path,tconst,original_language,original_title,overview,popularity,poster_path,production_countries,release_date,...,nconst,category,types,startYear,runtimeMinutes,primaryName,birthYear,genres,titles,languages_combined
0,False,/hfeiSfWYujh6MKhtGTXyK3DD4nN.jpg,tt0035423,en,Kate & Leopold,When her scientist ex-boyfriend discovers a po...,15.77,/mUvikzKJJSg9khrVdxK8kg3TMHA.jpg,['US'],2001-12-25,...,,,,,,,,"['Romance', 'Fantasy', 'Comedy'],",Kate & Leopold -,"['en', 'fr', 'it'],"
1,,,tt0096222,,,,,,,NaT,...,nm0510903,actor,imdbDisplay,1992.0,90.0,Jimmy Lin,1974.0,",Comedy,Drama,Romance",- To Miss with Love,", en"
2,False,/6tRdTNdcmdBlTGyJ924ISyJDABA.jpg,tt0097106,fr,Conte de printemps,The story of an introverted young girl just r...,9.163,/bP2qpuzQPBBBJa4kOhvLvOOiASq.jpg,['FR'],1990-04-04,...,,,,,,,,"['Comedy', 'Drama', 'Romance'],",A Tale of Springtime -,"['fr'],"
3,False,/uEPLeurEn2dp54f5owtMC3PeiPN.jpg,tt0097318,en,Eye of the Widow,A new international terrorist group attack the...,0.742,/1Ela9pkCRCPugwasp8hdR1BpfXb.jpg,"['ES', 'FR']",1991-10-17,...,,,,,,,,"['Crime', 'Action', 'Adventure'],",Eye of the Widow -,"['ar', 'en', 'fr', 'es'],"
4,False,/inHgpiOAddL4Y53EvHf9B5lf2Sc.jpg,tt0097462,fr,Guerriers et Captives,"Patagonia, Argentina, 1880s. During the Conque...",0.96,/ytYs98qqVmiY45cxkqNgJbj8uDL.jpg,"['AR', 'FR', 'CH']",1990-09-05,...,,,,,,,,"['Drama', 'Adventure'],",Warriors and Prisoners -,"['fr', 'es'],"


In [48]:
# Trouver les NaN dans tout le DataFrame
nan_counts = merged_df.isna().sum()

# Afficher le nombre de NaN par colonne
print(nan_counts)

adult                        8299
backdrop_path                9386
tconst                          0
overview                     8718
popularity                   8299
poster_path                  8449
production_countries         8299
status                       8299
production_companies_name    8299
averageRating                3462
ordering                     3462
category                     3462
primaryName                  3462
birthYear                    3462
genres                          0
titles                          0
languages_combined              0
votes                           0
runtime                         0
start_year                      0
dtype: int64


In [37]:
# Combiner les colonnes 'vote_average' et 'numVotes' en une seule colonne
merged_df['votes'] = merged_df['vote_average'].fillna('').astype(str) + ' / ' + merged_df['numVotes'].fillna('').astype(str)

# Supprimer les colonnes 'vote_average' et 'numVotes'
merged_df = merged_df.drop(columns=['vote_average', 'numVotes'])

# Afficher les premières lignes pour vérifier
display(merged_df.head())


Unnamed: 0,adult,backdrop_path,tconst,original_language,original_title,overview,popularity,poster_path,production_countries,release_date,...,category,types,startYear,runtimeMinutes,primaryName,birthYear,genres,titles,languages_combined,votes
0,False,/hfeiSfWYujh6MKhtGTXyK3DD4nN.jpg,tt0035423,en,Kate & Leopold,When her scientist ex-boyfriend discovers a po...,15.77,/mUvikzKJJSg9khrVdxK8kg3TMHA.jpg,['US'],2001-12-25,...,,,,,,,"['Romance', 'Fantasy', 'Comedy'],",Kate & Leopold -,"['en', 'fr', 'it'],",6.326 /
1,,,tt0096222,,,,,,,NaT,...,actor,imdbDisplay,1992.0,90.0,Jimmy Lin,1974.0,",Comedy,Drama,Romance",- To Miss with Love,", en",/ 54558.04810794954
2,False,/6tRdTNdcmdBlTGyJ924ISyJDABA.jpg,tt0097106,fr,Conte de printemps,The story of an introverted young girl just r...,9.163,/bP2qpuzQPBBBJa4kOhvLvOOiASq.jpg,['FR'],1990-04-04,...,,,,,,,"['Comedy', 'Drama', 'Romance'],",A Tale of Springtime -,"['fr'],",6.9 /
3,False,/uEPLeurEn2dp54f5owtMC3PeiPN.jpg,tt0097318,en,Eye of the Widow,A new international terrorist group attack the...,0.742,/1Ela9pkCRCPugwasp8hdR1BpfXb.jpg,"['ES', 'FR']",1991-10-17,...,,,,,,,"['Crime', 'Action', 'Adventure'],",Eye of the Widow -,"['ar', 'en', 'fr', 'es'],",3.0 /
4,False,/inHgpiOAddL4Y53EvHf9B5lf2Sc.jpg,tt0097462,fr,Guerriers et Captives,"Patagonia, Argentina, 1880s. During the Conque...",0.96,/ytYs98qqVmiY45cxkqNgJbj8uDL.jpg,"['AR', 'FR', 'CH']",1990-09-05,...,,,,,,,"['Drama', 'Adventure'],",Warriors and Prisoners -,"['fr', 'es'],",0.0 /


In [38]:
# Combiner les colonnes 'runtimeMinutes' et 'runtime' en une seule colonne
merged_df['runtime_combined'] = merged_df['runtimeMinutes'].fillna('').astype(str) + ' / ' + merged_df['runtime'].fillna('').astype(str)

# Supprimer les colonnes 'runtimeMinutes' et 'runtime'
merged_df = merged_df.drop(columns=['runtimeMinutes', 'runtime'])

# Afficher les premières lignes pour vérifier
display(merged_df.head())


Unnamed: 0,adult,backdrop_path,tconst,original_language,original_title,overview,popularity,poster_path,production_countries,release_date,...,category,types,startYear,primaryName,birthYear,genres,titles,languages_combined,votes,runtime_combined
0,False,/hfeiSfWYujh6MKhtGTXyK3DD4nN.jpg,tt0035423,en,Kate & Leopold,When her scientist ex-boyfriend discovers a po...,15.77,/mUvikzKJJSg9khrVdxK8kg3TMHA.jpg,['US'],2001-12-25,...,,,,,,"['Romance', 'Fantasy', 'Comedy'],",Kate & Leopold -,"['en', 'fr', 'it'],",6.326 /,/ 118.0
1,,,tt0096222,,,,,,,NaT,...,actor,imdbDisplay,1992.0,Jimmy Lin,1974.0,",Comedy,Drama,Romance",- To Miss with Love,", en",/ 54558.04810794954,90.0 /
2,False,/6tRdTNdcmdBlTGyJ924ISyJDABA.jpg,tt0097106,fr,Conte de printemps,The story of an introverted young girl just r...,9.163,/bP2qpuzQPBBBJa4kOhvLvOOiASq.jpg,['FR'],1990-04-04,...,,,,,,"['Comedy', 'Drama', 'Romance'],",A Tale of Springtime -,"['fr'],",6.9 /,/ 103.0
3,False,/uEPLeurEn2dp54f5owtMC3PeiPN.jpg,tt0097318,en,Eye of the Widow,A new international terrorist group attack the...,0.742,/1Ela9pkCRCPugwasp8hdR1BpfXb.jpg,"['ES', 'FR']",1991-10-17,...,,,,,,"['Crime', 'Action', 'Adventure'],",Eye of the Widow -,"['ar', 'en', 'fr', 'es'],",3.0 /,/ 95.0
4,False,/inHgpiOAddL4Y53EvHf9B5lf2Sc.jpg,tt0097462,fr,Guerriers et Captives,"Patagonia, Argentina, 1880s. During the Conque...",0.96,/ytYs98qqVmiY45cxkqNgJbj8uDL.jpg,"['AR', 'FR', 'CH']",1990-09-05,...,,,,,,"['Drama', 'Adventure'],",Warriors and Prisoners -,"['fr', 'es'],",0.0 /,/ 99.0


In [39]:
# Renommer la colonne 'runtime_combined' en 'runtime'
merged_df = merged_df.rename(columns={'runtime_combined': 'runtime'})

In [40]:
# Combiner les colonnes 'release_year' et 'startYear' en une seule colonne
merged_df['start_year'] = merged_df['release_year'].fillna('').astype(str) + ' - ' + merged_df['startYear'].fillna('').astype(str)

# Supprimer les colonnes 'release_year' et 'startYear'
merged_df = merged_df.drop(columns=['release_year', 'startYear'])

# Afficher les premières lignes pour vérifier
display(merged_df.head(2))


Unnamed: 0,adult,backdrop_path,tconst,original_language,original_title,overview,popularity,poster_path,production_countries,release_date,...,category,types,primaryName,birthYear,genres,titles,languages_combined,votes,runtime,start_year
0,False,/hfeiSfWYujh6MKhtGTXyK3DD4nN.jpg,tt0035423,en,Kate & Leopold,When her scientist ex-boyfriend discovers a po...,15.77,/mUvikzKJJSg9khrVdxK8kg3TMHA.jpg,['US'],2001-12-25,...,,,,,"['Romance', 'Fantasy', 'Comedy'],",Kate & Leopold -,"['en', 'fr', 'it'],",6.326 /,/ 118.0,2001.0 -
1,,,tt0096222,,,,,,,NaT,...,actor,imdbDisplay,Jimmy Lin,1974.0,",Comedy,Drama,Romance",- To Miss with Love,", en",/ 54558.04810794954,90.0 /,- 1992.0


Supression des colonnes 'nconst', 'release_date'

In [41]:
merged_df = merged_df.drop(columns=['nconst', 'release_date'])

In [42]:
# Obtenir les valeurs uniques dans la colonne 'video'
video_unik = merged_df['video'].unique()
print(video_unik)

[False nan True]


In [46]:
#Supression de la colonnes 'video'
#merged_df = merged_df.drop(columns=['video'])

In [50]:
#Supression des colonnes spécifiées
#merged_df = merged_df.drop(columns=['original_language','original_title', 'types'])

In [245]:
# Renommer la colonne 'languages_combined' en 'spoken_languages'
merged_df = merged_df.rename(columns={'languages_combined': 'spoken_languages'})

In [51]:
# Trouver la médiane de la colonne 'averageRating'
median_value = merged_df['averageRating'].median()

# Remplacer les NaN par la médiane
merged_df['averageRating'] = merged_df['averageRating'].fillna(median_value)

# Vérifier que les NaN ont été remplacés
display(merged_df['averageRating'].head())


0    6.400675
1    6.400675
2    6.400675
3    6.400675
4    6.400675
Name: averageRating, dtype: float64

In [52]:
# Trouver la valeur la plus fréquente (mode) dans la colonne 'production_countries'
most_frequent_value = merged_df['production_countries'].mode()[0]

# Remplacer les NaN par la valeur la plus fréquente
merged_df['production_countries'] = merged_df['production_countries'].fillna(most_frequent_value)

# Vérifier que les NaN ont été remplacés
display(merged_df['production_countries'].head())


0                ['US']
1                ['FR']
2                ['FR']
3          ['ES', 'FR']
4    ['AR', 'FR', 'CH']
Name: production_countries, dtype: object

In [53]:
# Trouver la valeur la plus fréquente (mode) dans la colonne 'production_companies_name '
most_companies = merged_df['production_companies_name'].mode()[0]

# Remplacer les NaN par la valeur la plus fréquente
merged_df['production_companies_name'] = merged_df['production_companies_name'].fillna(most_companies)

# Vérifier que les NaN ont été remplacés
display(merged_df['production_companies_name'].head(2))


0    ['Konrad Pictures', 'Miramax']
1                                []
Name: production_companies_name, dtype: object

In [54]:
# Trouver la valeur la plus fréquente (mode) dans la colonne 'category'
most_frequent_category = merged_df['category'].mode()[0]

# Remplacer les NaN par la valeur la plus fréquente
merged_df['category'] = merged_df['category'].fillna(most_frequent_category)

# Vérifier le remplacement
display(merged_df[['category', 'primaryName']].head())


Unnamed: 0,category,primaryName
0,actor,
1,actor,Jimmy Lin
2,actor,
3,actor,
4,actor,


In [55]:
# Remplacer les NaN dans 'primaryName' par 'Unknown'
merged_df['primaryName'] = merged_df['primaryName'].fillna('Unknown')

# Vérifier le remplacement
display(merged_df[['category', 'primaryName']].head())


Unnamed: 0,category,primaryName
0,actor,Unknown
1,actor,Jimmy Lin
2,actor,Unknown
3,actor,Unknown
4,actor,Unknown


In [56]:
#Supression de la colonnes 'ordering'
merged_df = merged_df.drop(columns=['ordering'])

In [57]:
#Supression de la colonnes 'status'
merged_df = merged_df.drop(columns=['status'])

In [58]:
# Obtenir les valeurs uniques dans la colonne 'popularity'
popularity_unik = merged_df['popularity'].nunique()
print(popularity_unik)

2674


In [59]:
#Supression de la colonnes 'popularity'
merged_df = merged_df.drop(columns=['popularity'])

In [61]:
# Calculer la médiane de la colonne 'birthYear'
median_birth_year = merged_df['birthYear'].median()

# Remplacer les NaN par la médiane
merged_df['birthYear'] = merged_df['birthYear'].fillna(median_birth_year)

# Vérifier le remplacement
display(merged_df['birthYear'].head())


0    1981.0
1    1974.0
2    1981.0
3    1981.0
4    1981.0
Name: birthYear, dtype: float64

In [62]:
# Définir une phrase fun et originale
replacement_phrase = "Découvrez cette incroyable aventure et laissez-vous surprendre par chaque tournant inattendu !"

# Remplacer les NaN dans la colonne 'overview'
merged_df['overview'] = merged_df['overview'].fillna(replacement_phrase)

# Vérifier le remplacement
display(merged_df['overview'].head())


0    When her scientist ex-boyfriend discovers a po...
1    Découvrez cette incroyable aventure et laissez...
2    The story of an introverted young girl  just r...
3    A new international terrorist group attack the...
4    Patagonia, Argentina, 1880s. During the Conque...
Name: overview, dtype: object

Pour remplacer les valeurs manquantes dans la colonne 'adult', qui est un booléen indiquant si le film est pour adultes ou non, il est courant de choisir une valeur par défaut. Voici quelques approches possibles :

Remplacer par False (Non adulte) : C'est souvent une approche sûre car la majorité des films ne sont pas pour adultes.
Remplacer par True (Adulte) : Si vous pensez que la majorité des valeurs manquantes sont des films pour adultes (moins courant).
Analyser les données : Si possible, analyser les autres colonnes pour faire une supposition plus éclairée sur les valeurs manquantes.

In [63]:
# Remplacer les NaN dans la colonne 'adult' par False
merged_df['adult'] = merged_df['adult'].fillna(False)

  merged_df['adult'] = merged_df['adult'].fillna(False)


In [64]:
# Trouver les NaN dans tout le DataFrame
nan_counts = merged_df.isna().sum()

# Afficher le nombre de NaN par colonne
print(nan_counts)

adult                           0
backdrop_path                9386
tconst                          0
overview                        0
poster_path                  8449
production_countries            0
production_companies_name       0
averageRating                   0
category                        0
primaryName                     0
birthYear                       0
genres                          0
titles                          0
languages_combined              0
votes                           0
runtime                         0
start_year                      0
dtype: int64


In [65]:
# Supprimer les lignes avec des valeurs NaN dans la colonne 'backdrop_path'
df_clean = merged_df.dropna(subset=['poster_path'])


In [66]:
# Trouver les NaN dans tout le DataFrame
nan_counts = df_clean.isna().sum()

# Afficher le nombre de NaN par colonne
print(nan_counts)

adult                          0
backdrop_path                945
tconst                         0
overview                       0
poster_path                    0
production_countries           0
production_companies_name      0
averageRating                  0
category                       0
primaryName                    0
birthYear                      0
genres                         0
titles                         0
languages_combined             0
votes                          0
runtime                        0
start_year                     0
dtype: int64


In [67]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3930 entries, 0 to 12377
Data columns (total 17 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   adult                      3930 non-null   bool   
 1   backdrop_path              2985 non-null   object 
 2   tconst                     3930 non-null   object 
 3   overview                   3930 non-null   object 
 4   poster_path                3930 non-null   object 
 5   production_countries       3930 non-null   object 
 6   production_companies_name  3930 non-null   object 
 7   averageRating              3930 non-null   float64
 8   category                   3930 non-null   object 
 9   primaryName                3930 non-null   object 
 10  birthYear                  3930 non-null   float64
 11  genres                     3930 non-null   object 
 12  titles                     3930 non-null   object 
 13  languages_combined         3930 non-null   object 
 

In [68]:
# Supprimer la colonne 'backdrop_path'
final_df = df_clean.drop(columns=['backdrop_path'])

In [69]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3930 entries, 0 to 12377
Data columns (total 16 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   adult                      3930 non-null   bool   
 1   tconst                     3930 non-null   object 
 2   overview                   3930 non-null   object 
 3   poster_path                3930 non-null   object 
 4   production_countries       3930 non-null   object 
 5   production_companies_name  3930 non-null   object 
 6   averageRating              3930 non-null   float64
 7   category                   3930 non-null   object 
 8   primaryName                3930 non-null   object 
 9   birthYear                  3930 non-null   float64
 10  genres                     3930 non-null   object 
 11  titles                     3930 non-null   object 
 12  languages_combined         3930 non-null   object 
 13  votes                      3930 non-null   object 
 

In [70]:
# Sauvegarder le DataFrame nettoyé 
final_df.to_csv('C:/Users/Win10/Desktop/projet 2 bdd/table_finale_final_df.csv', index=False)