In [1]:
from datetime import datetime
from datetime import timedelta
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [2]:
url = "https://datasets.imdbws.com/name.basics.tsv.gz"
df1 = pd.read_csv(url, sep = '\t')

In [3]:
df1.head()

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"actor,miscellaneous,producer","tt0072308,tt0050419,tt0053137,tt0043044"
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack,archive_footage","tt0037382,tt0075213,tt0117057,tt0038355"
2,nm0000003,Brigitte Bardot,1934,\N,"actress,music_department,producer","tt0057345,tt0049189,tt0056404,tt0054452"
3,nm0000004,John Belushi,1949,1982,"actor,writer,music_department","tt0072562,tt0077975,tt0080455,tt0078723"
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0050986,tt0069467,tt0050976,tt0083922"


In [4]:
df1.shape

(14013602, 6)

In [5]:
#afficher le nombre de valeurs manquantes par colonne
df1.isna().sum()


Unnamed: 0,0
nconst,0
primaryName,9
birthYear,0
deathYear,0
primaryProfession,0
knownForTitles,0


In [6]:
#afficher le nombre de valeurs \N  par colonne
(df1 == '\\N').sum()


Unnamed: 0,0
nconst,0
primaryName,50
birthYear,13380365
deathYear,13776109
primaryProfession,2720840
knownForTitles,1586468


In [7]:
df1.replace('\\N', np.nan, inplace=True)


In [8]:
#remplacer les valeurs \N par des NaN dans tout le DataFrame
df1.isna().sum()


Unnamed: 0,0
nconst,0
primaryName,59
birthYear,13380365
deathYear,13776109
primaryProfession,2720840
knownForTitles,1586468


In [9]:
# nous avons 59 primaryname qui manquent, on va les supprimer, sans nom on ne peut rien faire
df1 = df1[df1['primaryName'].notna()]
df1.shape


(14013543, 6)

In [10]:
#convertir les colonnes dates qui sont en objet au type float
df1['birthYear'] = pd.to_numeric(df1['birthYear'], errors='coerce')
df1['deathYear'] = pd.to_numeric(df1['deathYear'], errors='coerce')


In [11]:
df1[['birthYear', 'deathYear']].dtypes

Unnamed: 0,0
birthYear,float64
deathYear,float64


In [12]:
# Vérifier les doublons dans la colonne 'primaryName'
doublons = df1[df1.duplicated(subset=['primaryName'], keep=False)]




In [13]:
# Supprimer les doublons en gardant la première occurrence
df1 = df1.drop_duplicates(subset=['primaryName'], keep='first')


In [14]:
# Créer un nouveau DataFrame sans les lignes où 'primaryProfession' est NaN
df1_cleaned = df1.dropna(subset=['primaryProfession'])



In [15]:
df1_cleaned = df1_cleaned.dropna(subset=['primaryName'])


In [16]:
# Séparer les professions dans chaque ligne et extraire les valeurs uniques
professions = df1_cleaned['primaryProfession'].dropna().str.split(',').explode().unique()
print(professions)


['actor' 'miscellaneous' 'producer' 'actress' 'soundtrack'
 'archive_footage' 'music_department' 'writer' 'director' 'stunts'
 'make_up_department' 'composer' 'assistant_director' 'camera_department'
 'music_artist' 'art_department' 'editor' 'cinematographer'
 'casting_director' 'executive' 'visual_effects' 'costume_designer'
 'script_department' 'editorial_department' 'costume_department'
 'animation_department' 'talent_agent' 'archive_sound'
 'production_designer' 'special_effects' 'production_manager'
 'art_director' 'sound_department' 'casting_department'
 'location_management' 'set_decorator' 'transportation_department'
 'choreographer' 'legal' 'manager' 'podcaster' 'publicist' 'assistant'
 'production_department' 'accountant' 'electrical_department']


In [17]:
# Tri des professions
sorted_professions = sorted(professions)
print(sorted_professions)


['accountant', 'actor', 'actress', 'animation_department', 'archive_footage', 'archive_sound', 'art_department', 'art_director', 'assistant', 'assistant_director', 'camera_department', 'casting_department', 'casting_director', 'choreographer', 'cinematographer', 'composer', 'costume_department', 'costume_designer', 'director', 'editor', 'editorial_department', 'electrical_department', 'executive', 'legal', 'location_management', 'make_up_department', 'manager', 'miscellaneous', 'music_artist', 'music_department', 'podcaster', 'producer', 'production_department', 'production_designer', 'production_manager', 'publicist', 'script_department', 'set_decorator', 'sound_department', 'soundtrack', 'special_effects', 'stunts', 'talent_agent', 'transportation_department', 'visual_effects', 'writer']


In [18]:
profession_counts = df1_cleaned['primaryProfession'].dropna().str.split(',').explode().value_counts()
print(profession_counts)


primaryProfession
actor                        2498014
actress                      1654457
miscellaneous                1110433
producer                      945714
writer                        739241
director                      594697
camera_department             592474
art_department                384418
cinematographer               299080
sound_department              292132
composer                      276814
editor                        276677
music_department              229983
assistant_director            212115
visual_effects                191017
make_up_department            190233
animation_department          181255
production_manager            175770
archive_footage               161448
editorial_department          146923
costume_department            135818
soundtrack                    126723
transportation_department      81051
art_director                   77237
script_department              73371
stunts                         72284
location_management 

In [19]:
# Filtrer pour garder uniquement les "actor", "actress" et "director"
df1_cleaned = df1_cleaned[df1_cleaned['primaryProfession'].str.contains('actor|actress|director', na=False)]



In [20]:
print(df1_cleaned.head())
print(df1_cleaned.shape)  # Nombre de lignes et colonnes après le filtrage


      nconst      primaryName  birthYear  deathYear  \
0  nm0000001     Fred Astaire     1899.0     1987.0   
1  nm0000002    Lauren Bacall     1924.0     2014.0   
2  nm0000003  Brigitte Bardot     1934.0        NaN   
3  nm0000004     John Belushi     1949.0     1982.0   
4  nm0000005   Ingmar Bergman     1918.0     2007.0   

                    primaryProfession                           knownForTitles  
0        actor,miscellaneous,producer  tt0072308,tt0050419,tt0053137,tt0043044  
1  actress,soundtrack,archive_footage  tt0037382,tt0075213,tt0117057,tt0038355  
2   actress,music_department,producer  tt0057345,tt0049189,tt0056404,tt0054452  
3       actor,writer,music_department  tt0072562,tt0077975,tt0080455,tt0078723  
4               writer,director,actor  tt0050986,tt0069467,tt0050976,tt0083922  
(4832802, 6)


In [21]:
df1_cleaned.reset_index(drop=True, inplace=True)


In [22]:
# Filtrer les lignes selon les critères
df1_filtered = df1_cleaned[
    (df1_cleaned['birthYear'] > 1970) &
    ((df1_cleaned['deathYear'] < 1990) | (df1_cleaned['deathYear'].isna())) &
    (df1_cleaned['birthYear'].notna())
]

# Afficher les premières lignes pour vérification
print(df1_filtered.head())
print(f"Nombre de lignes après filtrage : {df1_filtered.shape[0]}")


        nconst        primaryName  birthYear  deathYear  \
99   nm0000103       Fairuza Balk     1974.0        NaN   
102  nm0000106     Drew Barrymore     1975.0        NaN   
113  nm0000117      Neve Campbell     1973.0        NaN   
127  nm0000132       Claire Danes     1979.0        NaN   
133  nm0000138  Leonardo DiCaprio     1974.0        NaN   

                      primaryProfession  \
99   actress,soundtrack,archive_footage   
102           producer,actress,director   
113             actress,producer,writer   
127         actress,producer,soundtrack   
133               producer,actor,writer   

                              knownForTitles  
99   tt0115963,tt0181875,tt0089908,tt0120586  
102  tt0160127,tt0151738,tt0343660,tt0120631  
113  tt0117571,tt0120082,tt0134084,tt0120890  
127  tt0108872,tt1796960,tt0117509,tt0274558  
133  tt1375666,tt0993846,tt0407887,tt0120338  
Nombre de lignes après filtrage : 144702


In [23]:
df1_filtered.head(30)

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
99,nm0000103,Fairuza Balk,1974.0,,"actress,soundtrack,archive_footage","tt0115963,tt0181875,tt0089908,tt0120586"
102,nm0000106,Drew Barrymore,1975.0,,"producer,actress,director","tt0160127,tt0151738,tt0343660,tt0120631"
113,nm0000117,Neve Campbell,1973.0,,"actress,producer,writer","tt0117571,tt0120082,tt0134084,tt0120890"
127,nm0000132,Claire Danes,1979.0,,"actress,producer,soundtrack","tt0108872,tt1796960,tt0117509,tt0274558"
133,nm0000138,Leonardo DiCaprio,1974.0,,"producer,actor,writer","tt1375666,tt0993846,tt0407887,tt0120338"
134,nm0000139,Cameron Diaz,1972.0,,"actress,producer,soundtrack","tt0259711,tt0129387,tt0160127,tt0119738"
165,nm0000170,Milla Jovovich,1975.0,,"actress,producer,director","tt0119116,tt1220634,tt0120804,tt1220198"
174,nm0000179,Jude Law,1972.0,,"actor,producer,director","tt0134119,tt0376541,tt2278388,tt0346156"
184,nm0000189,Jenny McCarthy-Wahlberg,1972.0,,"actress,producer,writer","tt0327643,tt0455967,tt0134084,tt0131857"
186,nm0000191,Ewan McGregor,1971.0,,"actor,producer,writer","tt0203009,tt0117951,tt1322269,tt2763304"


In [25]:
# Convertir 'birthYear' en datetime, en prenant l'année comme une date de début de l'année
df1['birthYear'] = pd.to_datetime(df1['birthYear'], format='%Y', errors='coerce')

# Convertir 'deathYear' en datetime, en prenant l'année comme une date de fin de l'année (si elle existe)
df1['deathYear'] = pd.to_datetime(df1['deathYear'], format='%Y', errors='coerce')

# Vérifier le résultat
print(df1.info())


<class 'pandas.core.frame.DataFrame'>
Index: 144702 entries, 99 to 4832791
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   nconst             144702 non-null  object        
 1   primaryName        144702 non-null  object        
 2   birthYear          144702 non-null  datetime64[ns]
 3   deathYear          9 non-null       datetime64[ns]
 4   primaryProfession  144702 non-null  object        
 5   knownForTitles     143491 non-null  object        
dtypes: datetime64[ns](2), object(4)
memory usage: 7.7+ MB
None


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['birthYear'] = pd.to_datetime(df1['birthYear'], format='%Y', errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['deathYear'] = pd.to_datetime(df1['deathYear'], format='%Y', errors='coerce')


In [26]:
df1 = df1_filtered
print(df1.info())

<class 'pandas.core.frame.DataFrame'>
Index: 144702 entries, 99 to 4832791
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   nconst             144702 non-null  object        
 1   primaryName        144702 non-null  object        
 2   birthYear          144702 non-null  datetime64[ns]
 3   deathYear          9 non-null       datetime64[ns]
 4   primaryProfession  144702 non-null  object        
 5   knownForTitles     143491 non-null  object        
dtypes: datetime64[ns](2), object(4)
memory usage: 7.7+ MB
None
