In [1]:
from datetime import datetime
from datetime import timedelta
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [3]:
url = "https://datasets.imdbws.com/name.basics.tsv.gz"
df1 = pd.read_csv(url, sep = '\t')

In [4]:
df1.head()

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"actor,miscellaneous,producer","tt0050419,tt0072308,tt0053137,tt0043044"
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack,archive_footage","tt0037382,tt0075213,tt0117057,tt0038355"
2,nm0000003,Brigitte Bardot,1934,\N,"actress,music_department,producer","tt0057345,tt0049189,tt0056404,tt0054452"
3,nm0000004,John Belushi,1949,1982,"actor,writer,music_department","tt0072562,tt0077975,tt0080455,tt0078723"
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0050986,tt0069467,tt0050976,tt0083922"


In [5]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13995894 entries, 0 to 13995893
Data columns (total 6 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   nconst             object
 1   primaryName        object
 2   birthYear          object
 3   deathYear          object
 4   primaryProfession  object
 5   knownForTitles     object
dtypes: object(6)
memory usage: 640.7+ MB


In [6]:
#afficher le nombre de valeurs manquantes par colonne
df1.isna().sum()


Unnamed: 0,0
nconst,0
primaryName,9
birthYear,0
deathYear,0
primaryProfession,0
knownForTitles,0


In [7]:
#afficher le nombre de valeurs \N  par colonne
(df1 == '\\N').sum()


Unnamed: 0,0
nconst,0
primaryName,50
birthYear,13363219
deathYear,13758680
primaryProfession,2716840
knownForTitles,1582896


In [8]:
df1.replace('\\N', np.nan, inplace=True)


In [9]:
#remplacer les valeurs \N par des NaN dans tout le DataFrame
df1.isna().sum()


Unnamed: 0,0
nconst,0
primaryName,59
birthYear,13363219
deathYear,13758680
primaryProfession,2716840
knownForTitles,1582896


In [10]:
# nous avons 59 primaryname qui manquent, on va les supprimer, sans nom on ne peut rien faire
df1 = df1[df1['primaryName'].notna()]
df1.shape


(13995835, 6)

In [11]:
#convertir les colonnes dates qui sont en objet au type float
df1['birthYear'] = pd.to_numeric(df1['birthYear'], errors='coerce')
df1['deathYear'] = pd.to_numeric(df1['deathYear'], errors='coerce')


In [12]:
df1[['birthYear', 'deathYear']].dtypes

Unnamed: 0,0
birthYear,float64
deathYear,float64


In [13]:
# Vérifier les doublons dans la colonne 'primaryName'
doublons = df1[df1.duplicated(subset=['primaryName'], keep=False)]

# Afficher les doublons
print(doublons)




             nconst        primaryName  birthYear  deathYear  \
0         nm0000001       Fred Astaire     1899.0     1987.0   
5         nm0000006     Ingrid Bergman     1915.0     1982.0   
7         nm0000008      Marlon Brando     1924.0     2004.0   
8         nm0000009     Richard Burton     1925.0     1984.0   
10        nm0000011        Gary Cooper     1901.0     1961.0   
...             ...                ...        ...        ...   
13995866  nm9993690       David Jewell        NaN        NaN   
13995868  nm9993692      Shiela Martin        NaN        NaN   
13995887  nm9993712    Corny O'Connell        NaN        NaN   
13995889  nm9993714  Romeo del Rosario        NaN        NaN   
13995893  nm9993719         Andre Hill        NaN        NaN   

                            primaryProfession  \
0                actor,miscellaneous,producer   
5                 actress,producer,soundtrack   
7                       actor,director,writer   
8                     actor,produce

In [14]:
# Supprimer les doublons en gardant la première occurrence
df1 = df1.drop_duplicates(subset=['primaryName'], keep='first')

# Vérifier que les doublons ont été supprimés
print(df1.duplicated(subset=['primaryName']).sum())

0


In [38]:
# Créer un nouveau DataFrame sans les lignes où 'primaryProfession' est NaN
df1_cleaned = df1.dropna(subset=['primaryProfession'])

# Vérifier la taille du nouveau DataFrame
print(df1_cleaned.shape)


(4147054, 6)


In [39]:
df1_cleaned = df1_cleaned.dropna(subset=['primaryName'])
print(df1_cleaned.shape)

(4147054, 6)


In [40]:
# Filtrer pour garder uniquement les "actor" et "actress"
df1_cleaned = df1_cleaned[df1_cleaned['primaryProfession'].str.contains('actor|actress', na=False)]
print(df1_cleaned.shape)


(4147054, 6)


In [41]:
df1.reset_index(drop=True, inplace=True)


In [42]:
df1_cleaned.head()

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899.0,1987.0,"actor,miscellaneous,producer","tt0050419,tt0072308,tt0053137,tt0043044"
1,nm0000002,Lauren Bacall,1924.0,2014.0,"actress,soundtrack,archive_footage","tt0037382,tt0075213,tt0117057,tt0038355"
2,nm0000003,Brigitte Bardot,1934.0,,"actress,music_department,producer","tt0057345,tt0049189,tt0056404,tt0054452"
3,nm0000004,John Belushi,1949.0,1982.0,"actor,writer,music_department","tt0072562,tt0077975,tt0080455,tt0078723"
4,nm0000005,Ingmar Bergman,1918.0,2007.0,"writer,director,actor","tt0050986,tt0069467,tt0050976,tt0083922"


In [43]:
df1_filtered = df1_cleaned[(df1_cleaned['birthYear'] >= 2000) & ((df1_cleaned['deathYear'] >= 2000) | (df1_cleaned['deathYear'].isna()))]
print(df1_filtered.shape)

(9334, 6)


In [44]:
df1 = df1_filtered
print(df1.info())

<class 'pandas.core.frame.DataFrame'>
Index: 9334 entries, 480230 to 13995731
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   nconst             9334 non-null   object  
 1   primaryName        9334 non-null   object  
 2   birthYear          9334 non-null   float64 
 3   deathYear          175 non-null    float64 
 4   primaryProfession  9334 non-null   category
 5   knownForTitles     8910 non-null   category
dtypes: category(2), float64(2), object(2)
memory usage: 82.5+ MB
None
