In [30]:
from datetime import datetime
from datetime import timedelta
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [31]:
url = "https://datasets.imdbws.com/title.basics.tsv.gz"
df3 = pd.read_csv(url, sep = '\t', low_memory=False)

In [None]:
display(df3)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Poor Pierrot,Pauvre Pierrot,0,1892,\N,5,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
...,...,...,...,...,...,...,...,...,...
11280984,tt9916848,tvEpisode,Episode #3.17,Episode #3.17,0,2009,\N,\N,"Action,Drama,Family"
11280985,tt9916850,tvEpisode,Episode #3.19,Episode #3.19,0,2010,\N,\N,"Action,Drama,Family"
11280986,tt9916852,tvEpisode,Episode #3.20,Episode #3.20,0,2010,\N,\N,"Action,Drama,Family"
11280987,tt9916856,short,The Wind,The Wind,0,2015,\N,27,Short


In [32]:
# Vérifier les valeurs manquantes dans chaque colonne
print(df3.isnull().sum())

tconst              0
titleType           0
primaryTitle       19
originalTitle      19
isAdult             0
startYear           0
endYear             0
runtimeMinutes      0
genres            633
dtype: int64


In [33]:
# Filtrer les films uniquement (titleType == 'movie')
df_films = df3[df3['titleType'] == 'movie']

In [98]:
# Vérifier le nombre de NaN avant le remplacement
nan_before_runtime = df_films['runtimeMinutes'].isna().sum()
nan_before_startYear = df_films['startYear'].isna().sum()
nan_before_endYear = df_films['endYear'].isna().sum()

print(f"NaN avant remplacement (runtimeMinutes) : {nan_before_runtime}")
print(f"NaN avant remplacement (startYear) : {nan_before_startYear}")
print(f"NaN avant remplacement (endYear) : {nan_before_endYear}")

# Remplacer les valeurs '\N' par NaN
df_films.loc[:, 'runtimeMinutes'] = df_films['runtimeMinutes'].replace(r'\\N', np.nan, regex=True)
df_films.loc[:, 'startYear'] = df_films['startYear'].replace(r'\\N', np.nan, regex=True)
df_films.loc[:, 'endYear'] = df_films['endYear'].replace(r'\\N', np.nan, regex=True)

# Vérifier le nombre de NaN après le remplacement
nan_after_runtime = df_films['runtimeMinutes'].isna().sum()
nan_after_startYear = df_films['startYear'].isna().sum()
nan_after_endYear = df_films['endYear'].isna().sum()

print(f"NaN après remplacement (runtimeMinutes) : {nan_after_runtime}")
print(f"NaN après remplacement (startYear) : {nan_after_startYear}")
print(f"NaN après remplacement (endYear) : {nan_after_endYear}")


NaN avant remplacement (runtimeMinutes) : 259209
NaN avant remplacement (startYear) : 101608
NaN avant remplacement (endYear) : 699357
NaN après remplacement (runtimeMinutes) : 259209
NaN après remplacement (startYear) : 101608
NaN après remplacement (endYear) : 699357


  df_films.loc[:, 'endYear'] = df_films['endYear'].replace(r'\\N', np.nan, regex=True)


In [99]:
# Convertir les colonnes startYear, endYear et runtimeMinutes en numériques avec .loc
df_films.loc[:, 'startYear'] = pd.to_numeric(df_films['startYear'], errors='coerce')
df_films.loc[:, 'endYear'] = pd.to_numeric(df_films['endYear'], errors='coerce')
df_films.loc[:, 'runtimeMinutes'] = pd.to_numeric(df_films['runtimeMinutes'], errors='coerce')


In [100]:
#Garder uniquement les films dont l'année de début est supérieure ou égale à 1990.
#Garder les films qui ont soit une durée supérieure à 60 minutes, soit une valeur manquante (NaN) dans cette colonne.
df3_filtered = df_films[(df_films['startYear'] >= 1990) & ((df_films['runtimeMinutes'] > 60) | (df_films['runtimeMinutes'].isna()))]


In [101]:
print(f"Nombre de films avant filtrage : {df_films.shape[0]}")
print(f"Nombre de films après filtrage : {df3_filtered.shape[0]}")


Nombre de films avant filtrage : 699357
Nombre de films après filtrage : 340488


In [103]:
# Supprimer la colonne isAdult
df3_filtered.drop(columns=['isAdult'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3_filtered.drop(columns=['isAdult'], inplace=True)


In [104]:
print(df3_filtered.columns)


Index(['tconst', 'titleType', 'primaryTitle', 'originalTitle', 'startYear',
       'runtimeMinutes', 'genres'],
      dtype='object')


In [105]:
# Vérifier les valeurs manquantes dans chaque colonne
print(df3_filtered.isnull().sum())

tconst                0
titleType             0
primaryTitle          2
originalTitle         2
startYear             0
runtimeMinutes    91140
genres                0
dtype: int64


In [106]:
# Supprimer les lignes où primaryTitle est nul
df3_filtered = df3_filtered.dropna(subset=['primaryTitle'])

In [107]:
df3_filtered.shape

(340486, 7)

In [108]:
display(df3_filtered)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,startYear,runtimeMinutes,genres
11632,tt0011801,movie,Tötet nicht mehr,Tötet nicht mehr,2019.0,,"Action,Crime"
15480,tt0015724,movie,Dama de noche,Dama de noche,1993.0,102.0,"Drama,Mystery,Romance"
34795,tt0035423,movie,Kate & Leopold,Kate & Leopold,2001.0,118.0,"Comedy,Fantasy,Romance"
37410,tt0038086,movie,Shiva und die Galgenblume,Shiva und die Galgenblume,1993.0,,Thriller
57967,tt0059097,movie,Just Don't Think I'll Cry,"Denk bloß nicht, ich heule",1990.0,91.0,Drama
...,...,...,...,...,...,...,...
11280787,tt9916428,movie,The Secret of China,Hong xing zhao yao Zhong guo,2019.0,,"Adventure,History,War"
11280839,tt9916538,movie,Kuambil Lagi Hatiku,Kuambil Lagi Hatiku,2019.0,123.0,Drama
11280907,tt9916680,movie,De la ilusión al desconcierto: cine colombiano...,De la ilusión al desconcierto: cine colombiano...,2007.0,100.0,Documentary
11280919,tt9916706,movie,Dankyavar Danka,Dankyavar Danka,2013.0,,Comedy


In [114]:
# Convertir startYear en type datetime (année uniquement)
df3_filtered['startYear'] = pd.to_datetime(df3_filtered['startYear'], format='%Y').dt.year

# Vérifier le résultat
print(df3_filtered['startYear'].head())


11632    2019
15480    1993
34795    2001
37410    1993
57967    1990
Name: startYear, dtype: int32


In [116]:
# Convertir 'runtimeMinutes' en entier (en retirant les décimales)
df3_filtered['runtimeMinutes'] = df3_filtered['runtimeMinutes'].astype('Int64')



In [117]:
display(df3_filtered)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,startYear,runtimeMinutes,genres
11632,tt0011801,movie,Tötet nicht mehr,Tötet nicht mehr,2019,,"Action,Crime"
15480,tt0015724,movie,Dama de noche,Dama de noche,1993,102,"Drama,Mystery,Romance"
34795,tt0035423,movie,Kate & Leopold,Kate & Leopold,2001,118,"Comedy,Fantasy,Romance"
37410,tt0038086,movie,Shiva und die Galgenblume,Shiva und die Galgenblume,1993,,Thriller
57967,tt0059097,movie,Just Don't Think I'll Cry,"Denk bloß nicht, ich heule",1990,91,Drama
...,...,...,...,...,...,...,...
11280787,tt9916428,movie,The Secret of China,Hong xing zhao yao Zhong guo,2019,,"Adventure,History,War"
11280839,tt9916538,movie,Kuambil Lagi Hatiku,Kuambil Lagi Hatiku,2019,123,Drama
11280907,tt9916680,movie,De la ilusión al desconcierto: cine colombiano...,De la ilusión al desconcierto: cine colombiano...,2007,100,Documentary
11280919,tt9916706,movie,Dankyavar Danka,Dankyavar Danka,2013,,Comedy


In [118]:
# Compter les NaN dans toutes les colonnes du DataFrame
nan_counts_all_columns = df3_filtered.isna().sum()

# Afficher le nombre de NaN pour chaque colonne
print(nan_counts_all_columns)


tconst                0
titleType             0
primaryTitle          0
originalTitle         0
startYear             0
runtimeMinutes    91139
genres                0
dtype: int64
