In [1]:
from datetime import datetime
from datetime import timedelta
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [2]:
url = "https://datasets.imdbws.com/title.principals.tsv.gz"
df6 = pd.read_csv(url, sep = '\t')

In [3]:
print(df6.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89385811 entries, 0 to 89385810
Data columns (total 6 columns):
 #   Column      Dtype 
---  ------      ----- 
 0   tconst      object
 1   ordering    int64 
 2   nconst      object
 3   category    object
 4   job         object
 5   characters  object
dtypes: int64(1), object(5)
memory usage: 4.0+ GB
None


In [5]:
display(df6)

Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0000001,1,nm1588970,self,\N,"[""Self""]"
1,tt0000001,2,nm0005690,director,\N,\N
2,tt0000001,3,nm0005690,producer,producer,\N
3,tt0000001,4,nm0374658,cinematographer,director of photography,\N
4,tt0000002,1,nm0721526,director,\N,\N
...,...,...,...,...,...,...
89385806,tt9916880,17,nm0996406,director,principal director,\N
89385807,tt9916880,18,nm1482639,writer,\N,\N
89385808,tt9916880,19,nm2586970,writer,books,\N
89385809,tt9916880,20,nm1594058,producer,producer,\N


In [8]:
# Vérification des valeurs nulles
missing_values = df6.isnull().sum()
print("Valeurs manquantes par colonne :\n", missing_values)

# Vérification des valeurs "\N" dans les colonnes 'job' et 'characters'
job_nulls = (df6['job'] == '\\N').sum()
characters_nulls = (df6['characters'] == '\\N').sum()
print(f"Valeurs '\\N' dans 'job' : {job_nulls}")
print(f"Valeurs '\\N' dans 'characters' : {characters_nulls}")

# Vérification des doublons basés sur les colonnes 'tconst' et 'nconst'
duplicates = df6.duplicated(subset=['tconst', 'nconst'])
print("Nombre de doublons :", duplicates.sum())

# Affichage des doublons s'il y en a
if duplicates.sum() > 0:
    print("Doublons :\n", df6[duplicates])

# Vérification des types de données
print("Types de données :\n", df6.dtypes)




Valeurs manquantes par colonne :
 tconst        0
ordering      0
nconst        0
category      0
job           0
characters    0
dtype: int64
Valeurs '\N' dans 'job' : 72655576
Valeurs '\N' dans 'characters' : 46086359
Nombre de doublons : 8343679
Doublons :
              tconst  ordering     nconst         category       job  \
2         tt0000001         3  nm0005690         producer  producer   
8         tt0000003         3  nm0721526         producer  producer   
20        tt0000007         5  nm0005690         producer  producer   
22        tt0000007         7  nm0374658  cinematographer        \N   
25        tt0000008         3  nm0005690         producer  producer   
...             ...       ...        ...              ...       ...   
89385800  tt9916880        11  nm1052583          actress        \N   
89385802  tt9916880        13  nm2676923          actress        \N   
89385803  tt9916880        14  nm2676923          actress        \N   
89385805  tt9916880        16

In [12]:
df6.dtypes

tconst        object
ordering       int64
nconst        object
category      object
job           object
characters    object
dtype: object

In [19]:
df6.head(15)

Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0000001,1,nm1588970,self,\N,"[""Self""]"
1,tt0000001,2,nm0005690,director,\N,\N
2,tt0000001,3,nm0005690,producer,producer,\N
3,tt0000001,4,nm0374658,cinematographer,director of photography,\N
4,tt0000002,1,nm0721526,director,\N,\N
5,tt0000002,2,nm1335271,composer,\N,\N
6,tt0000003,1,nm0721526,director,\N,\N
7,tt0000003,2,nm1770680,producer,producer,\N
8,tt0000003,3,nm0721526,producer,producer,\N
9,tt0000003,4,nm1335271,composer,\N,\N


Les colonnes "job" et "characters" présentent un nombre significatif de valeurs manquantes représentées par "\N":
•	Colonne "job" : 72,655,576 valeurs manquantes.
•	Colonne "characters" : 46,086,359 valeurs manquantes.
Analyse des doublons
Il y a 8,343,679 doublons dans les données.



In [20]:
# Supprimer les colonnes 'job' et 'characters'
df6_cleaned = df6.drop(columns=['job', 'characters'])

# Sauvegarder le DataFrame nettoyé
#df6_cleaned.to_csv('/mnt/data/cleaned_data_file.csv', index=False)


In [21]:
df6_cleaned.head()


Unnamed: 0,tconst,ordering,nconst,category
0,tt0000001,1,nm1588970,self
1,tt0000001,2,nm0005690,director
2,tt0000001,3,nm0005690,producer
3,tt0000001,4,nm0374658,cinematographer
4,tt0000002,1,nm0721526,director


In [24]:
# Vérification des doublons basés sur les colonnes 'tconst' et 'nconst'
duplicates = df6_cleaned.duplicated(subset=['tconst'])
print("Nombre de doublons :", duplicates.sum())

Nombre de doublons : 79173786


In [27]:
duplicates

0           False
1            True
2            True
3            True
4           False
            ...  
89385806     True
89385807     True
89385808     True
89385809     True
89385810     True
Length: 89385811, dtype: bool

In [None]:
# Afficher le nombre de doublons
print("Nombre de doublons avant suppression :", duplicates.shape[0])

# Supprimer les doublons
df6_cleaned_1 = df6_cleaned.drop_duplicates(subset=['tconst'], keep='first')

# Vérification des doublons après suppression
duplicates_after = df6_cleaned_1[df6_cleaned_1.duplicated(subset=['tconst'], keep=False)]
print("Nombre de doublons après suppression :", duplicates_after.shape[0])




Nombre de doublons avant suppression : 89385811
Nombre de doublons après suppression : 0


In [32]:
df6_cleaned_1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10212025 entries, 0 to 89385790
Data columns (total 4 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   tconst    object
 1   ordering  int64 
 2   nconst    object
 3   category  object
dtypes: int64(1), object(3)
memory usage: 389.6+ MB


In [34]:
# Sauvegarder le DataFrame nettoyé (facultatif)
df6_cleaned_1.to_csv('C:/Users/Win10/Desktop/projet 2 bdd/cleaned_data_file.csv', index=False)