In [None]:
# importation des bibliothèques
import requests
import pandas as pd
import os
import sqlite3, sqlalchemy
from tqdm.notebook import trange, tqdm
from typing import List, Optional
from sqlalchemy import Table, Column, Integer, String, ForeignKey, MetaData, create_engine, text, inspect, Float

# liste des urls pour chaque fichier à telecharger
urls = ['https://datasets.imdbws.com/name.basics.tsv.gz',
       'https://datasets.imdbws.com/title.basics.tsv.gz', 
       'https://datasets.imdbws.com/title.episode.tsv.gz', 
       'https://datasets.imdbws.com/title.principals.tsv.gz', 
       'https://datasets.imdbws.com/title.ratings.tsv.gz']

# telechargement des fichiers
for url in urls:

    filename = url.split('/')[-1]
    target_path = os.path.join("..", "data", "raw", filename)

    response = requests.get(url, stream=True)  
   
    if response.status_code == 200:
        with open(target_path, 'wb') as f:
            f.write(response.raw.read())
            
    for i in trange(1, desc='Statut'):
        print('Fichier téléchargé :', target_path)

In [None]:
# création de la base de données newIMDB.db

db_path = "../data/db/newIMDB.db"

# Suppression de l'ancienne base de données si elle existe
if os.path.exists(db_path):
    os.remove(db_path)

engine = create_engine(f"sqlite:///{db_path}", echo=False)
meta = MetaData()

In [None]:
from sqlalchemy.engine import Engine

def load_table_from_tsv(
    engine: Engine,
    table,
    file_path: str,
    drop_columns: Optional[List[str]] = None,
    rename_columns: Optional[dict] = None,
    sep: str = "\t",
):
    """
    Generic loader for IMDb TSV tables into SQLite.

    Parameters
    ----------
    engine : sqlalchemy Engine
        SQLite engine
    table : sqlalchemy Table
        Target SQLAlchemy table definition
    file_path : str
        Path to .tsv or .tsv.gz file
    drop_columns : list[str], optional
        Columns to drop before insert
    rename_columns : dict, optional
        Column renaming mapping
    sep : str
        Field separator (default: tab)
    """

    # Lire TSV et convertir \N → NULL
    df = pd.read_csv(
        file_path,
        sep=sep,
        na_values=["\\N"],
        low_memory=False
    )

    # Drop ou renommer des colonnes si nécessaire
    if drop_columns:
        for col in drop_columns:
            if col in df.columns:
                df.pop(col)

    if rename_columns:
        df.rename(columns=rename_columns, inplace=True)

    # Convertir DataFrame en liste de tuples
    values = df.to_records(index=False).tolist()

    if not values:
        return  # rien à insérer

    # Créer la table si elle n'existe pas
    table.metadata.create_all(engine)

    # Bulk insert with transaction
    with engine.connect() as connection:
        with connection.begin():
            print(len(values), "rows to insert into", table.name)
            markers = ",".join("?" * len(values[0]))
            sql = f"INSERT OR REPLACE INTO {table.name} VALUES ({markers})"
            connection.execute(sql, values)

    # Libérer la mémoire 
    df = None
    values = None


In [None]:
from sqlalchemy import Table, Column, String, Integer, Float, MetaData

meta = MetaData()

#Creation des tables   
ratings = Table(
    "ratings", meta,
    Column("title_id", String, primary_key=True),
    Column("rating", Float),
    Column("votes", Integer),
    extend_existing=True
)

crew = Table(
    "crew", meta,
    Column("title_id", String , primary_key=True),
    Column("person_id", String, primary_key=True),
    Column("category", String),
    Column("job", String),
    Column("characters", String)
)

episodes = Table(
    "episodes", meta,
    Column("episode_title_id", String, primary_key=True),
    Column("show_title_id", String),
    Column("season_number", Integer),
    Column("episode_number", Integer)
)

titles = Table(
    "titles", meta,
    Column("title_id", String, primary_key=True),
    Column("type", String),
    Column("primary_title", String),
    Column("original_title", String),
    Column("is_adult", Integer),
    Column("premiered", Integer),
    Column("ended", Integer),
    Column("runtime_minutes", Integer),
    Column("genres", String)
)

people = Table(
    "people", meta,
    Column("person_id", String, primary_key=True),
    Column("name", String),
    Column("born", String),
    Column("died", String)
)


In [None]:
################################################## title.ratings.tsv.gz ##################################################
load_table_from_tsv(
    engine,
    ratings,
    "../data/raw/title.ratings.tsv.gz"
)


1627720 rows to insert into ratings


In [None]:
################################################## title.principals.tsv.gz ##################################################
load_table_from_tsv(
    engine,
    crew,
    "../data/raw/title.principals.tsv.gz",
    drop_columns=["ordering"]
)


97390779 rows to insert into crew


In [None]:
################################################## title.episode.tsv.gz ##################################################

load_table_from_tsv(
    engine,
    episodes,
    "../data/raw/title.episode.tsv.gz",
    rename_columns={
        "tconst": "episode_title_id",
        "parentTconst": "show_title_id"
    }
)


9438586 rows to insert into episodes


In [None]:
################################################## title.basics.tsv.gz ##################################################
load_table_from_tsv(
    engine,
    titles,
    "../data/raw/title.basics.tsv.gz",
    rename_columns={
        "tconst": "title_id",
        "titleType": "type",
        "primaryTitle": "primary_title",
        "originalTitle": "original_title",
        "startYear": "premiered",
        "endYear": "ended",
        "runtimeMinutes": "runtime_minutes"
    }
)


12233603 rows to insert into titles


In [None]:
################################################## name.basics.tsv.gz ##################################################
load_table_from_tsv(
    engine,
    people,
    "../data/raw/name.basics.tsv.gz",
    drop_columns=["primaryProfession", "knownForTitles"],
    rename_columns={
        "nconst": "person_id",
        "primaryName": "name",
        "birthYear": "born",
        "deathYear": "died"
    }
)


15031057 rows to insert into people


In [40]:
# affichage du noms des tables dans la base de données newIMDB.db
inspector = inspect(engine)
inspector.get_table_names()

['crew', 'episodes', 'people', 'ratings', 'titles']

In [None]:
# test de requete SQL

engineIMDB = create_engine('sqlite:///../data/db/newIMDB.db')
connIMDB = engineIMDB.connect()

result = connIMDB.execute("SELECT ended FROM titles")
result.fetchall()

[(None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
