In [None]:
import csv
import gzip
import io
import os
import shutil

import pandas as pd
import requests

In [None]:
def download_tsv(suffix, file_name, outdir='./raw_output', timeout=60):
    """Download a gzipped TSV from datasets.imdbws.com and store it decompressed.

    The archive is streamed and decompressed on the fly instead of being held
    fully in memory, and the output is written to a temporary ``.part`` file
    that is only renamed to its final name once the whole download succeeded,
    so an interrupted run never leaves a truncated ``.tsv`` behind.

    Parameters
    ----------
    suffix : str
        File name on the server, appended to ``https://datasets.imdbws.com/``
        (e.g. ``"title.basics.tsv.gz"``).
    file_name : str
        Base name of the local file; ``".tsv"`` is appended.
    outdir : str, optional
        Directory the file is written to; created if it does not exist.
    timeout : float or tuple, optional
        Timeout in seconds passed to ``requests.get``.

    Returns
    -------
    str
        Path of the decompressed TSV file.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.RequestException
        On connection problems or timeouts.
    OSError
        If the payload is not valid gzip data or the file cannot be written.
    """
    os.makedirs(outdir, exist_ok=True)
    fullname = os.path.join(outdir, file_name + ".tsv")
    partial_name = fullname + ".part"
    url = "https://datasets.imdbws.com/" + suffix
    try:
        with requests.get(url, stream=True, timeout=timeout) as response:
            # Fail loudly on HTTP errors instead of feeding an error page to gzip.
            response.raise_for_status()
            # Undo any transport-level Content-Encoding, as response.content would.
            response.raw.decode_content = True
            with gzip.open(response.raw, 'rt', encoding='utf-8') as f_in:
                with open(partial_name, 'w', encoding='utf-8') as f_out:
                    shutil.copyfileobj(f_in, f_out)
        os.replace(partial_name, fullname)
    except BaseException:
        # Also covers a kernel interrupt: never leave a half-written file around.
        if os.path.exists(partial_name):
            os.remove(partial_name)
        raise
    return fullname

In [None]:
# Remote archive name -> local base name, downloaded in this order via download_tsv.
IMDB_DATASETS = {
    "title.akas.tsv.gz": "title_akas",
    "title.basics.tsv.gz": "title_basics",
    "title.episode.tsv.gz": "title_episode",
    "title.principals.tsv.gz": "title_principals",
    "title.ratings.tsv.gz": "title_ratings",
    "title.crew.tsv.gz": "title_crew",
    "name.basics.tsv.gz": "name_basics",
}

# A loop replaces the seven copy-pasted calls and avoids echoing the last
# returned path as stray cell output.
for suffix, file_name in IMDB_DATASETS.items():
    download_tsv(suffix, file_name)

In [None]:
# Load the raw tables used by the rest of the notebook.
#
# quoting=csv.QUOTE_NONE: with pandas' default quote handling, a field that
# starts with a double quote is parsed as a quoted field, which can swallow the
# following tabs/newlines and merge several rows into one. Treating '"' as an
# ordinary character keeps every line a separate row.
#
# The '\N' placeholders are intentionally left as plain strings here because
# the film filter further down compares against that literal.
TSV_READ_OPTIONS = dict(delimiter='\t', encoding='UTF-8', quoting=csv.QUOTE_NONE)

df_title_basics = pd.read_csv('./raw_output/title_basics.tsv', **TSV_READ_OPTIONS)
df_title_ratings = pd.read_csv('./raw_output/title_ratings.tsv', **TSV_READ_OPTIONS)
df_name_basics = pd.read_csv('./raw_output/name_basics.tsv', **TSV_READ_OPTIONS)
df_title_principals = pd.read_csv('./raw_output/title_principals.tsv', **TSV_READ_OPTIONS)

In [None]:
# Build the row mask one condition at a time: movies only, isAdult == 0, and
# none of startYear / runtimeMinutes / genres equal to the literal '\N'
# (presumably the dataset's missing-value marker).
is_movie = df_title_basics['titleType'] == 'movie'
is_not_adult = df_title_basics['isAdult'] == 0
has_year = df_title_basics['startYear'] != '\\N'
has_runtime = df_title_basics['runtimeMinutes'] != '\\N'
has_genres = df_title_basics['genres'] != '\\N'

film_filter = is_movie & is_not_adult & has_year & has_runtime & has_genres

df_films_only = df_title_basics.loc[film_filter]

In [None]:
# Inner join on tconst: only films that also have a ratings row are kept.
df_films = df_films_only.merge(df_title_ratings, how='inner', on='tconst')

In [None]:
# One row per (tconst, genre): split the comma-separated genres string and
# explode the resulting lists, carrying tconst along as the index.
genres_by_film = df_films.set_index('tconst')['genres']

df_genres = (
    genres_by_film
    .str.split(',')
    .explode()
    .reset_index()
)

In [None]:
# Drop the columns the export does not need and switch to the output column
# names. The result is rebound instead of mutated in place, and
# errors='ignore' makes the drop a no-op for columns that are already gone,
# so re-running this cell no longer raises KeyError.
df_films = (
    df_films
    .drop(columns=['titleType', 'originalTitle', 'isAdult', 'endYear', 'genres'], errors='ignore')
    .rename(columns={"tconst": "filmId", "primaryTitle": "filmName", "startYear": "yearReleased"})
)

In [None]:
# Keep principals rows whose category is one of the four listed roles and whose
# title is among the films selected above.
wanted_categories = ['director', 'actor', 'actress', 'writer']
has_wanted_category = df_title_principals['category'].isin(wanted_categories)
belongs_to_kept_film = df_title_principals['tconst'].isin(df_films['filmId'])

cast_filter = has_wanted_category & belongs_to_kept_film

df_film_cast = df_title_principals.loc[cast_filter]

In [None]:
# df_film_cast was produced by slicing df_title_principals, so mutating it with
# inplace=True can trigger pandas' SettingWithCopyWarning. Build a new frame
# and rebind the name instead; errors='ignore' keeps the cell re-runnable once
# the columns have already been dropped.
df_film_cast = (
    df_film_cast
    .drop(columns=['ordering', 'job', 'characters'], errors='ignore')
    .rename(columns={"tconst": "filmId", "nconst": "castCrewId"})
)

In [None]:
# Keep only the people referenced by at least one row of df_film_cast.
referenced_people = df_film_cast['castCrewId']
people_filter = df_name_basics['nconst'].isin(referenced_people)

df_cast = df_name_basics.loc[people_filter]

In [None]:
# df_cast was produced by slicing df_name_basics, so mutating it with
# inplace=True can trigger pandas' SettingWithCopyWarning. Build a new frame
# and rebind the name instead; errors='ignore' keeps the cell re-runnable once
# the columns have already been dropped.
df_cast = (
    df_cast
    .drop(columns=['primaryProfession', 'knownForTitles'], errors='ignore')
    .rename(columns={"nconst": "castCrewId", "primaryName": "castCrewName"})
)

In [None]:
# Write the four result tables as CSV files (without the index column) into
# ./output, creating the directory on first use.
outdir = './output'
if not os.path.exists(outdir):
    os.mkdir(outdir)

exports = (
    (df_films, "films.csv"),
    (df_genres, "genres.csv"),
    (df_cast, "people.csv"),
    (df_film_cast, "films_people.csv"),
)
for frame, csv_name in exports:
    frame.to_csv(os.path.join(outdir, csv_name), index=False)