In [1]:
import pandas

# Gather Data

[Download datasets from IMDB.](https://datasets.imdbws.com/)

Below is an extract of IMDB's [documentation](https://www.imdb.com/interfaces/).

**title.basics.tsv.gz** - Contains the following information for titles:

- tconst (string) - alphanumeric unique identifier of the title
- titleType (string) – the type/format of the title (e.g. movie, short, tvseries, tvepisode, video, etc)
- primaryTitle (string) – the more popular title / the title used by the filmmakers on promotional materials at the point of release
- originalTitle (string) - original title, in the original language
- isAdult (boolean) - 0: non-adult title; 1: adult title
- startYear (YYYY) – represents the release year of a title. In the case of TV Series, it is the series start year
- endYear (YYYY) – TV Series end year. ‘\N’ for all other title types
- runtimeMinutes – primary runtime of the title, in minutes
- genres (string array) – includes up to three genres associated with the title

**title.akas.tsv.gz** - Contains the following information for titles:

- titleId (string) - a tconst, an alphanumeric unique identifier of the title
- ordering (integer) – a number to uniquely identify rows for a given titleId
- title (string) – the localized title
- region (string) - the region for this version of the title
- language (string) - the language of the title
- types (array) - Enumerated set of attributes for this alternative title. One or more of the following: "alternative", "dvd", "festival", "tv", "video", "working", "original", "imdbDisplay". New values may be added in the future without warning
- attributes (array) - Additional terms to describe this alternative title, not enumerated
- isOriginalTitle (boolean) – 0: not original title; 1: original title

**title.principals.tsv.gz** – Contains the principal cast/crew for titles

- tconst (string) - alphanumeric unique identifier of the title
- ordering (integer) – a number to uniquely identify rows for a given titleId
- nconst (string) - alphanumeric unique identifier of the name/person
- category (string) - the category of job that person was in
- job (string) - the specific job title if applicable, else '\N'
- characters (string) - the name of the character played if applicable, else '\N'

**name.basics.tsv.gz** – Contains the following information for names:
- nconst (string) - alphanumeric unique identifier of the name/person
- primaryName (string)– name by which the person is most often credited
- birthYear – in YYYY format
- deathYear – in YYYY format if applicable, else '\N'
- primaryProfession (array of strings)– the top-3 professions of the person
- knownForTitles (array of tconsts) – titles the person is known for

In [2]:
title_basics = pandas.read_csv("data/title.basics.tsv", delimiter="\t").set_index("tconst")
title_basics = title_basics[(title_basics["titleType"] == "movie") & (title_basics["isAdult"] == 0)]
title_basics.shape

  exec(code_obj, self.user_global_ns, self.user_ns)


(595129, 8)

In [3]:
title_akas = pandas.read_csv("data/title.akas.tsv", delimiter="\t")
title_akas = title_akas[(title_akas["region"] == "FR") & (title_akas["types"] == "imdbDisplay")].set_index("titleId")
title_akas.shape

  exec(code_obj, self.user_global_ns, self.user_ns)


(141772, 7)

In [4]:
title_principals = pandas.read_csv("data/title.principals.tsv", delimiter="\t")
title_principals = title_principals[title_principals["category"] == "actor"]
title_principals.shape

(11085656, 6)

In [5]:
name_basics = pandas.read_csv("data/name.basics.tsv", delimiter="\t").set_index("nconst")
name_basics.shape

(11549682, 5)

In [6]:
merged = pandas.merge(
    left=pandas.merge(
        left=title_principals[["tconst", "nconst", "characters"]],
        right=name_basics[["primaryName", "birthYear", "deathYear"]],
        left_on="nconst",
        right_index=True
    ),
    right=pandas.merge(
        left=title_basics[["primaryTitle", "startYear", "genres"]],
        right=title_akas[["title"]],
        left_index=True,
        right_index=True
    ),
    left_on="tconst",
    right_index=True
)
merged.shape

(117986, 10)

In [7]:
merged

Unnamed: 0,tconst,nconst,characters,primaryName,birthYear,deathYear,primaryTitle,startYear,genres,title
89929,tt0012927,nm0179163,"[""Panatella""]",James J. Corbett,1866,1933,The Beauty Shop,1922,Comedy,Charlatan
89927,tt0012927,nm0386879,"[""Dr. Arbutus Budd""]",Raymond Hitchcock,1865,1929,The Beauty Shop,1922,Comedy,Charlatan
89928,tt0012927,nm0888398,"[""Sobini""]",Billy B. Van,1878,1950,The Beauty Shop,1922,Comedy,Charlatan
151789,tt0020403,nm0183823,"[""The Minister - Guillotine Sequence""]",William Courtenay,1875,1933,Show of Shows,1929,Musical,La revue des revues
151791,tt0020403,nm0098376,"[""Executioner - Guillotine Sequence""]",Hobart Bosworth,1867,1943,Show of Shows,1929,Musical,La revue des revues
...,...,...,...,...,...,...,...,...,...,...
49682432,tt9850378,nm7105632,"[""Doctor""]",Dmitriy Shadrin,\N,\N,Man'najgy taptal,2015,Comedy,Premier amour
49735986,tt9868718,nm0546278,"[""Man Ray""]",Marc Marder,\N,\N,Man Ray et les équations shakespeariennes,\N,Documentary,Man Ray et les équations shakespeariennes
49762194,tt9879926,nm1300879,"[""Presentator""]",Philippe Manoeuvre,1954,\N,Rebel anthology,2012,Documentary,Rebel anthology
49808020,tt9896284,nm11380141,"[""David""]",James Hemerson,\N,\N,Who are you?,\N,Drama,Who are you?


In [8]:
merged.to_csv("data.csv", index=False)