In [1]:
import pickle
from multiprocessing.pool import Pool

import imdb
import numpy as np
import pandas as pd

# Read the pickled collection of IMDb person identifiers and build a
# one-column frame ("nconst") with the identifiers in sorted order.
# NOTE(review): pickle.load executes arbitrary code from the file; only use
# this with a trusted data/actors.pkl.
with open("data/actors.pkl", mode="rb") as f:
    actor_ids = sorted(pickle.load(f))
    df = pd.DataFrame(actor_ids, columns=["nconst"])

def get_country(nconst):
    """Look up a person on IMDb and return the country they were born in.

    Parameters
    ----------
    nconst : str
        IMDb name identifier such as ``"nm0000001"``. The first two
        characters are dropped before the lookup.

    Returns
    -------
    str
        The last comma-separated component of the person's birth place,
        with any text up to a standalone word ``now`` removed (so
        ``"USSR [now Russia]"`` yields ``"Russia"``) and every character
        other than letters, spaces and hyphens stripped. An empty string is
        returned when no birth place is recorded.

    Notes
    -----
    Prints ``"<country>|<birth place>"`` as a progress trace on every call.
    """
    import re  # local import keeps this block self-contained

    ia = imdb.IMDb()
    person = ia.get_person(nconst[2:])
    birth_place = ""
    country = ""
    if "birth info" in person.data:
        # A birth-info record may exist without a "birth place" entry; treat
        # that the same as having no birth information instead of raising.
        birth_place = person.data["birth info"].get("birth place") or ""
        country = birth_place.split(",")[-1].strip()
        # Split only on the standalone word "now"; a plain substring split
        # would truncate names that merely contain those letters.
        country = re.split(r"\bnow\b", country)[-1].strip()
        country = "".join(ch for ch in country if ch.isalpha() or ch in {" ", "-"})
    print(country + "|" + birth_place)
    return country

def get_countries(df):
    """Add a ``birth_country`` column to ``df`` and return the same frame.

    Each value in the ``nconst`` column is passed to ``get_country``; the
    results are stored in place on ``df`` under ``"birth_country"``.
    """
    countries = df["nconst"].apply(get_country)
    df["birth_country"] = countries
    return df

def parallelize_dataframe(df, func, n_cores=32):
    """Apply ``func`` to row-wise chunks of ``df`` in a process pool.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process.
    func : callable
        Called once per chunk with a DataFrame and expected to return a
        DataFrame. It is handed to ``Pool.map``, so it has to be picklable.
    n_cores : int, default 32
        Number of chunks and of worker processes.

    Returns
    -------
    pandas.DataFrame
        The per-chunk results concatenated in chunk order.

    Raises
    ------
    ValueError
        If ``n_cores`` is smaller than 1.
    """
    if n_cores < 1:
        raise ValueError("number sections must be larger than 0.")

    # Slice by position instead of np.array_split(df, ...): the latter goes
    # through the deprecated DataFrame.swapaxes. Chunk sizes match
    # array_split: the first `extra` chunks receive one additional row.
    base, extra = divmod(len(df), n_cores)
    chunks = []
    start = 0
    for i in range(n_cores):
        stop = start + base + (1 if i < extra else 0)
        chunks.append(df.iloc[start:stop])
        start = stop

    # The context manager releases the workers even when func raises.
    with Pool(n_cores) as pool:
        results = pool.map(func, chunks)
    return pd.concat(results)

In [2]:
# Resolve birth countries for every actor, splitting the lookups across the
# worker pool (each worker prints "<country>|<birth place>" per person).
df = parallelize_dataframe(df, func=get_countries)

USA|Omaha, Nebraska, USA
USA|Helena, Montana, USA
UK|Pontrhydyfen, Wales, UK
Sweden|Stockholm, Sweden
USA|New York City, New York, USA
USA|Lowell, Massachusetts, USA
USA|The Bronx, New York City, New York, USA
USA|New York City, New York, USA
France|Paris, France
USA|Omaha, Nebraska, USA


In [3]:
# Persist the enriched frame so later steps can reuse the lookups.
df.to_pickle(path="data/birth_countries.pkl")
# Display the result as the cell output.
df

Unnamed: 0,nconst,birth_country
0,nm0000001,USA
1,nm0000002,USA
2,nm0000003,France
3,nm0000006,Sweden
4,nm0000007,USA
5,nm0000008,USA
6,nm0000009,UK
7,nm0000010,USA
8,nm0000011,USA
9,nm0000012,USA
