In [11]:
import pandas as pd
import pickle
import requests
import numpy as np

In [12]:
pickle_folder = '../data/pickles/'

characters = pickle.load(open(pickle_folder + 'characters.p','rb'))
characters.head()

Unnamed: 0,movie_wiki_id,movie_freebase_id,release_date,character_name,birth,gender,height,ethnicity,actor_name,actor_age,freebase_character_actor_map_id,freebase_character_id,freebase_actor_id,release_year,birth_year
0,975900.0,/m/03vyhn,2001-08-24,Akooshay,1958-08-26 00:00:00+00:00,F,1.62,,Wanda De Jesus,42.0,/m/0bgchxw,/m/0bgcj3x,/m/03wcfv7,2001.0,1958.0
1,975900.0,/m/03vyhn,2001-08-24,Lieutenant Melanie Ballard,1974-08-15 00:00:00+00:00,F,1.78,/m/044038p,Natasha Henstridge,27.0,/m/0jys3m,/m/0bgchn4,/m/0346l4,2001.0,1974.0
2,975900.0,/m/03vyhn,2001-08-24,Desolation Williams,1969-06-15 00:00:00+00:00,M,1.727,/m/0x67,Ice Cube,32.0,/m/0jys3g,/m/0bgchn_,/m/01vw26l,2001.0,1969.0
3,975900.0,/m/03vyhn,2001-08-24,Sgt Jericho Butler,1967-09-12 00:00:00+00:00,M,1.75,,Jason Statham,33.0,/m/02vchl6,/m/0bgchnq,/m/034hyc,2001.0,1967.0
4,975900.0,/m/03vyhn,2001-08-24,Bashira Kincaid,1977-09-25 00:00:00+00:00,F,1.65,,Clea DuVall,23.0,/m/02vbb3r,/m/0bgchp9,/m/01y9xg,2001.0,1977.0


In [13]:
freebase_to_wikidata = pickle.load(open(pickle_folder + 'freebase_to_wikidata.p','rb'))
freebase_to_wikidata.head()

Unnamed: 0,freebase_id,wikidata_id
0,/m/0695j,Q6718
1,/m/05nrg,Q538
2,/m/0jgd,Q414
3,/m/0d_23,Q2537
4,/m/04g7d,Q315


### Here I will merge the characters table with the freebase_to_wikidata table. "left_on" is the column with the freebase_id I want to convert to wikidata_id. It's a good idea to compare the length of the dataset before and after the merge.

In [14]:
print(f'Length before merge: {len(characters)}')
characters_with_wikidataID = pd.merge(left=characters, right=freebase_to_wikidata, left_on='movie_freebase_id', right_on='freebase_id', how='left')
print(f'Length after merge: {len(characters_with_wikidataID)}')
print(f'Percentage of successful fit: {sum(characters_with_wikidataID.movie_freebase_id == characters_with_wikidataID.freebase_id)/len(characters_with_wikidataID)}%')
characters_with_wikidataID.head()

Length before merge: 450263
Length after merge: 450263
Percentage of successful fit: 0.6935524349102635%


Unnamed: 0,movie_wiki_id,movie_freebase_id,release_date,character_name,birth,gender,height,ethnicity,actor_name,actor_age,freebase_character_actor_map_id,freebase_character_id,freebase_actor_id,release_year,birth_year,freebase_id,wikidata_id
0,975900.0,/m/03vyhn,2001-08-24,Akooshay,1958-08-26 00:00:00+00:00,F,1.62,,Wanda De Jesus,42.0,/m/0bgchxw,/m/0bgcj3x,/m/03wcfv7,2001.0,1958.0,/m/03vyhn,Q261700
1,975900.0,/m/03vyhn,2001-08-24,Lieutenant Melanie Ballard,1974-08-15 00:00:00+00:00,F,1.78,/m/044038p,Natasha Henstridge,27.0,/m/0jys3m,/m/0bgchn4,/m/0346l4,2001.0,1974.0,/m/03vyhn,Q261700
2,975900.0,/m/03vyhn,2001-08-24,Desolation Williams,1969-06-15 00:00:00+00:00,M,1.727,/m/0x67,Ice Cube,32.0,/m/0jys3g,/m/0bgchn_,/m/01vw26l,2001.0,1969.0,/m/03vyhn,Q261700
3,975900.0,/m/03vyhn,2001-08-24,Sgt Jericho Butler,1967-09-12 00:00:00+00:00,M,1.75,,Jason Statham,33.0,/m/02vchl6,/m/0bgchnq,/m/034hyc,2001.0,1967.0,/m/03vyhn,Q261700
4,975900.0,/m/03vyhn,2001-08-24,Bashira Kincaid,1977-09-25 00:00:00+00:00,F,1.65,,Clea DuVall,23.0,/m/02vbb3r,/m/0bgchp9,/m/01y9xg,2001.0,1977.0,/m/03vyhn,Q261700


### Then, if you want to get the english name of each wikidataID, you can send get requests to wikidata.org
First, you get all the unique ids

In [5]:
unique_movie_ids = pd.DataFrame(characters_with_wikidataID['wikidata_id'].unique(), columns=['wikidata_id'])
print(f'Number of unique ids: {len(unique_movie_ids)}')
unique_movie_ids.head()

Number of unique ids: 38802


Unnamed: 0,wikidata_id
0,Q261700
1,
2,Q4978832
3,Q869644
4,Q2559560


You then send GET requests, **but it can be very slow**.
Here, I will only do it on a sample of 100 ids, and will *timeit* one time one loop

In [6]:
sample_unique_movie_ids = unique_movie_ids.sample(100)
def get_wikidata_title(id):
    json_id = requests.get(f'https://www.wikidata.org/wiki/Special:EntityData/{id}.json').json()
    # return json_id['entities'][id]['labels']['en']['value']
    return [json_id['entities'][x]['labels']['en']['value'] for x in json_id['entities']][0] #fixes redirection problems

%timeit -r 1 -n 1 sample_unique_movie_ids['movie_name']  = sample_unique_movie_ids['wikidata_id'].apply(lambda id: get_wikidata_title(id) if pd.notnull(id) else np.nan)

47.9 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


My results were around 0.4 second per requests

In [7]:
sample_unique_movie_ids.sample(10)

Unnamed: 0,wikidata_id,movie_name
10988,Q7659293,Sword in the Desert
36430,Q994421,The Wing or the Thigh
3675,Q3010562,Going Bye-Bye!
28024,Q3398194,Porky's Duck Hunt
13142,Q369908,La Fille du Puisatier
6379,Q1951430,Mr. Moto's Gamble
1746,Q963574,Shadows of Time
19367,Q126796,Brave
18019,Q2135682,The Donovan Affair
37008,Q399647,Red Desert


Now lets look at unsuccessful GET requests:

In [8]:
print(f'percentage of unsuccessful get request: {sum(sample_unique_movie_ids["movie_name"].isna())/len(sample_unique_movie_ids)}%')

percentage of unsuccessful get request: 0.0%


### Finally you can merge this table with your first table

In [9]:
print(f'Length before merge: {len(characters_with_wikidataID)}')
characters_with_wikidataID_and_english_names = pd.merge(left=characters_with_wikidataID, right=sample_unique_movie_ids, left_on='wikidata_id', right_on='wikidata_id', how='left')
print(f'Length after merge: {len(characters_with_wikidataID_and_english_names)}')
characters_with_wikidataID_and_english_names[['movie_freebase_id', 'movie_name']].loc[characters_with_wikidataID_and_english_names.movie_name.notna()].sample(10)

Length before merge: 450263
Length after merge: 450263


Unnamed: 0,movie_freebase_id,movie_name
279553,/m/0cnzj3j,Tutta la città canta
72058,/m/07k4pw_,The Great Pumpkin
270201,/m/0f4fk9,Say Uncle
4282,/m/02wxzpn,Locura de amor
99787,/m/02qkn0h,White Fang 2: Myth of the White Wolf
319072,/m/02pxz3w,The Red Rider
76623,/m/02h5xf_,The Hoax
920,/m/04lf5n7,Crazylegs
76616,/m/02h5xf_,The Hoax
370915,/m/03cy_2x,The Gambling Ghost


### Don't forget to clean up the now useless wikidata_id and freebase_id columns ;-)

In [10]:
characters_with_english_movie_names = characters_with_wikidataID_and_english_names.drop(['wikidata_id', 'freebase_id'],axis=1)
characters_with_english_movie_names.keys()

Index(['movie_wiki_id', 'movie_freebase_id', 'release_date', 'character_name',
       'birth', 'gender', 'height', 'ethnicity', 'actor_name', 'actor_age',
       'freebase_character_actor_map_id', 'freebase_character_id',
       'freebase_actor_id', 'release_year', 'birth_year', 'movie_name'],
      dtype='object')

In [37]:
movies = pickle.load(open(pickle_folder + 'movies.p','rb'))
len(movies)

81741