In [2]:
import pickle
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import requests
import dask.dataframe as dd


In [3]:
data_folder = '../data/'
pickle_folder = '../data/pickles/'

# Load the movies table, which has a `wikidata_id` column.
# NOTE: unpickling can execute arbitrary code; only load pickles produced by this project.
# The context manager closes the file handle as soon as loading finishes.
with open(pickle_folder + 'movies_with_wikidata_id.p', 'rb') as movies_file:
    movies_with_wikidata_id = pickle.load(movies_file)

# Distinct Wikidata IDs to query; kept as the last expression so the cell displays them.
unique_movie_ids = movies_with_wikidata_id.wikidata_id.unique()
unique_movie_ids

array(['Q261700', 'Q16250726', 'Q4978832', ..., 'Q4770308', 'Q2663931',
       'Q7578560'], dtype=object)

This should take around 10 hours, so let's do it in 10 runs of about one hour each.

In [4]:
# Quick look at the first ten Wikidata IDs.
unique_movie_ids[0:10]

array(['Q261700', 'Q16250726', 'Q4978832', 'Q7995657', 'Q869644',
       'Q3067285', 'Q7765318', 'Q1619977', 'Q5165212', 'Q194106'],
      dtype=object)

In [5]:
from multiprocessing import Pool

# Wikidata property IDs to extract for every movie; the trailing comments name
# the DataFrame column each property ends up in further down the notebook.
wanted_features = [
    'P577',   # -> release_date_wd
    'P444',   # -> ratings_wd
    'P2142',  # -> box_office_wd
    'P646',   # -> freebase_id_wd
    'P345',   # -> IMDB_id
]

def get_wikidata_title(id):
    """Fetch the Wikidata entity for ``id`` and extract the wanted claims.

    Despite its name, this function does not return the entity title: it
    requests the ``Special:EntityData`` JSON document and, for every property
    listed in the module-level ``wanted_features``, stores the value of the
    first claim's main snak.

    Parameters
    ----------
    id : str
        Wikidata entity ID to look up.

    Returns
    -------
    list
        ``[id, features]`` when the HTTP response is OK, where ``features`` is
        a dict holding ``'new_wikidata_id'`` (the first entity key found in
        the response) plus one entry per wanted property that could be read.
        ``[id, status_code]`` when the HTTP response is not OK; the status
        code is also printed.
    """
    response = requests.get(f'https://www.wikidata.org/wiki/Special:EntityData/{id}.json')
    if not response.ok:
        print(response.status_code)
        return [id, response.status_code]

    entities = response.json()['entities']
    # Use the key the response is filed under rather than the requested ID
    # (presumably they differ for redirected entities -- TODO confirm).
    key = list(entities)[0]
    features = {'new_wikidata_id': key}
    for feature in wanted_features:
        try:
            features[feature] = entities[key]['claims'][feature][0]['mainsnak']['datavalue']['value']
        except (KeyError, IndexError, TypeError):
            # Property missing, empty claim list, or a snak without a
            # datavalue: skip it. Only lookup errors are swallowed so that
            # KeyboardInterrupt / SystemExit still stop a long-running fetch.
            continue
    return [id, features]


# Query every unique movie ID with six worker processes. `imap` yields the
# results in input order, and wrapping it in tqdm gives a progress bar.
total_len = len(unique_movie_ids)
collection = list()

with Pool(processes=6) as workers:
    fetched = workers.imap(get_wikidata_title, unique_movie_ids)
    collection = list(tqdm(fetched, total=total_len))


  0%|          | 0/77109 [00:00<?, ?it/s]

400
404
404
404
404
404
404
404
404
404
404
404
404
404
404
404
404
404
404
404
404
404


In [7]:
# Checkpoint the raw fetch results. The context manager guarantees the file is
# flushed and closed instead of leaving that to garbage collection.
with open('temp_collection_movies.p', 'wb') as checkpoint_file:
    pickle.dump(collection, checkpoint_file)


In [66]:
df = pd.DataFrame(collection)

# Column 0 is the requested ID; column 1 is a dict of features on success and
# an HTTP status code on failure. Treat every non-dict entry as a failed
# lookup so that codes other than 400/404 (e.g. 429 or 5xx) are not silently
# carried along as if they were valid entities.
fetch_failed = df[1].apply(lambda entry: not isinstance(entry, dict)).astype(bool)

failed_ids = df.loc[fetch_failed]
with open('failed_movie_wikidata_ids.p', 'wb') as failed_file:
    pickle.dump(failed_ids, failed_file)

# Keep only the successful rows, indexed by the requested Wikidata ID.
df = df.loc[~fetch_failed].set_index(0)
df.head()

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
Q261700,"{'new_wikidata_id': 'Q261700', 'P577': {'time'..."
Q16250726,"{'new_wikidata_id': 'Q16250726', 'P577': {'tim..."
Q4978832,"{'new_wikidata_id': 'Q4978832', 'P577': {'time..."
Q7995657,"{'new_wikidata_id': 'Q7995657', 'P577': {'time..."
Q869644,"{'new_wikidata_id': 'Q869644', 'P577': {'time'..."


In [67]:
def get_feature(x, feature, fct):
    """Return ``fct(x[feature])``, or ``np.nan`` when that cannot be computed.

    Parameters
    ----------
    x : mapping
        Container the feature is looked up in.
    feature : hashable
        Key to read from ``x``.
    fct : callable
        Transformation applied to ``x[feature]``.

    Returns
    -------
    object
        The transformed value, or ``np.nan`` if the lookup or the
        transformation raised an ordinary exception.
    """
    try:
        return fct(x[feature])
    except Exception:
        # Best-effort extraction: a missing key or malformed value becomes NaN.
        # `Exception` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate so the notebook can still be interrupted.
        return np.nan

def _parse_wikidata_time(claim_value):
    """Turn the ``'time'`` string of a Wikidata time value into a Timestamp.

    Strips the leading ``'+'``, the ``'T00:00:00Z'`` suffix and any ``-00``
    month/day placeholders so that year-only or year-month values still parse.
    """
    cleaned = (
        claim_value['time']
        .removeprefix('+')
        .removesuffix('T00:00:00Z')
        .replace('-00-00', '')
        .replace('-00', '')
    )
    return pd.Timestamp(cleaned)


# Column 1 holds the per-movie feature dict; get_feature yields NaN whenever a
# feature is missing or cannot be converted.
entity_features = df[1]

df['new_wikidata_id'] = entity_features.apply(lambda x: get_feature(x, 'new_wikidata_id', lambda y: y))
# Parse each date on its own and coerce failures to NaT. The previous
# `errors='ignore'` is deprecated in pandas and, on a single bad value,
# returned the whole column as unparsed strings.
df['release_date_wd'] = pd.to_datetime(
    entity_features.apply(lambda x: get_feature(x, 'P577', _parse_wikidata_time)),
    errors='coerce',
)
df['box_office_wd'] = entity_features.apply(lambda x: get_feature(x, 'P2142', lambda y: int(y['amount'].removeprefix('+'))))
df['ratings_wd'] = entity_features.apply(lambda x: get_feature(x, 'P444', lambda y: y))
df['freebase_id_wd'] = entity_features.apply(lambda x: get_feature(x, 'P646', lambda y: y))
df['IMDB_id'] = entity_features.apply(lambda x: get_feature(x, 'P345', lambda y: y))
# The raw dict column is no longer needed once every feature has its own column.
df = df.drop(labels=[1], axis=1)
df.head()

Unnamed: 0_level_0,new_wikidata_id,release_date_wd,box_office_wd,ratings_wd,freebase_id_wd,IMDB_id
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Q261700,Q261700,2001-01-01,,22%,/m/03vyhn,tt0228333
Q16250726,Q16250726,2000-01-01,,,/m/08yl5d,tt0245916
Q4978832,Q4978832,1988-01-01,,,/m/0crgdbh,tt0094806
Q7995657,Q7995657,1987-01-01,,50%,/m/0285_cd,tt0094320
Q869644,Q869644,1983-01-01,,,/m/01mrr1,tt0083949


In [68]:
# Inner-join the original movie table with the Wikidata-derived columns,
# matching its `wikidata_id` column against the index of `df`.
movies = movies_with_wikidata_id.merge(
    df,
    how='inner',
    left_on='wikidata_id',
    right_index=True,
)
movies.columns

Index(['Wikipedia ID', 'freebase_id', 'wikidata_id', 'name', 'release date',
       'box office revenue', 'runtime', 'languages', 'countries', 'genres',
       'new_wikidata_id', 'release_date_wd', 'box_office_wd', 'ratings_wd',
       'freebase_id_wd', 'IMDB_id'],
      dtype='object')

In [72]:
def non_matching_features(df, feature1, feature2):
    """Return the rows where both columns are present but hold different values.

    Only the two compared columns are kept in the result.
    """
    first, second = df[feature1], df[feature2]
    both_present = first.notna() & second.notna()
    differ = first.ne(second)
    return df.loc[differ & both_present, [feature1, feature2]]

def at_least_one_feature(df, feature1, feature2):
    """Return the rows where exactly one of the two columns has a value.

    Rows with both values present, or both missing, are excluded. Only the
    two compared columns are kept in the result.
    """
    first_present = df[feature1].notna()
    second_present = df[feature2].notna()
    # XOR: present in one column and missing in the other.
    only_one = first_present ^ second_present
    return df.loc[only_one, [feature1, feature2]]

# Movies for which only one of the two box-office columns has a value.
at_least_one_feature(
    movies,
    feature1='box office revenue',
    feature2='box_office_wd',
)

Unnamed: 0,box office revenue,box_office_wd
0,14010832.0,
7,3600000.0,
13,10161099.0,
21,21521.0,
29,29381649.0,
...,...,...
81720,15369573.0,
81725,6944471.0,
81726,22886222.0,
81732,1847671.0,


In [73]:
# Persist the merged table. The context manager guarantees the file is flushed
# and closed instead of leaving that to garbage collection.
with open(pickle_folder + 'movies_with_wikidata_entries_separate.p', 'wb') as movies_out_file:
    pickle.dump(movies, movies_out_file)