In [1]:
import pandas as pd
import os
import seaborn as sns
import requests
import json
from bs4 import BeautifulSoup as bs
import lxml.html as lh
from requests_html import HTMLSession
import re
from tqdm import tqdm
import pickle

## Import data

In [8]:
# Root of the data directory; every other path in the notebook is derived from it.
data_folder = 'data/'
# Sub-folder holding pickled intermediates.
pickle_folder = f'{data_folder}pickles/'
# Sub-folder holding the movie summaries files.
MovieSummary_folder = f'{data_folder}MovieSummaries/'

In [15]:
# Freebase -> Wikidata ID mapping.
# NOTE(review): pickle.load can execute arbitrary code from the file, and this
# mapping was obtained online -- only load a copy from a source you trust.
# The `with` block guarantees the file handle is closed after loading.
with open(pickle_folder + 'freebase_to_wikidata.p', 'rb') as mapping_file:
    freebase_to_wikidata = pickle.load(mapping_file)

# The metadata TSV has no header row, so the column names are supplied here.
movie_columns = [
    "Wikipedia ID",
    "Freebase ID",
    "name",
    "release date",
    "box office revenue",
    "runtime",
    "languages",
    "countries",
    "genres",
]
movies = pd.read_csv(
    MovieSummary_folder + "movie.metadata.tsv",
    sep='\t',
    header=None,
    names=movie_columns,
)


## Use wikipedia ID


To obtain as many Wikidata IDs as possible, as quickly as possible, the first step is to use the `freebase_to_wikidata` dataframe we found online, which maps Freebase IDs to Wikidata IDs.

In [16]:
# Preview the first five rows of the Freebase -> Wikidata mapping.
freebase_to_wikidata.head(n=5)

Unnamed: 0,freebase_id,wikidata_id
0,/m/0695j,Q6718
1,/m/05nrg,Q538
2,/m/0jgd,Q414
3,/m/0d_23,Q2537
4,/m/04g7d,Q315


In [17]:
# Preview the first five rows of the movie metadata.
movies.head(n=5)


Unnamed: 0,Wikipedia ID,Freebase ID,name,release date,box office revenue,runtime,languages,countries,genres
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D..."
3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic..."
4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}"


We now left-merge `movies` with `freebase_to_wikidata` on the Freebase ID. This already fills in the `wikidata_id` for some of our films.

In [18]:
# Align the key column name with the mapping table, then left-join so that
# movies without a known Wikidata ID are kept (with NaN in `wikidata_id`).
movies = (
    movies
    .rename(columns={"Freebase ID": "freebase_id"})
    .merge(freebase_to_wikidata, on="freebase_id", how="left")
)
movies.head()

Unnamed: 0,Wikipedia ID,freebase_id,name,release date,box office revenue,runtime,languages,countries,genres,wikidata_id
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...",Q261700
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp...",
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D...",Q4978832
3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic...",
4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}",Q869644


In [20]:
# Number of movies whose Wikidata ID is still NaN after the merge.
n_missing_ids = movies["wikidata_id"].isna().sum()
print(f'There are still {n_missing_ids} missing wikidata ids')

There are still 36326 missing wikidata ids


To fill in as many of the remaining Wikidata IDs as possible, we scrape them from the corresponding Wikipedia pages.

In [1]:
def get_wd_id(wikipedia_id, timeout=10):
    """Look up the Wikidata ID of an English Wikipedia page by scraping its HTML.

    The page is fetched through its ``?curid=`` URL and the value following
    ``wgWikibaseItemId":"`` in the page source is extracted.

    Parameters
    ----------
    wikipedia_id : int or str
        Wikipedia page ID; it is converted with ``str`` and appended to the URL.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10). Without
        a timeout a single stalled connection would block the scrape forever.

    Returns
    -------
    str or None
        The Wikidata ID (e.g. ``"Q261700"``), or ``None`` when the request
        fails, the server answers with an error status, or the page source
        contains no ``wgWikibaseItemId`` entry.
    """
    url = "https://en.wikipedia.org/?curid=" + str(wikipedia_id)

    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Best effort: a transient network failure on one page should leave
        # that ID missing rather than abort a scrape that runs for hours.
        return None

    # Error responses (404 for unknown page IDs, but also other 4xx/5xx such
    # as rate limiting) do not describe the requested page.
    if response.status_code >= 400:
        return None

    # Raw string so that \w is a regex class, not an invalid string escape.
    match = re.search(r'wgWikibaseItemId":"(\w+)', response.text)
    if match is None:
        return None
    return match.group(1)

In [23]:
# Keep only the movies that still have no Wikidata ID, and drop the (all-NaN)
# ID column so it can be re-inserted once the scraped values are available.
missing_id_mask = movies["wikidata_id"].isna()
remaining = movies.loc[missing_id_mask].drop(columns=["wikidata_id"])
remaining.head()

Since running the whole web scraping takes multiple hours, we split the remaining dataset into 3 subsets, so that we have checkpoints during the web scraping.

In [25]:
# Number of movies still lacking a Wikidata ID. Deliberately NOT named `len`:
# rebinding that name shadows the builtin and makes every later `len(...)`
# call in the kernel fail with a TypeError.
n_remaining = remaining.shape[0]
third = n_remaining // 3

# Three consecutive chunks; the last one absorbs the remainder of the division.
remaining1 = remaining.iloc[:third]
remaining2 = remaining.iloc[third:2 * third]
remaining3 = remaining.iloc[2 * third:]

Here are the three scraping runs; they took about 4 hours in total. We save each result to a pickle to avoid losing progress if the Jupyter kernel crashes.

In [None]:
# Register `progress_apply` on pandas objects so the scrape shows a progress bar.
tqdm.pandas()
# One HTTP request per Wikipedia page ID; the results become the first column.
scraped_ids_1 = remaining1["Wikipedia ID"].progress_apply(get_wd_id)
remaining1.insert(0, "wikidata_id", scraped_ids_1)

100%|██████████| 12108/12108 [39:38<00:00,  5.09it/s] 


In [None]:
# Checkpoint: persist the first scraped chunk.
rem1_path = pickle_folder + "rem1.pkl"
remaining1.to_pickle(rem1_path)

In [None]:
# Register `progress_apply` on pandas objects so the scrape shows a progress bar.
tqdm.pandas()
# One HTTP request per Wikipedia page ID; the results become the first column.
scraped_ids_2 = remaining2["Wikipedia ID"].progress_apply(get_wd_id)
remaining2.insert(0, "wikidata_id", scraped_ids_2)

100%|██████████| 12108/12108 [41:04<00:00,  4.91it/s]  


In [None]:
# Checkpoint: persist the second scraped chunk.
rem2_path = pickle_folder + "rem2.pkl"
remaining2.to_pickle(rem2_path)

In [None]:
# Register `progress_apply` on pandas objects so the scrape shows a progress bar.
tqdm.pandas()
# One HTTP request per Wikipedia page ID; the results become the first column.
scraped_ids_3 = remaining3["Wikipedia ID"].progress_apply(get_wd_id)
remaining3.insert(0, "wikidata_id", scraped_ids_3)

100%|██████████| 12110/12110 [2:08:59<00:00,  1.56it/s]  


In [None]:
# Checkpoint: persist the third scraped chunk.
rem3_path = pickle_folder + "rem3.pkl"
remaining3.to_pickle(rem3_path)

We combine the three scraped subsets back into a single dataframe.

In [None]:
# Stack the three scraped chunks back together in their original order.
# `DataFrame.append` is deprecated and no longer exists in pandas 2.0;
# `pd.concat` produces the same rows and keeps the original index labels,
# which the later index-aligned `fillna` relies on.
masterclass = pd.concat([remaining1, remaining2, remaining3])

  masterclass = remaining1.append(remaining2.append(remaining3))
  masterclass = remaining1.append(remaining2.append(remaining3))


In [None]:
# Fill the gaps with the scraped IDs; fillna aligns the two Series on the
# row index, so each scraped value lands on the movie it was fetched for.
scraped_ids = masterclass["wikidata_id"]
movies["wikidata_id"] = movies["wikidata_id"].fillna(scraped_ids)

In [29]:
# Movies for which neither the mapping nor the scrape produced a Wikidata ID.
n_missing_entries = movies["wikidata_id"].isna().sum()
print(f'There are still {n_missing_entries} missing wikidata entries')

There are still 4027 missing wikidata entries


In [None]:
# Final column order: the three identifier columns first, then the metadata.
new_cols = [
    "Wikipedia ID",
    "freebase_id",
    "wikidata_id",
    "name",
    "release date",
    "box office revenue",
    "runtime",
    "languages",
    "countries",
    "genres",
]
movies = movies.loc[:, new_cols]
movies

Unnamed: 0,Wikipedia ID,freebase_id,wikidata_id,name,release date,box office revenue,runtime,languages,countries,genres
0,975900,/m/03vyhn,Q261700,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,3196793,/m/08yl5d,Q16250726,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."
2,28463795,/m/0crgdbh,Q4978832,Brun bitter,1988,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D..."
3,9363483,/m/0285_cd,Q7995657,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic..."
4,261236,/m/01mrr1,Q869644,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}"
...,...,...,...,...,...,...,...,...,...,...
81736,35228177,/m/0j7hxnt,Q6819873,Mermaids: The Body Found,2011-03-19,,120.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/07s9rl0"": ""Drama""}"
81737,34980460,/m/0g4pl34,Q12125420,Knuckle,2011-01-21,,96.0,"{""/m/02h40lc"": ""English Language""}","{""/m/03rt9"": ""Ireland"", ""/m/07ssc"": ""United Ki...","{""/m/03bxz7"": ""Biographical film"", ""/m/07s9rl0..."
81738,9971909,/m/02pygw1,Q4770308,Another Nice Mess,1972-09-22,,66.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/06nbt"": ""Satire"", ""/m/01z4y"": ""Comedy""}"
81739,913762,/m/03pcrp,Q2663931,The Super Dimension Fortress Macross II: Lover...,1992-05-21,,150.0,"{""/m/03_9r"": ""Japanese Language""}","{""/m/03_3d"": ""Japan""}","{""/m/06n90"": ""Science Fiction"", ""/m/0gw5n2f"": ..."


We save the completed dataframe as a pickle for further use.

In [None]:
# Persist the movie table, now including Wikidata IDs.
# NOTE(review): this is written to the working directory, not `pickle_folder`
# like the checkpoints above -- confirm that is where downstream code reads it.
movies_pickle_path = "movies_with_wikidata_id.pkl"
movies.to_pickle(movies_pickle_path)