In [1]:
import os
import pandas as pd

# Data Augmentation With Wikidata
In this notebook we show how to use KGTK to augment the Rotten Tomatoes movie dataset from Kaggle with counts of movie and cast awards taken from Wikidata.

Download `rotten_tomatoes_movies.csv` from https://www.kaggle.com/stefanoleone992/rotten-tomatoes-movies-and-critic-reviews-dataset?select=rotten_tomatoes_movies.csv. Once you have downloaded the file, define an environment variable that records where you put it.

**Edit the cell below to record the path where you put the downloaded file**

In [2]:
# Path of the downloaded Kaggle file, exported as $RT so that the shell commands below can read it.
# It is resolved against the current user's home directory instead of a fixed account;
# edit the path if you saved the file somewhere other than ~/Downloads.
os.environ['RT'] = os.path.expanduser("~/Downloads/rotten_tomatoes_movies.csv")

In [3]:
# Print the header row of the Kaggle CSV to see which columns it provides.
# The path is quoted so that a location containing spaces is passed to head as a single argument.
!head -n 1 "$RT"

rotten_tomatoes_link,movie_title,movie_info,critics_consensus,content_rating,genres,directors,authors,actors,original_release_date,streaming_release_date,runtime,production_company,tomatometer_status,tomatometer_rating,tomatometer_count,audience_status,audience_rating,audience_count,tomatometer_top_critics_count,tomatometer_fresh_critics_count,tomatometer_rotten_critics_count


**Some preliminaries to set up KGTK to access the Wikidata files. You need to get the files from our download site (instructions coming soon)**

In [4]:
# Parameters

# Folder on the local machine under which the output and temporary folders live.
# Resolved against the current user's home directory so the notebook is not tied to one account.
output_path = os.path.expanduser("~/Downloads/kypher")

# The names of the output and temporary folders
output_folder = "wikidata_os_v5"
temp_folder = "temp.wikidata_os_v5"

# The location of the input Wikidata files.
# The wikidata_os files can be downloaded from https://drive.google.com/drive/folders/1V6oAQKmwQ4LJnrBai-uv5gHWphFSCt50?usp=sharing
# The empty last component makes the value end with a path separator, so a file name can be appended to it directly.
# Alternative location on the shared drive:
# wikidata_folder = "/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20200803-v4/"
wikidata_folder = os.path.join(output_path, "wikidata_os_v1", "")

# Location (folder) of the cache database for kypher
cache_path = os.path.join(output_path, "temp.useful_wikidata_files_v4")
# Whether to delete the cache database
delete_database = False

# shortcuts to commands
kgtk = "time kgtk --debug"
# kgtk = "kgtk --debug"

In [5]:
# The names of files in the KGTK Wikidata distribution that we will use in this notebook.
file_names = {
    "claims": "claims.tsv.gz",
    "label": "labels.en.tsv.gz",
    "alias": "aliases.en.tsv.gz",
    "description": "descriptions.en.tsv.gz",
    "item": "claims.wikibase-item.tsv.gz",
    "qualifiers": "qualifiers.tsv.gz",
    "sitelinks": "sitelinks.tsv.gz",
    "qualifiers_time": "qualifiers.time.tsv.gz",
    "property_datatypes": "metadata.property.datatypes.tsv.gz",
    "isa": "derived.isa.tsv.gz",
    "p279star": "derived.P279star.tsv.gz",
    "p279": "derived.P279.tsv.gz",
    "p31": "derived.P31.tsv.gz"
}

# We will define environment variables to hold the full paths to the files as we will use them in the shell commands.
# The names are collected in this list so they can be printed at the end of the cell.
kgtk_environment_variables = []

os.environ['WIKIDATA'] = wikidata_folder
kgtk_environment_variables.append('WIKIDATA')

# Every key becomes an upper-case environment variable (e.g. "claims" -> $CLAIMS) holding the full file path.
# os.path.join yields the same path whether or not wikidata_folder ends with a separator.
for key, value in file_names.items():
    variable = key.upper()
    os.environ[variable] = os.path.join(wikidata_folder, value)
    kgtk_environment_variables.append(variable)

# KGTK creates a SQLite database to index the knowledge graph.
# Use the configured cache folder when there is one; otherwise keep the database in the temporary folder.
if cache_path:
    os.environ['STORE'] = os.path.join(cache_path, "wikidata.sqlite3.db")
else:
    os.environ['STORE'] = os.path.join(output_path, temp_folder, "wikidata.sqlite3.db")
kgtk_environment_variables.append('STORE')

# We will create many temporary files, so set up a folder for outputs and one for the temporary files.
os.environ['TEMP'] = os.path.join(output_path, temp_folder)
os.environ['OUT'] = os.path.join(output_path, output_folder)
kgtk_environment_variables.append('TEMP')
kgtk_environment_variables.append('OUT')

# Environment variables with shortcuts to the commands we use often
os.environ['kgtk'] = kgtk
os.environ['kypher'] = "kgtk query --graph-cache " + os.environ['STORE']
# Timed, verbose variant of the query shortcut:
# os.environ['kypher'] = "time kgtk --debug query --graph-cache " + os.environ['STORE']
kgtk_environment_variables.append('kgtk')
kgtk_environment_variables.append('kypher')

# We'll save the current working directory so we can call into other example notebooks later
os.environ["EXAMPLES_DIR"] = os.getcwd()
kgtk_environment_variables.append('EXAMPLES_DIR')

# Print every variable defined above, in alphabetical order, for reference.
kgtk_environment_variables.sort()
for variable in kgtk_environment_variables:
    print("{}: \"{}\"".format(variable, os.environ[variable]))

ALIAS: "/Users/pedroszekely/Downloads/kypher/wikidata_os_v1/aliases.en.tsv.gz"
CLAIMS: "/Users/pedroszekely/Downloads/kypher/wikidata_os_v1/claims.tsv.gz"
DESCRIPTION: "/Users/pedroszekely/Downloads/kypher/wikidata_os_v1/descriptions.en.tsv.gz"
EXAMPLES_DIR: "/Users/pedroszekely/Documents/GitHub/kgtk/examples"
ISA: "/Users/pedroszekely/Downloads/kypher/wikidata_os_v1/derived.isa.tsv.gz"
ITEM: "/Users/pedroszekely/Downloads/kypher/wikidata_os_v1/claims.wikibase-item.tsv.gz"
LABEL: "/Users/pedroszekely/Downloads/kypher/wikidata_os_v1/labels.en.tsv.gz"
OUT: "/Users/pedroszekely/Downloads/kypher/wikidata_os_v5"
P279: "/Users/pedroszekely/Downloads/kypher/wikidata_os_v1/derived.P279.tsv.gz"
P279STAR: "/Users/pedroszekely/Downloads/kypher/wikidata_os_v1/derived.P279star.tsv.gz"
P31: "/Users/pedroszekely/Downloads/kypher/wikidata_os_v1/derived.P31.tsv.gz"
PROPERTY_DATATYPES: "/Users/pedroszekely/Downloads/kypher/wikidata_os_v1/metadata.property.datatypes.tsv.gz"
QUALIFIERS: "/Users/pedroszeke

In [6]:
# Make output_path the working directory for the cells that follow.
%cd {output_path}

/Users/pedroszekely/Downloads/kypher


### Import the Rotten Tomatoes movie file in KGTK

1. use the `cat` command to convert it to TSV format 
1. use the `rename-columns` command to rename the `rotten_tomatoes_link` column to `node1` so now we have a legal KGTK file
1. store the result in `rt-movies.tsv`

In [7]:
# One kgtk pipeline (commands are chained with "/"): read the CSV at $RT, rename the
# rotten_tomatoes_link column to node1, and write the result to $TEMP/rt-movies.tsv.
# NOTE(review): --mode NONE presumably relaxes KGTK's required-column checks, since this table is not an edge file - confirm.
!$kgtk cat --mode NONE -i $RT --input-format csv \
/ rename-columns --mode NONE --old-columns rotten_tomatoes_link --new-columns node1 \
-o $TEMP/rt-movies.tsv

        7.37 real         5.84 user         0.56 sys


In [8]:
# Show the header and the first data row of the converted file.
!head -2 $TEMP/rt-movies.tsv

node1	movie_title	movie_info	critics_consensus	content_rating	genres	directors	authors	actors	original_release_date	streaming_release_date	runtime	production_company	tomatometer_status	tomatometer_rating	tomatometer_count	audience_status	audience_rating	audience_count	tomatometer_top_critics_count	tomatometer_fresh_critics_count	tomatometer_rotten_critics_count
m/0814255	Percy Jackson & the Olympians: The Lightning Thief	"Always trouble-prone, the life of teenager Percy Jackson (Logan Lerman) gets a lot more complicated when he learns he\'s the son of the Greek god Poseidon. At a training ground for the children of deities, Percy learns to harness his divine powers and prepare for the adventure of a lifetime: he must prevent a feud among the Olympians from erupting into a devastating war on Earth, and rescue his mother from the clutches of Hades, god of the underworld."	"Though it may seem like just another Harry Potter knockoff, Percy Jackson benefits from a strong supporting cast, a 

### Query Wikidata to get the count of awards for each movie in the Kaggle dataset 

- `rt-movies.tsv`: the Rotten Tomatoes KGTK file
- `claims`: the Wikidata file containing all claims (statements)
- `movies: (movie_id)-[]->()`: assign to the `movie_id` variable all the movie ids in the Rotten Tomatoes file (the query refers to that file through the alias `movies`)
- `claims: (movie)-[:P1258]->(rt_id)`: get the Rotten Tomatoes ids for all movies in Wikidata; note that the ids in Wikidata are strings
- `claims: (movie)-[aid:P166]->(award)`: get the awards of the movies that have Rotten Tomatoes ids
- `--where 'kgtk_unstringify(rt_id) = movie_id'`: do the join on the `movie_id` in the input file and the Rotten Tomatoes ids `rt_id` without the quotes
- `--return 'distinct movie_id as node1, count(distinct aid) as award_count' `: return the movie ids and the count of awards
- `--order-by 'award_count desc'`: order the results descending by count of awards

In [9]:
%%time
# For every Rotten Tomatoes movie id, count the distinct award (P166) statements of the Wikidata item
# whose Rotten Tomatoes id (P1258) matches it, and write the counts to $TEMP/rt-awards.tsv.
# --as gives the two input files the aliases `movies` and `claims` that the --match clause refers to.
!$kypher -i $TEMP/rt-movies.tsv --as movies -i $CLAIMS --as claims \
--match 'movies: (movie_id)-[]->(), claims: (movie)-[:P1258]->(rt_id), claims: (movie)-[aid:P166]->(award)' \
--where 'kgtk_unstringify(rt_id) = movie_id' \
--return 'distinct movie_id as node1, count(distinct aid) as award_count' \
--order-by 'award_count desc' \
-o $TEMP/rt-awards.tsv

CPU times: user 479 ms, sys: 142 ms, total: 622 ms
Wall time: 31.6 s


Look at the file we got

In [10]:
# Line, word and byte counts of the result; the line count includes the header row.
!wc $TEMP/rt-awards.tsv

    1855    3710   39062 /Users/pedroszekely/Downloads/kypher/temp.wikidata_os_v5/rt-awards.tsv


In [11]:
# First rows of the result; the query ordered them by award_count, highest first.
!head $TEMP/rt-awards.tsv

node1	award_count
m/1121720-devdas	45
m/queen	44
m/lost_in_translation	43
m/barfi	34
m/piano	31
m/titanic	30
m/veer_zaara	25
m/silver_linings_playbook	24
m/chak_de_india	24


Add the award counts to the Kaggle file by 

- doing a left `join` on the `node1` column 
- `compact`ing the file to remove duplicate rows

In [12]:
# Left-join the award counts onto the movie table on node1, so movies without a count are kept,
# then compact on node1 and write the result to $TEMP/rt-movies.awards.tsv.
!$kgtk join  --mode NONE --left-file $TEMP/rt-movies.tsv --right-file $TEMP/rt-awards.tsv --left-join \
  --left-file-join-columns node1 --right-file-join-columns node1 \
/ compact --mode NONE --columns node1 \
-o $TEMP/rt-movies.awards.tsv

        3.64 real         4.19 user         0.48 sys


Load the result in pandas; the `award_count` column we added is the last column.

In [13]:
# Read the joined table and list the movies with the most awards first.
(
    pd.read_csv(os.environ['TEMP'] + "/rt-movies.awards.tsv", sep="\t")
    .sort_values(by='award_count', ascending=False)
)

Unnamed: 0,node1,movie_title,movie_info,critics_consensus,content_rating,genres,directors,authors,actors,original_release_date,...,tomatometer_status,tomatometer_rating,tomatometer_count,audience_status,audience_rating,audience_count,tomatometer_top_critics_count,tomatometer_fresh_critics_count,tomatometer_rotten_critics_count,award_count
1413,m/1121720-devdas,Devdas,Devdas (Shahrukh Khan) makes his way back home...,,PG,"Art House & International, Drama, Musical & Pe...",Sanjay Leela Bhansali,"Sanjay Leela Bhansali, Prakash Kapadia","Shah Rukh Khan, Aishwarya Rai Bachchan, Madhur...",2002-07-12,...,Fresh,89.0,19.0,Upright,88.0,18638.0,4,17,2,45.0
12180,m/queen,The Queen,Following the death of Princess Diana in an au...,"Full of wit, humor, and pathos, Stephen Frears...",PG-13,Drama,Stephen Frears,Peter Morgan,"Helen Mirren, Michael Sheen, James Cromwell, S...",2006-10-06,...,Certified-Fresh,96.0,198.0,Upright,76.0,175607.0,51,191,7,44.0
9747,m/lost_in_translation,Lost In Translation,"A lonely, aging movie star named Bob Harris (B...","Effectively balancing humor and subtle pathos,...",R,"Comedy, Drama, Romance",Sofia Coppola,Sofia Coppola,"Bill Murray, Scarlett Johansson, Giovanni Ribi...",2003-09-26,...,Certified-Fresh,95.0,233.0,Upright,85.0,337401.0,46,221,12,43.0
3193,m/barfi,Barfi!,"Shruti loves Barfi, a hearing and speech-impai...",,NR,"Action & Adventure, Comedy, Drama",,,Haradhan Bandopadhyay,2012-09-14,...,Fresh,86.0,14.0,Upright,86.0,4905.0,5,12,2,34.0
11816,m/piano,The Piano,"After a long voyage from Scotland, pianist Ada...","Powered by Holly Hunter\'s main performance, T...",R,"Art House & International, Drama, Romance",Jane Campion,Jane Campion,"Holly Hunter, Harvey Keitel, Anna Paquin, Sam ...",1993-11-19,...,Certified-Fresh,92.0,61.0,Upright,86.0,48469.0,15,56,5,31.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17705,m/zoolander_2,Zoolander 2,Former models Derek Zoolander (Ben Stiller) an...,Zoolander No. 2 has more celebrity cameos than...,PG-13,Comedy,Ben Stiller,"Nicholas Stoller, John Hamburg, Ben Stiller, J...","Ben Stiller, Owen Wilson, Will Ferrell, Penelo...",2016-02-12,...,Rotten,22.0,232.0,Spilled,20.0,41745.0,48,51,181,
17706,m/zoom_2006,Zoom,"Capt. Zoom, or Jack (Tim Allen), as he is now ...",Lacking the punch and good cheer of The Incred...,PG,"Action & Adventure, Comedy, Kids & Family",Peter Hewitt,"Adam Rifkin, David Berenbaum","Tim Allen, Courteney Cox, Chevy Chase, Spencer...",2006-08-11,...,Rotten,4.0,68.0,Spilled,33.0,11369.0,19,3,65,
17707,m/zoot_suit,Zoot Suit,Mexican-American gangster Henry Reyna (Daniel ...,,R,"Drama, Musical & Performing Arts",Luis Valdez,Luis Valdez,"Daniel Valdez, Edward James Olmos, Charles Aid...",1981-10-02,...,Rotten,56.0,9.0,Upright,74.0,1195.0,2,5,4,
17710,m/zulu,Zulu,"In 1879, the Zulu nation hands colonial Britis...",Zulu patiently establishes a cast of colorful ...,PG,"Classics, Drama","Cy Endfield, Cyril Endfield","Cy Endfield, John Prebble","Stanley Baker, Jack Hawkins, Ulla Jacobsson, J...",1964-06-17,...,Fresh,96.0,23.0,Upright,91.0,30193.0,6,22,1,


### Query Wikidata again to add the count of awards of the cast of the movies
The query is similar to the previous one, except that we chain from the movies to the cast and from the cast to the awards.

In [14]:
%%time
# Same join as the movie-award query, but follow cast member (P161) links from each movie and
# count the distinct award (P166) statements of the cast; results go to $TEMP/rt-cast-awards.tsv.
# `movies` and `claims` are the aliases the earlier award-count query declared with --as.
# NOTE(review): this presumably relies on those aliases being remembered in the graph cache, so run that query first - confirm.
!$kypher -i movies -i claims \
--match 'movies: (movie_id)-[]->(), claims: (movie)-[:P1258]->(rt_id), claims: (movie)-[:P161]->(cast)-[cid:P166]->(award)' \
--where 'kgtk_unstringify(rt_id) = movie_id' \
--return 'distinct movie_id as node1, count(distinct cid) as cast_award_count' \
--order-by 'cast_award_count desc' \
-o $TEMP/rt-cast-awards.tsv

CPU times: user 10.2 s, sys: 2.93 s, total: 13.1 s
Wall time: 10min 40s


Repeat the join/compact command to add the `cast_award_count` column to the file

In [15]:
# Left-join the cast award counts onto the table that already carries award_count,
# then compact on node1 and write the result to $TEMP/rt-movies.awards.movies.cast.tsv.
!$kgtk join --mode NONE --left-file $TEMP/rt-movies.awards.tsv --right-file $TEMP/rt-cast-awards.tsv --left-join \
  --left-file-join-columns node1 --right-file-join-columns node1 \
/ compact --mode NONE --columns node1 \
-o $TEMP/rt-movies.awards.movies.cast.tsv

        4.18 real         4.57 user         0.55 sys


In [16]:
# Load the fully augmented table; it now carries both award_count and cast_award_count.
df = pd.read_csv(
    os.environ['TEMP'] + "/rt-movies.awards.movies.cast.tsv",
    sep="\t",
)
# Display it with the most-awarded movies first.
df.sort_values(by='award_count', ascending=False)

Unnamed: 0,node1,movie_title,movie_info,critics_consensus,content_rating,genres,directors,authors,actors,original_release_date,...,tomatometer_rating,tomatometer_count,audience_status,audience_rating,audience_count,tomatometer_top_critics_count,tomatometer_fresh_critics_count,tomatometer_rotten_critics_count,award_count,cast_award_count
1413,m/1121720-devdas,Devdas,Devdas (Shahrukh Khan) makes his way back home...,,PG,"Art House & International, Drama, Musical & Pe...",Sanjay Leela Bhansali,"Sanjay Leela Bhansali, Prakash Kapadia","Shah Rukh Khan, Aishwarya Rai Bachchan, Madhur...",2002-07-12,...,89.0,19.0,Upright,88.0,18638.0,4,17,2,45.0,11.0
12180,m/queen,The Queen,Following the death of Princess Diana in an au...,"Full of wit, humor, and pathos, Stephen Frears...",PG-13,Drama,Stephen Frears,Peter Morgan,"Helen Mirren, Michael Sheen, James Cromwell, S...",2006-10-06,...,96.0,198.0,Upright,76.0,175607.0,51,191,7,44.0,21.0
9747,m/lost_in_translation,Lost In Translation,"A lonely, aging movie star named Bob Harris (B...","Effectively balancing humor and subtle pathos,...",R,"Comedy, Drama, Romance",Sofia Coppola,Sofia Coppola,"Bill Murray, Scarlett Johansson, Giovanni Ribi...",2003-09-26,...,95.0,233.0,Upright,85.0,337401.0,46,221,12,43.0,12.0
3193,m/barfi,Barfi!,"Shruti loves Barfi, a hearing and speech-impai...",,NR,"Action & Adventure, Comedy, Drama",,,Haradhan Bandopadhyay,2012-09-14,...,86.0,14.0,Upright,86.0,4905.0,5,12,2,34.0,34.0
11816,m/piano,The Piano,"After a long voyage from Scotland, pianist Ada...","Powered by Holly Hunter\'s main performance, T...",R,"Art House & International, Drama, Romance",Jane Campion,Jane Campion,"Holly Hunter, Harvey Keitel, Anna Paquin, Sam ...",1993-11-19,...,92.0,61.0,Upright,86.0,48469.0,15,56,5,31.0,25.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17705,m/zoolander_2,Zoolander 2,Former models Derek Zoolander (Ben Stiller) an...,Zoolander No. 2 has more celebrity cameos than...,PG-13,Comedy,Ben Stiller,"Nicholas Stoller, John Hamburg, Ben Stiller, J...","Ben Stiller, Owen Wilson, Will Ferrell, Penelo...",2016-02-12,...,22.0,232.0,Spilled,20.0,41745.0,48,51,181,,96.0
17706,m/zoom_2006,Zoom,"Capt. Zoom, or Jack (Tim Allen), as he is now ...",Lacking the punch and good cheer of The Incred...,PG,"Action & Adventure, Comedy, Kids & Family",Peter Hewitt,"Adam Rifkin, David Berenbaum","Tim Allen, Courteney Cox, Chevy Chase, Spencer...",2006-08-11,...,4.0,68.0,Spilled,33.0,11369.0,19,3,65,,
17707,m/zoot_suit,Zoot Suit,Mexican-American gangster Henry Reyna (Daniel ...,,R,"Drama, Musical & Performing Arts",Luis Valdez,Luis Valdez,"Daniel Valdez, Edward James Olmos, Charles Aid...",1981-10-02,...,56.0,9.0,Upright,74.0,1195.0,2,5,4,,4.0
17710,m/zulu,Zulu,"In 1879, the Zulu nation hands colonial Britis...",Zulu patiently establishes a cast of colorful ...,PG,"Classics, Drama","Cy Endfield, Cyril Endfield","Cy Endfield, John Prebble","Stanley Baker, Jack Hawkins, Ulla Jacobsson, J...",1964-06-17,...,96.0,23.0,Upright,91.0,30193.0,6,22,1,,33.0
