This notebook assigns each titles to a unique region of origin.

In [68]:
import importlib
import src.imdb_views as imdb_views
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

_ = importlib.reload(imdb_views)

In [2]:
reader = imdb_views.IMDBDataReader("../../PublicDatasets/IMDB/")

In [3]:
ratings = reader.get_ratings()
title_basics = reader.get_title_basics()

# take intersection of titles with ratings and basic information to reduce the size of the dataset
titles_with_ratings_and_basic_info = \
    ratings[['tconst']].drop_duplicates()\
    .merge(title_basics[['tconst']].drop_duplicates(), on='tconst', how='inner')
ratings = ratings.merge(titles_with_ratings_and_basic_info, on='tconst', how='inner')
title_basics = title_basics.merge(titles_with_ratings_and_basic_info, on='tconst', how='inner')

title_regions = reader.get_title_regions(titles_to_fetch=titles_with_ratings_and_basic_info)

# keep only titles with regions
titles_w_ratings_basic_info_and_regions = \
    titles_with_ratings_and_basic_info\
    .merge(title_regions[['tconst']].drop_duplicates(), on='tconst', how='inner')

ratings = ratings.merge(titles_w_ratings_basic_info_and_regions, on='tconst', how='inner')
title_basics = title_basics.merge(titles_w_ratings_basic_info_and_regions, on='tconst', how='inner')
title_regions = title_regions.merge(titles_w_ratings_basic_info_and_regions, on='tconst', how='inner')

title_to_names = reader.get_title_to_names(titles_to_fetch=titles_w_ratings_basic_info_and_regions)

Time taken to read ratings data: 0.6358175277709961 seconds
Time taken to read title basics data: 17.683849334716797 seconds
Time taken to read title regions data: 66.46048855781555 seconds
Time taken to compute the final title to names dataframe: 1.8792967875798543 mins


In [4]:
# naive method: assign each name to region of origin by picking
# the region containing most votes for that name
# then assign each title to region of origin by picking
# region containing most votes for the names of the title

In [27]:
name_to_region_of_origin = title_to_names\
    .merge(title_basics[['tconst', 'titleType']], on='tconst')\
    .merge(ratings[['tconst', 'numVotes']], on='tconst', how='inner')\
    .merge(title_regions.query('language == "ALL"')[['tconst', 'region']], on='tconst', how='inner')\
    .groupby(['titleType', 'primaryName', 'region'])\
    .agg({'numVotes': 'sum', 'tconst': 'count'})\
    .reset_index()\
    .rename(columns={'tconst': 'numTitles'})
    
name_to_region_of_origin['RegionRankPerName'] = \
    name_to_region_of_origin.groupby(['titleType', 'primaryName'])['numVotes'].rank(method='dense', ascending=False)

name_to_region_of_origin['TotalNumVotes'] = \
    name_to_region_of_origin.groupby(['titleType', 'primaryName'])['numVotes'].transform('sum')

name_to_region_of_origin = name_to_region_of_origin.query('RegionRankPerName == 1')

In [28]:
# what fraction of names have inconclusive region of origin at this point?
# i.e., they have more than 1 regions tied in terms of number of votes

name_to_num_regions_of_origin = name_to_region_of_origin\
    .groupby(['titleType', 'primaryName'])\
    .agg({'region': 'count', 'TotalNumVotes': 'sum'})\
    .reset_index()\
    .rename(columns={'region': 'numRegionsOfOrigin'})

stats = name_to_num_regions_of_origin\
    .groupby(['titleType', 'numRegionsOfOrigin'])\
    .agg({'primaryName': 'count', 'TotalNumVotes': 'sum'})\
    .reset_index()\
    .rename(columns={'primaryName': 'numNames'})

stats['PercentageByCount'] = 100 * stats['numNames'] / stats.groupby('titleType')['numNames'].transform('sum')
stats['PercentageByVotes'] = 100 * stats['TotalNumVotes'] / stats.groupby('titleType')['TotalNumVotes'].transform('sum')

stats


Unnamed: 0,titleType,numRegionsOfOrigin,numNames,TotalNumVotes,PercentageByCount,PercentageByVotes
0,movie,1,357416,46863805902,50.623488,26.581733
1,movie,2,173198,29245120016,24.531322,16.588196
2,movie,3,69014,12145261785,9.774966,6.888943
3,movie,4,40198,12129870564,5.693542,6.880213
4,movie,5,27111,11094158140,3.839933,6.292744
5,movie,6,20379,15910553616,2.886429,9.024664
6,movie,7,18712,48912021333,2.65032,27.743506
7,tvSeries,1,126200,4891346274,64.681793,17.812048
8,tvSeries,2,28876,1199751788,14.799932,4.368948
9,tvSeries,3,12775,1300489344,6.547622,4.735788


In [30]:
# what are some big names who can't be assigned to a particular region?

big_names = name_to_num_regions_of_origin.query('numRegionsOfOrigin > 1')
big_names['Rank'] = big_names.groupby('titleType')['TotalNumVotes'].rank(method='dense', ascending=False)
big_names = big_names.query('Rank <= 10')

big_names.sort_values(by=['titleType', 'Rank'])


Unnamed: 0,titleType,primaryName,numRegionsOfOrigin,TotalNumVotes,Rank
125266,movie,Christopher Nolan,7,790761461,1.0
152817,movie,David Fincher,7,442664432,2.0
119614,movie,Chris Hemsworth,6,438654798,3.0
653019,movie,Tom Cruise,5,425783725,4.0
624106,movie,Steven Spielberg,4,406357624,5.0
553746,movie,Ridley Scott,6,371050212,6.0
386152,movie,Leonardo DiCaprio,3,369096153,7.0
72753,movie,Ben Affleck,5,357921835,8.0
596142,movie,Seth Rogen,6,347334312,9.0
674462,movie,Vin Diesel,6,343233660,10.0


In [31]:
# okay. So these are extremely popular artists whose movies pretty much
# always get released in more than 1 region

In [32]:
# well, our original problem is to assign titles to region of origin
# and not names to region of origin. May be we keep soft labels at name level
# and then see if we have enough discrimatory power at title level 
# when we aggregate them by names associated with the title

In [38]:
name_to_region_of_origin_soft_labels = title_to_names\
    .merge(title_basics[['tconst', 'titleType']], on='tconst')\
    .merge(ratings[['tconst', 'numVotes']], on='tconst', how='inner')\
    .merge(title_regions.query('language == "ALL"')[['tconst', 'region']], on='tconst', how='inner')\
    .groupby(['titleType', 'primaryName', 'category', 'region'])\
    .agg({'numVotes': 'sum', 'tconst': 'count'})\
    .reset_index()\
    .rename(columns={'tconst': 'numTitles'})\
    .sort_values(by=['titleType', 'primaryName', 'category', 'numVotes'], ascending=False)

In [39]:
name_to_region_of_origin_soft_labels.query('primaryName == "Leonardo DiCaprio"')

Unnamed: 0,titleType,primaryName,category,region,numVotes,numTitles
2595237,tvSeries,Leonardo DiCaprio,actor,Australia,16719,2
2595238,tvSeries,Leonardo DiCaprio,actor,Canada,16719,2
2595241,tvSeries,Leonardo DiCaprio,actor,Japan,16719,2
2595242,tvSeries,Leonardo DiCaprio,actor,UK,16719,2
2595243,tvSeries,Leonardo DiCaprio,actor,US,16719,2
2595239,tvSeries,Leonardo DiCaprio,actor,France,16323,1
2595240,tvSeries,Leonardo DiCaprio,actor,India,16323,1
1253469,movie,Leonardo DiCaprio,actor,Canada,17585811,33
1253473,movie,Leonardo DiCaprio,actor,UK,17585811,33
1253474,movie,Leonardo DiCaprio,actor,US,17585811,33


In [40]:
name_to_region_of_origin_soft_labels.query('primaryName == "Aamir Khan"')

Unnamed: 0,titleType,primaryName,category,region,numVotes,numTitles
2573,movie,Aamir Khan,director,Australia,206478,1
2574,movie,Aamir Khan,director,Canada,206478,1
2575,movie,Aamir Khan,director,France,206478,1
2576,movie,Aamir Khan,director,India,206478,1
2577,movie,Aamir Khan,director,Japan,206478,1
2578,movie,Aamir Khan,director,UK,206478,1
2579,movie,Aamir Khan,director,US,206478,1
2569,movie,Aamir Khan,actor,India,2089899,50
2572,movie,Aamir Khan,actor,US,2088929,49
2571,movie,Aamir Khan,actor,UK,2054418,35


In [42]:
title_to_region_of_origin_soft_labels = title_to_names\
    .merge(title_basics[['tconst', 'titleType']], on='tconst')\
    .merge(name_to_region_of_origin_soft_labels, on=['titleType', 'primaryName', 'category'], how='inner')\
    .groupby(['titleType', 'tconst', 'region'])\
    .agg({'numVotes': 'sum'})\
    .reset_index()\
    .merge(title_basics[['tconst', 'primaryTitle', 'startYear', 'genres']], on='tconst')\
    .sort_values(by=['titleType', 'tconst', 'numVotes'], ascending=False)

In [44]:
title_to_region_of_origin_soft_labels.query('primaryTitle == "3 Idiots"')

Unnamed: 0,titleType,tconst,region,numVotes,primaryTitle,startYear,genres
1172207,movie,tt3685624,US,1250078,3 Idiots,2017,Comedy
1172206,movie,tt3685624,UK,1239491,3 Idiots,2017,Comedy
1172202,movie,tt3685624,Canada,1225339,3 Idiots,2017,Comedy
1172201,movie,tt3685624,Australia,1219611,3 Idiots,2017,Comedy
1172204,movie,tt3685624,India,1210170,3 Idiots,2017,Comedy
1172203,movie,tt3685624,France,1059099,3 Idiots,2017,Comedy
1172205,movie,tt3685624,Japan,1057484,3 Idiots,2017,Comedy
776279,movie,tt1187043,India,10544508,3 Idiots,2009,"Comedy,Drama"
776281,movie,tt1187043,UK,10361197,3 Idiots,2009,"Comedy,Drama"
776282,movie,tt1187043,US,10300775,3 Idiots,2009,"Comedy,Drama"


In [45]:
title_to_region_of_origin_soft_labels.query('primaryTitle == "Inception"')

Unnamed: 0,titleType,tconst,region,numVotes,primaryTitle,startYear,genres
836440,movie,tt1375666,US,92177919,Inception,2010,"Action,Adventure,Sci-Fi"
836439,movie,tt1375666,UK,92176116,Inception,2010,"Action,Adventure,Sci-Fi"
836435,movie,tt1375666,Canada,92132678,Inception,2010,"Action,Adventure,Sci-Fi"
836438,movie,tt1375666,Japan,92066306,Inception,2010,"Action,Adventure,Sci-Fi"
836434,movie,tt1375666,Australia,92026924,Inception,2010,"Action,Adventure,Sci-Fi"
836436,movie,tt1375666,France,91986868,Inception,2010,"Action,Adventure,Sci-Fi"
836437,movie,tt1375666,India,91209580,Inception,2010,"Action,Adventure,Sci-Fi"


In [48]:
title_to_region_of_origin_soft_labels.query('tconst == "tt4154756"')

Unnamed: 0,titleType,tconst,region,numVotes,primaryTitle,startYear,genres
1198042,movie,tt4154756,US,108321403,Avengers: Infinity War,2018,"Action,Adventure,Sci-Fi"
1198041,movie,tt4154756,UK,108319498,Avengers: Infinity War,2018,"Action,Adventure,Sci-Fi"
1198037,movie,tt4154756,Canada,108299919,Avengers: Infinity War,2018,"Action,Adventure,Sci-Fi"
1198036,movie,tt4154756,Australia,108294133,Avengers: Infinity War,2018,"Action,Adventure,Sci-Fi"
1198040,movie,tt4154756,Japan,108239333,Avengers: Infinity War,2018,"Action,Adventure,Sci-Fi"
1198038,movie,tt4154756,France,108191549,Avengers: Infinity War,2018,"Action,Adventure,Sci-Fi"
1198039,movie,tt4154756,India,107085333,Avengers: Infinity War,2018,"Action,Adventure,Sci-Fi"


In [49]:
title_to_region_of_origin_soft_labels.query('tconst == "tt0112870"')

Unnamed: 0,titleType,tconst,region,numVotes,primaryTitle,startYear,genres
348037,movie,tt0112870,India,8481913,Dilwale Dulhania Le Jayenge,1995,"Drama,Romance"
348040,movie,tt0112870,US,8120542,Dilwale Dulhania Le Jayenge,1995,"Drama,Romance"
348039,movie,tt0112870,UK,8054169,Dilwale Dulhania Le Jayenge,1995,"Drama,Romance"
348035,movie,tt0112870,Canada,7622454,Dilwale Dulhania Le Jayenge,1995,"Drama,Romance"
348034,movie,tt0112870,Australia,7027477,Dilwale Dulhania Le Jayenge,1995,"Drama,Romance"
348038,movie,tt0112870,Japan,6187528,Dilwale Dulhania Le Jayenge,1995,"Drama,Romance"
348036,movie,tt0112870,France,5958008,Dilwale Dulhania Le Jayenge,1995,"Drama,Romance"


In [50]:
title_to_region_of_origin_soft_labels.query('tconst == "tt6470478"')

Unnamed: 0,titleType,tconst,region,numVotes,primaryTitle,startYear,genres
1671089,tvSeries,tt6470478,US,1982446,The Good Doctor,2017,Drama
1671084,tvSeries,tt6470478,Canada,1980742,The Good Doctor,2017,Drama
1671083,tvSeries,tt6470478,Australia,1979389,The Good Doctor,2017,Drama
1671088,tvSeries,tt6470478,UK,1977032,The Good Doctor,2017,Drama
1671087,tvSeries,tt6470478,Japan,1972324,The Good Doctor,2017,Drama
1671086,tvSeries,tt6470478,India,1962196,The Good Doctor,2017,Drama
1671085,tvSeries,tt6470478,France,1961234,The Good Doctor,2017,Drama


In [51]:
title_to_region_of_origin_soft_labels.query('tconst == "tt0988824"')

Unnamed: 0,titleType,tconst,region,numVotes,primaryTitle,startYear,genres
1500914,tvSeries,tt0988824,US,3550555,Naruto: Shippuden,2007,"Action,Adventure,Animation"
1500912,tvSeries,tt0988824,Japan,3538257,Naruto: Shippuden,2007,"Action,Adventure,Animation"
1500909,tvSeries,tt0988824,Canada,3505570,Naruto: Shippuden,2007,"Action,Adventure,Animation"
1500913,tvSeries,tt0988824,UK,3488499,Naruto: Shippuden,2007,"Action,Adventure,Animation"
1500911,tvSeries,tt0988824,India,3486013,Naruto: Shippuden,2007,"Action,Adventure,Animation"
1500908,tvSeries,tt0988824,Australia,3444041,Naruto: Shippuden,2007,"Action,Adventure,Animation"
1500910,tvSeries,tt0988824,France,3435617,Naruto: Shippuden,2007,"Action,Adventure,Animation"


In [52]:
title_to_region_of_origin_soft_labels.query('tconst == "tt2098220"')

Unnamed: 0,titleType,tconst,region,numVotes,primaryTitle,startYear,genres
1586032,tvSeries,tt2098220,Japan,2398355,Hunter x Hunter,2011,"Action,Adventure,Animation"
1586034,tvSeries,tt2098220,US,2375216,Hunter x Hunter,2011,"Action,Adventure,Animation"
1586031,tvSeries,tt2098220,India,2348794,Hunter x Hunter,2011,"Action,Adventure,Animation"
1586033,tvSeries,tt2098220,UK,2334319,Hunter x Hunter,2011,"Action,Adventure,Animation"
1586029,tvSeries,tt2098220,Canada,2333899,Hunter x Hunter,2011,"Action,Adventure,Animation"
1586028,tvSeries,tt2098220,Australia,2257774,Hunter x Hunter,2011,"Action,Adventure,Animation"
1586030,tvSeries,tt2098220,France,2221810,Hunter x Hunter,2011,"Action,Adventure,Animation"


In [53]:
title_to_region_of_origin_soft_labels.query('tconst == "tt2560140"')

Unnamed: 0,titleType,tconst,region,numVotes,primaryTitle,startYear,genres
1606451,tvSeries,tt2560140,Japan,6411019,Attack on Titan,2013,"Action,Adventure,Animation"
1606453,tvSeries,tt2560140,US,6385352,Attack on Titan,2013,"Action,Adventure,Animation"
1606450,tvSeries,tt2560140,India,6355637,Attack on Titan,2013,"Action,Adventure,Animation"
1606452,tvSeries,tt2560140,UK,6330105,Attack on Titan,2013,"Action,Adventure,Animation"
1606448,tvSeries,tt2560140,Canada,6318066,Attack on Titan,2013,"Action,Adventure,Animation"
1606449,tvSeries,tt2560140,France,6216899,Attack on Titan,2013,"Action,Adventure,Animation"
1606447,tvSeries,tt2560140,Australia,6187606,Attack on Titan,2013,"Action,Adventure,Animation"


In [54]:
title_to_region_of_origin_soft_labels.query('tconst == "tt0386676"') # the office US

Unnamed: 0,titleType,tconst,region,numVotes,primaryTitle,startYear,genres
1478234,tvSeries,tt0386676,US,7542002,The Office,2005,Comedy
1478229,tvSeries,tt0386676,Canada,7539751,The Office,2005,Comedy
1478228,tvSeries,tt0386676,Australia,7537647,The Office,2005,Comedy
1478233,tvSeries,tt0386676,UK,7533215,The Office,2005,Comedy
1478231,tvSeries,tt0386676,India,7531986,The Office,2005,Comedy
1478230,tvSeries,tt0386676,France,7518991,The Office,2005,Comedy
1478232,tvSeries,tt0386676,Japan,7512407,The Office,2005,Comedy


In [55]:
title_to_region_of_origin_soft_labels.query('tconst == "tt0290978"') # the office UK

Unnamed: 0,titleType,tconst,region,numVotes,primaryTitle,startYear,genres
1466177,tvSeries,tt0290978,UK,3202907,The Office,2001,"Comedy,Drama"
1466178,tvSeries,tt0290978,US,3200675,The Office,2001,"Comedy,Drama"
1466173,tvSeries,tt0290978,Canada,3199899,The Office,2001,"Comedy,Drama"
1466172,tvSeries,tt0290978,Australia,3199306,The Office,2001,"Comedy,Drama"
1466176,tvSeries,tt0290978,Japan,3194956,The Office,2001,"Comedy,Drama"
1466174,tvSeries,tt0290978,France,3191432,The Office,2001,"Comedy,Drama"
1466175,tvSeries,tt0290978,India,3189278,The Office,2001,"Comedy,Drama"


In [56]:
title_to_region_of_origin_soft_labels.query('tconst == "tt0290988"')

Unnamed: 0,titleType,tconst,region,numVotes,primaryTitle,startYear,genres
1466212,tvSeries,tt0290988,Canada,589542,Trailer Park Boys,2001,"Comedy,Crime"
1466217,tvSeries,tt0290988,US,589286,Trailer Park Boys,2001,"Comedy,Crime"
1466211,tvSeries,tt0290988,Australia,585180,Trailer Park Boys,2001,"Comedy,Crime"
1466213,tvSeries,tt0290988,France,584512,Trailer Park Boys,2001,"Comedy,Crime"
1466215,tvSeries,tt0290988,Japan,576850,Trailer Park Boys,2001,"Comedy,Crime"
1466216,tvSeries,tt0290988,UK,576850,Trailer Park Boys,2001,"Comedy,Crime"
1466214,tvSeries,tt0290988,India,546409,Trailer Park Boys,2001,"Comedy,Crime"


In [57]:
title_to_region_of_origin_soft_labels.query('tconst == "tt5687612"')

Unnamed: 0,titleType,tconst,region,numVotes,primaryTitle,startYear,genres
1660610,tvSeries,tt5687612,UK,3887578,Fleabag,2016,"Comedy,Drama"
1660611,tvSeries,tt5687612,US,3883937,Fleabag,2016,"Comedy,Drama"
1660606,tvSeries,tt5687612,Canada,3883467,Fleabag,2016,"Comedy,Drama"
1660605,tvSeries,tt5687612,Australia,3882265,Fleabag,2016,"Comedy,Drama"
1660609,tvSeries,tt5687612,Japan,3863674,Fleabag,2016,"Comedy,Drama"
1660608,tvSeries,tt5687612,India,3826713,Fleabag,2016,"Comedy,Drama"
1660607,tvSeries,tt5687612,France,3798608,Fleabag,2016,"Comedy,Drama"


In [59]:
# okay! This seems to be working well! What fraction of titles are still inconclusive at this point?

title_to_region_of_origin_hard_labels = title_to_region_of_origin_soft_labels.copy()

title_to_region_of_origin_hard_labels['RegionRankPerTitle'] = \
    title_to_region_of_origin_hard_labels.groupby(['titleType', 'tconst'])['numVotes'].rank(method='dense', ascending=False)

title_to_region_of_origin_hard_labels = title_to_region_of_origin_hard_labels.query('RegionRankPerTitle == 1')

In [60]:
title_to_region_of_origin_hard_labels.query('tconst == "tt5687612"')

Unnamed: 0,titleType,tconst,region,numVotes,primaryTitle,startYear,genres,RegionRankPerTitle
1660610,tvSeries,tt5687612,UK,3887578,Fleabag,2016,"Comedy,Drama",1.0


In [62]:
title_to_num_regions_of_origin = title_to_region_of_origin_hard_labels\
    .groupby(['titleType', 'tconst', 'primaryTitle', 'startYear', 'genres'])\
    .agg({'region': 'count'})\
    .reset_index()\
    .rename(columns={'region': 'numRegionsOfOrigin'})\
    .merge(ratings[['tconst', 'numVotes']], on='tconst')

stats = title_to_num_regions_of_origin\
    .groupby(['titleType', 'numRegionsOfOrigin'])\
    .agg({'tconst': 'count', 'numVotes': 'sum'})\
    .reset_index()\
    .rename(columns={'tconst': 'numTitles'})

stats['PercentageByCount'] = 100 * stats['numTitles'] / stats.groupby('titleType')['numTitles'].transform('sum')
stats['PercentageByVotes'] = 100 * stats['numVotes'] / stats.groupby('titleType')['numVotes'].transform('sum')

stats

Unnamed: 0,titleType,numRegionsOfOrigin,numTitles,numVotes,PercentageByCount,PercentageByVotes
0,movie,1,218257,1102053655,94.998847,99.192739
1,movie,2,8465,5144102,3.684488,0.463006
2,movie,3,1746,1571727,0.759966,0.141467
3,movie,4,701,632734,0.305118,0.056951
4,movie,5,325,564412,0.14146,0.050801
5,movie,6,149,371522,0.064854,0.03344
6,movie,7,104,684355,0.045267,0.061597
7,tvSeries,1,49790,136955635,95.503894,98.334164
8,tvSeries,2,1459,1278344,2.798558,0.917851
9,tvSeries,3,434,245275,0.83247,0.176107


In [63]:
# Nice! Almost all movies and shows are assigned a unique region of origin
# now top titles per region of origin should vary across regions. Lets
# see if thats the case

In [70]:
region_of_origin_per_title = title_to_region_of_origin_hard_labels\
    [['tconst', 'region']]\
    .rename(columns = {'region': 'RegionOfOrigin'})

top_titles_per_region = title_basics\
    .merge(region_of_origin_per_title, on='tconst')\
    .merge(ratings, on='tconst')
    
top_titles_per_region['Rank'] = top_titles_per_region\
    .groupby(['RegionOfOrigin', 'titleType'])['numVotes']\
    .rank(method='dense', ascending=False)

top_titles_per_region = top_titles_per_region.query('Rank <= 20')

with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(top_titles_per_region.sort_values(by=['RegionOfOrigin', 'titleType', 'Rank']))

Unnamed: 0,tconst,titleType,primaryTitle,isAdult,startYear,runtimeMinutes,genres,RegionOfOrigin,averageRating,numVotes,Rank
200155,tt2119532,movie,Hacksaw Ridge,0,2016,139,"Biography,Drama,History",Australia,8.1,592820,1.0
209995,tt2321549,movie,The Babadook,0,2014,94,"Drama,Horror,Mystery",Australia,6.8,246902,2.0
154871,tt13345606,movie,Evil Dead Rise,0,2023,96,Horror,Australia,6.5,136089,3.0
111770,tt0455824,movie,Australia,0,2008,165,"Adventure,Drama,Romance",Australia,6.6,129687,4.0
138462,tt11464826,movie,The Social Dilemma,0,2020,94,"Documentary,Drama",Australia,7.6,89351,5.0
168688,tt14992922,movie,The Tinder Swindler,0,2022,114,"Crime,Documentary",Australia,7.1,72893,6.0
200641,tt2125608,movie,Searching for Sugar Man,0,2012,86,"Biography,Documentary,Music",Australia,8.2,72842,7.0
176387,tt1587707,movie,Exit Through the Gift Shop,0,2010,87,"Comedy,Crime,Documentary",Australia,7.9,68778,8.0
118667,tt0815241,movie,Religulous,0,2008,101,"Comedy,Documentary,War",Australia,7.6,60227,9.0
273482,tt6333060,movie,Icarus,0,2017,120,"Documentary,Sport",Australia,7.9,54630,10.0


In [72]:
# Yay! Results are looking good. At least for India and Japan, and to some extent france and uk too
# They still look weird for Australia and Canada but its hard to say. May be they are good too

# save the region of origin labels for later use
region_of_origin_per_title\
    .to_csv('data/region_of_origin_per_title.tsv', index=False, sep='\t')