In [171]:
import pandas as pd
import numpy as np


## 0. import data

In [172]:
# import imbd data
imdb = pd.read_csv("data/imbd_movies_with_rating.csv")
imdb.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres,avrg_rating,num_votes
0,tt0016906,movie,Frivolinas,Frivolinas,0,2014,80.0,"Comedy,Musical",5.6,14.0
1,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,118.0,"Comedy,Fantasy,Romance",6.4,75863.0
2,tt0064322,movie,The Woman with the Knife,La femme au couteau,0,2010,80.0,"Drama,Thriller",6.6,9.0
3,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,122.0,Drama,6.9,5218.0
4,tt0069204,movie,Sabse Bada Sukh,Sabse Bada Sukh,0,2018,,"Comedy,Drama",6.1,14.0


In [173]:
#import boxoffice data
boxoffice = pd.read_csv("data/box_offices_mojo.csv")

In [174]:
boxoffice.head()

Unnamed: 0.1,Unnamed: 0,Title,Year,Worldwide,Domestic
0,0,Mission: Impossible II,2000,546388105,215409889.0
1,1,Gladiator,2000,460583960,187705427.0
2,2,Cast Away,2000,429632142,233632142.0
3,3,What Women Want,2000,374111707,182811707.0
4,4,Dinosaur,2000,349822765,137748063.0


## 1. clean and rename

In [175]:
# delete unnamed column
boxoffice.drop(columns = "Unnamed: 0",inplace=True)

In [176]:
# rename columns
boxoffice.columns = ["title","year","worldwide","domestic"]

In [177]:
# clean and rename imdb
imdb.drop(columns = "originalTitle",inplace=True)
imdb.drop(columns = "titleType",inplace=True)
imdb.columns = ["imdb_id","title","isAdult","year","minutes","genres","imdb_av_rating","imdb_num_votes"]

In [178]:
imdb

Unnamed: 0,imdb_id,title,isAdult,year,minutes,genres,imdb_av_rating,imdb_num_votes
0,tt0016906,Frivolinas,0,2014,80.0,"Comedy,Musical",5.6,14.0
1,tt0035423,Kate & Leopold,0,2001,118.0,"Comedy,Fantasy,Romance",6.4,75863.0
2,tt0064322,The Woman with the Knife,0,2010,80.0,"Drama,Thriller",6.6,9.0
3,tt0069049,The Other Side of the Wind,0,2018,122.0,Drama,6.9,5218.0
4,tt0069204,Sabse Bada Sukh,0,2018,,"Comedy,Drama",6.1,14.0
...,...,...,...,...,...,...,...,...
226571,tt9916622,Rodolpho Teóphilo - O Legado de um Pioneiro,0,2015,57.0,Documentary,,
226572,tt9916680,De la ilusión al desconcierto: cine colombiano...,0,2007,100.0,Documentary,,
226573,tt9916706,Dankyavar Danka,0,2013,,Comedy,,
226574,tt9916730,6 Gunn,0,2017,116.0,\N,,


## 2. check for different movies with same title

### 2.1 IMDB

In [179]:
# title duplicates
sum(imdb["title"].value_counts()[imdb["title"].value_counts() > 1])

30603

In [180]:
# duplicates for tile and year  
imdb["year_title"] = imdb["year"].astype(str).str.lower() + " " + imdb["title"].astype(str).str.lower()

# drop_duplicates
duplicate_year_title = list(imdb["year_title"].value_counts()[imdb["year_title"].value_counts() > 1].index)
imdb = imdb[~imdb["year_title"].isin(duplicate_year_title)]

len(duplicate_year_title)


2607

### 2.2 box office

In [183]:
# title duplicates
sum(boxoffice["title"].value_counts()[boxoffice["title"].value_counts() > 1])

392

In [184]:
# duplicates for tile and year  
boxoffice["year_title"] = boxoffice["year"].astype(str).str.lower() + " " + boxoffice["title"].astype(str).str.lower()

In [185]:
duplicate_year_title = list(boxoffice["year_title"].value_counts()[boxoffice["year_title"].value_counts() > 1].index)
boxoffice = boxoffice[~boxoffice["year_title"].isin(duplicate_year_title)]
#boxoffice.drop(columns = "year_title",inplace=True)

In [186]:
len(duplicate_year_title)

6

In [187]:
# restet indices
imdb.reset_index()
boxoffice.reset_index()

Unnamed: 0,index,title,year,worldwide,domestic,year_title
0,0,Mission: Impossible II,2000,546388105,215409889.0,2000 mission: impossible ii
1,1,Gladiator,2000,460583960,187705427.0,2000 gladiator
2,2,Cast Away,2000,429632142,233632142.0,2000 cast away
3,3,What Women Want,2000,374111707,182811707.0,2000 what women want
4,4,Dinosaur,2000,349822765,137748063.0,2000 dinosaur
...,...,...,...,...,...,...
12482,12494,Jimi Hendrix Electric Church,2019,1818,,2019 jimi hendrix electric church
12483,12495,Game Day,2019,1624,1624.0,2019 game day
12484,12496,The Hours and Times,2019,1273,1273.0,2019 the hours and times
12485,12497,The Untold Story,2019,790,790.0,2019 the untold story


## 3. merging imdb and boxoffice

In [188]:
# merge
imdb_box = imdb.merge(boxoffice,on="year_title")
# rename and drop columns
imdb_box.drop(columns=["title_y","year_y"],inplace=True)
imdb_box.rename(columns = {"title_x":"title","year_x":"year"},inplace=True)
# save to csv
imdb_box.drop(columns = "year_title").to_csv("imdb_box_merge.csv")

In [189]:
imdb_box.drop(columns=["title_y","year_y"],inplace=True)

In [190]:
imdb_box.rename(columns = {"title_x":"title","year_x":"year"},inplace=True)

In [191]:
imdb_box.drop(columns = "year_title").to_csv("imdb_box_merge.csv")

8058

8058

## trying with fuzzy search

In [153]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [192]:
# get list of year-title from boxoffice that could not be merged
titles_in_merge = list(imdb_box["title"].values)
box_titles_notin_merge = list(boxoffice["title"][~boxoffice["title"].isin(titles_in_merge)].values)

In [196]:
test_df = pd.DataFrame(columns = ["box","sug_imdb_1","sug_imdb_2","sug_imdb_3"])
h_count = 1
for title in box_titles_notin_merge:
    suggestion = process.extract(title, imdb["title"], scorer=fuzz.token_sort_ratio, limit=3)
    test_df = test_df.append({"box": title
                    ,"sug_imdb_1" : suggestion[0][0]
                    ,"sug_imdb_2"  : suggestion[1][0]
                    ,"sug_imdb_3"  : suggestion[2][0]},ignore_index=True)
    h_count += 1
    if h_count > 10:
        break

In [198]:
process.extract("pokemon", imdb["title"], scorer=fuzz.ratio, limit=3)

[('Goemon', 77, 41797), ('Spoken', 77, 124345), ('Fukemon', 71, 61335)]

In [197]:
# with fuzz.ration
test_df

Unnamed: 0,box,sug_imdb_1,sug_imdb_2,sug_imdb_3
0,The Beach,The Breach,The Breach,The Bench
1,Pokémon the Movie 2000,Pokémon 3: The Movie,K-On! The Movie,Preto The Movie 2
2,The Exorcist 2000 Director's Cut,The Emerging Past Director's Cut,The Director's Cut,The Director's Cut
3,Fantasia 2000,Fantasia 2,Fantasia,Fantasia
4,The Ninth Gate,The Ninth Day,Into the Game,In the Game
5,Cirque du Soleil: Journey of Man,Cirque du Soleil: O,Cirque du Soleil: The Mystery of Mystere,Cirque du Soleil: Hatching
6,CyberWorld,BearWorld,Cyberon,Cyber Wars
7,Solarmax,Solar,Alarma!,Soorma
8,Godzilla 2000,Godzilla 2001,Godzilla,Dracula 2000
9,Gossip,Ghost Ship,Gloss,Yossi


In [194]:
# token_sort_ratio
test_df

Unnamed: 0,box,sug_imdb
0,The Beach,The Breach
1,Pokémon the Movie 2000,Pokémon 3: The Movie
2,The Exorcist 2000 Director's Cut,The Emerging Past Director's Cut
3,Fantasia 2000,Fantasia 2
4,The Ninth Gate,The Ninth Day
5,Cirque du Soleil: Journey of Man,Cirque du Soleil: O
6,CyberWorld,BearWorld
7,Solarmax,Solar
8,Godzilla 2000,Godzilla 2001
9,Gossip,Ghost Ship


0                                                Frivolinas
1                                            Kate & Leopold
2                                  The Woman with the Knife
3                                The Other Side of the Wind
4                                           Sabse Bada Sukh
                                ...                        
226571          Rodolpho Teóphilo - O Legado de um Pioneiro
226572    De la ilusión al desconcierto: cine colombiano...
226573                                      Dankyavar Danka
226574                                               6 Gunn
226575                       Chico Albuquerque - Revelações
Name: title, Length: 221106, dtype: object

In [32]:
process.extract("Pokémon the Movie 2000", imdb["title"], scorer=fuzz.ratio, limit=10)

[('Pokémon 3: The Movie', 81, 2074),
 ('Preto The Movie 2', 72, 182864),
 ('Good Humor: The Movie 2001', 71, 10023),
 ('K-On! The Movie', 70, 88018),
 ('Momok: The Movie', 68, 69027),
 ('Konbai the Movie', 68, 126068),
 ('Pokémon the Movie: I Choose You!', 68, 189659),
 ('Pokémon Heroes', 67, 11310),
 ('Boo! The Movie', 67, 16879),
 ('One: The Movie', 67, 26473)]

In [41]:
process.extract("Pokémon the Movie 2000", imdb["title"], scorer=fuzz.partial_ratio, limit=10)

[('O', 100, 609),
 ('H', 100, 7496),
 ('K', 100, 10185),
 ('P', 100, 21509),
 ('E', 100, 32549),
 ('VI', 100, 35219),
 ('Mo', 100, 35709),
 ('Movie', 100, 39203),
 ('M', 100, 47528),
 ('2', 100, 52253)]

In [42]:
imdb.loc[609]

imdb_id                        tt0184791
title                                  O
isAdult                                0
year                                2001
minutes                               95
genres            Drama,Romance,Thriller
imdb_av_rating                       6.1
imdb_num_votes                     18774
year_title                         2001o
Name: 609, dtype: object

In [107]:
process.extract("Pokémon the Movie 2000", imdb["title"], scorer=fuzz.token_sort_ratio, limit=1)[0][0]

'Pokémon 3: The Movie'

In [105]:
process.extract("Pokémon the Movie 2000", imdb["title"], scorer=fuzz.token_set_ratio, limit=10)

[('Movie', 100, 39203),
 ('The the the', 100, 70457),
 ('Movie', 100, 85848),
 ('Movie', 100, 142779),
 ('Movie', 100, 160257),
 ('The...', 100, 178885),
 ('Pokémon 3: The Movie', 94, 2074),
 ('W the Movie', 90, 64685),
 ('The U Movie', 90, 75744),
 ('The 4 Movie', 90, 125917)]

In [37]:
imdb.loc[imdb["title"] == "Pokémon 3: The Movie"]

Unnamed: 0,imdb_id,title,isAdult,year,minutes,genres,imdb_av_rating,imdb_num_votes,year_title
2074,tt0235679,Pokémon 3: The Movie,0,2000,93.0,"Action,Adventure,Animation",5.8,12230.0,2000pokémon 3: the movie


In [40]:
imdb.loc[2074]

imdb_id                            tt0235679
title                   Pokémon 3: The Movie
isAdult                                    0
year                                    2000
minutes                                   93
genres            Action,Adventure,Animation
imdb_av_rating                           5.8
imdb_num_votes                         12230
year_title          2000pokémon 3: the movie
Name: 2074, dtype: object

In [181]:
titles_in_merge = list(imdb_box["title_x"].values)

In [182]:
boxoffice["title"][~boxoffice["title"].isin(titles_in_merge)]

31                 Pokémon the Movie 2000
38       The Exorcist 2000 Director's Cut
50                          Fantasia 2000
70                         The Ninth Gate
107      Cirque du Soleil: Journey of Man
                       ...               
12490       I Lost Albert 2019 Re-release
12493                           Jihadists
12494        Jimi Hendrix Electric Church
12495                            Game Day
12496                 The Hours and Times
Name: title, Length: 4145, dtype: object

In [187]:
imdb[["title","year"]][imdb["title"].str.contains("Pok")]

Unnamed: 0,title,year
287,Four Dogs Playing Poker,2000
2074,Pokémon 3: The Movie,2000
3615,Poklonnik,2001
5878,Pokémon 4Ever,2001
5992,Poker,2001
...,...,...
193001,Pokemon Deo Mu-bi XY&Z Bolkenion: Gigyewangguk...,2016
204508,Poka Messiah,2016
209537,Pokémon the Movie: The Power of Us,2018
216990,Prema Pokiri,2007


## 3. merging imdb and boxoffice

In [171]:
imdb_box = imdb.merge(boxoffice,on="year_title")

In [172]:
imdb_box 

Unnamed: 0,imdb_id,title_x,isAdult,year_x,minutes,genres,imdb_av_rating,imdb_num_votes,year_title,title_y,year_y,worldwide,domestic
0,tt0035423,Kate & Leopold,0,2001,118.0,"Comedy,Fantasy,Romance",6.4,75863.0,2001kate & leopold,Kate & Leopold,2001,76019048,47121859.0
1,tt0113026,The Fantasticks,0,2000,86.0,"Musical,Romance",5.6,1086.0,2000the fantasticks,The Fantasticks,2000,49666,49666.0
2,tt0116748,Karobaar: The Business of Love,0,2000,180.0,"Drama,Romance",4.5,211.0,2000karobaar: the business of love,Karobaar: The Business of Love,2000,45200,45200.0
3,tt0118589,Glitter,0,2001,104.0,"Drama,Music,Romance",2.2,21033.0,2001glitter,Glitter,2001,5271666,4274407.0
4,tt0118694,In the Mood for Love,0,2000,98.0,"Drama,Romance",8.1,111412.0,2000in the mood for love,In the Mood for Love,2000,12854953,2738980.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8505,tt9844368,Out of Liberty,0,2019,111.0,"Drama,Western",4.5,213.0,2019out of liberty,Out of Liberty,2019,265279,265279.0
8506,tt9845398,End of the Century,0,2019,84.0,Drama,7.5,526.0,2019end of the century,End of the Century,2019,64880,64880.0
8507,tt9856680,Puffs: Filmed Live Off Broadway,0,2018,118.0,"Adventure,Comedy",8.6,17.0,2018puffs: filmed live off broadway,Puffs: Filmed Live Off Broadway,2018,464483,464483.0
8508,tt9896916,Pilgrim's Progress,0,2019,108.0,"Adventure,Animation,Family",5.2,294.0,2019pilgrim's progress,Pilgrim's Progress,2019,3173282,1294596.0


In [16]:
imbd[imbd["primaryTitle"]=="Homecoming"]

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres,avrg_rating,num_votes
14280,tt0378057,movie,Homecoming,Homecoming,0,2004,,Documentary,8.6,18.0
15852,tt0392121,movie,Homecoming,Homecoming,0,2003,100.0,Drama,5.3,16.0
18291,tt0418735,movie,Homecoming,Homecoming,0,2005,109.0,Drama,7.4,36.0
20175,tt0436399,movie,Homecoming,Homecoming,0,2004,104.0,Drama,6.9,44.0
46910,tt1093366,movie,Homecoming,Homecoming,0,2005,90.0,Drama,5.4,5.0
50747,tt11289634,movie,Homecoming,Homecoming,0,2019,,Drama,,
51438,tt1135500,movie,Homecoming,Homecoming,0,2009,88.0,"Drama,Horror,Sport",5.3,4054.0
74360,tt1618376,movie,Homecoming,Homecoming,0,2011,,"Comedy,Drama",7.2,42.0
80360,tt1753913,movie,Homecoming,Hômukamingu,0,2011,93.0,Comedy,6.0,5.0
81824,tt1779070,movie,Homecoming,Homecoming,0,2010,46.0,"Comedy,Drama,Family",,


In [33]:
duplicate_titles = list(imbd["primaryTitle"].value_counts()[imbd["primaryTitle"].value_counts() > 1].index)

In [37]:
imbd_no_dup_titles = imbd[~imbd["primaryTitle"].isin(duplicate_titles)].copy()

In [42]:
imbd_no_dup_titles["primaryTitle"].value_counts()

195973

In [39]:
imbd

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres,avrg_rating,num_votes
0,tt0016906,movie,Frivolinas,Frivolinas,0,2014,80.0,"Comedy,Musical",5.6,14.0
1,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,118.0,"Comedy,Fantasy,Romance",6.4,75863.0
2,tt0064322,movie,The Woman with the Knife,La femme au couteau,0,2010,80.0,"Drama,Thriller",6.6,9.0
3,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,122.0,Drama,6.9,5218.0
4,tt0069204,movie,Sabse Bada Sukh,Sabse Bada Sukh,0,2018,,"Comedy,Drama",6.1,14.0
...,...,...,...,...,...,...,...,...,...,...
226571,tt9916622,movie,Rodolpho Teóphilo - O Legado de um Pioneiro,Rodolpho Teóphilo - O Legado de um Pioneiro,0,2015,57.0,Documentary,,
226572,tt9916680,movie,De la ilusión al desconcierto: cine colombiano...,De la ilusión al desconcierto: cine colombiano...,0,2007,100.0,Documentary,,
226573,tt9916706,movie,Dankyavar Danka,Dankyavar Danka,0,2013,,Comedy,,
226574,tt9916730,movie,6 Gunn,6 Gunn,0,2017,116.0,\N,,


In [47]:
imdb_box_merge= imbd_no_dup_titles.merge(boxoffice,left_on="primaryTitle",right_on="Title")

In [45]:
sum(boxoffice["Title"].value_counts()>1)

195

In [48]:
sum(imdb_box_merge["Title"].value_counts()>1)

10

In [50]:
sum(~imdb_box_merge["avrg_rating"].isna())

8953

In [158]:
imdb_box_merge

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres,avrg_rating,num_votes,Title,Year,Worldwide,Domestic
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,118.0,"Comedy,Fantasy,Romance",6.4,75863.0,Kate & Leopold,2001,76019048,47121859.0
1,tt0100275,movie,The Wandering Soap Opera,La Telenovela Errante,0,2017,80.0,"Comedy,Drama,Fantasy",6.6,209.0,The Wandering Soap Opera,2019,3624,3624.0
2,tt0113026,movie,The Fantasticks,The Fantasticks,0,2000,86.0,"Musical,Romance",5.6,1086.0,The Fantasticks,2000,49666,49666.0
3,tt0116748,movie,Karobaar: The Business of Love,Karobaar: The Business of Love,0,2000,180.0,"Drama,Romance",4.5,211.0,Karobaar: The Business of Love,2000,45200,45200.0
4,tt0118589,movie,Glitter,Glitter,0,2001,104.0,"Drama,Music,Romance",2.2,21033.0,Glitter,2001,5271666,4274407.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9001,tt9825006,movie,Avant qu'on explose,Avant qu'on explose,0,2019,108.0,Comedy,6.6,80.0,Avant qu'on explose,2019,119894,119894.0
9002,tt9826484,movie,Trinity Seven The Movie 2: Heavens Library & C...,Trinity Seven: Heavens Library & Crimson Lord,0,2019,63.0,"Action,Animation,Comedy",6.8,64.0,Trinity Seven The Movie 2: Heavens Library & C...,2019,15712,15712.0
9003,tt9844368,movie,Out of Liberty,Out of Liberty,0,2019,111.0,"Drama,Western",4.5,213.0,Out of Liberty,2019,265279,265279.0
9004,tt9856680,movie,Puffs: Filmed Live Off Broadway,Puffs: Filmed Live Off Broadway,0,2018,118.0,"Adventure,Comedy",8.6,17.0,Puffs: Filmed Live Off Broadway,2018,464483,464483.0
