In [1]:
import pandas as pd
import numpy as np

## 0. import data

In [3]:
# Load the IMDb movie table and preview its first rows.
# The "imbd" spelling is the file name this notebook reads; keep it as is.
IMDB_CSV = "imbd_movies_with_rating.csv"
imdb = pd.read_csv(IMDB_CSV)
imdb.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres,avrg_rating,num_votes
0,tt0016906,movie,Frivolinas,Frivolinas,0,2014,80.0,"Comedy,Musical",5.6,14.0
1,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,118.0,"Comedy,Fantasy,Romance",6.4,75863.0
2,tt0064322,movie,The Woman with the Knife,La femme au couteau,0,2010,80.0,"Drama,Thriller",6.6,9.0
3,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,122.0,Drama,6.9,5218.0
4,tt0069204,movie,Sabse Bada Sukh,Sabse Bada Sukh,0,2018,,"Comedy,Drama",6.1,14.0


In [5]:
# Load the box-office table (title, year, worldwide and domestic gross)
# and preview its first rows.
BOXOFFICE_CSV = "box_offices_mojo.csv"
boxoffice = pd.read_csv(BOXOFFICE_CSV)
boxoffice.head()

Unnamed: 0.1,Unnamed: 0,Title,Year,Worldwide,Domestic
0,0,Mission: Impossible II,2000,546388105,215409889.0
1,1,Gladiator,2000,460583960,187705427.0
2,2,Cast Away,2000,429632142,233632142.0
3,3,What Women Want,2000,374111707,182811707.0
4,4,Dinosaur,2000,349822765,137748063.0


## 1. clean and rename

In [6]:
# Drop the leftover "Unnamed: 0" column (apparently a row index that was
# saved into the CSV). Rebinding instead of inplace=True, together with
# errors="ignore", lets this cell be re-run without a KeyError once the
# column is already gone.
boxoffice = boxoffice.drop(columns="Unnamed: 0", errors="ignore")

In [7]:
# Rename by explicit mapping rather than by position, so a different column
# order (or an extra column) in the CSV cannot silently mislabel the data.
# Names that are already renamed are left untouched, so re-running is safe.
boxoffice = boxoffice.rename(
    columns={
        "Title": "title",
        "Year": "year",
        "Worldwide": "worldwide",
        "Domestic": "domestic",
    }
)

In [8]:
# Clean and rename the IMDb columns.
# - originalTitle and titleType are not used further and are removed;
#   errors="ignore" keeps the cell re-runnable after they are gone.
# - Renaming is done by name instead of by position, so the result does not
#   depend on the column order of the CSV.
imdb = imdb.drop(columns=["originalTitle", "titleType"], errors="ignore").rename(
    columns={
        "tconst": "imdb_id",
        "primaryTitle": "title",
        "startYear": "year",
        "runtimeMinutes": "minutes",
        "avrg_rating": "imdb_av_rating",
        "num_votes": "imdb_num_votes",
    }
)

## 2. check for different movies with the same title

### 2.1 IMDB

In [9]:
# Number of IMDb rows whose title is shared with at least one other row
imdb_title_counts = imdb["title"].value_counts()
imdb_title_counts[imdb_title_counts > 1].sum()

30603

In [10]:
# Build a lower-cased "<year> <title>" key for every IMDb row
imdb_year_key = imdb["year"].astype(str).str.lower()
imdb_title_key = imdb["title"].astype(str).str.lower()
imdb["year_title"] = imdb_year_key + " " + imdb_title_key

# Keys that occur more than once are ambiguous: remove every row carrying one
imdb_key_counts = imdb["year_title"].value_counts()
duplicate_year_title = imdb_key_counts[imdb_key_counts > 1].index.tolist()
imdb = imdb.loc[~imdb["year_title"].isin(duplicate_year_title)]

# Number of distinct ambiguous keys that were removed
len(duplicate_year_title)

2607

### 2.2 box office

In [11]:
# Number of box-office rows whose title is shared with at least one other row
boxoffice_title_counts = boxoffice["title"].value_counts()
boxoffice_title_counts[boxoffice_title_counts > 1].sum()

392

In [12]:
# Build the same lower-cased "<year> <title>" key on the box-office rows
boxoffice_year_key = boxoffice["year"].astype(str).str.lower()
boxoffice_title_key = boxoffice["title"].astype(str).str.lower()
boxoffice["year_title"] = boxoffice_year_key + " " + boxoffice_title_key

In [13]:
# Remove every box-office row whose year/title key occurs more than once.
# The year_title column is kept on purpose: it is the key of the merge below.
boxoffice_key_counts = boxoffice["year_title"].value_counts()
duplicate_year_title = boxoffice_key_counts[boxoffice_key_counts > 1].index.tolist()
boxoffice = boxoffice.loc[~boxoffice["year_title"].isin(duplicate_year_title)]

In [14]:
# Number of distinct year/title keys collected in duplicate_year_title
# (at this point it holds the box-office duplicates from the previous cell)
len(duplicate_year_title)

6

In [15]:
# Reset the indices after the duplicate filtering.
# The old index is kept as an "index" column (no drop=True); after the merge
# these show up as index_x / index_y and are dropped further down.
imdb = imdb.reset_index()
boxoffice = boxoffice.reset_index()

## 3. merging imdb and boxoffice

In [16]:
# Inner-join IMDb and box-office rows on the shared year/title key.
# Both frames were stripped of duplicated year_title keys above, so the join
# must be one-to-one; validate= turns any violation into a MergeError instead
# of silently multiplying rows.
# The box-office copies of title/year are redundant and dropped, and the
# IMDb copies get their plain names back.
imdb_box = (
    imdb.merge(boxoffice, on="year_title", validate="one_to_one")
    .drop(columns=["title_y", "year_y"])
    .rename(columns={"title_x": "title", "year_x": "year"})
)
# Optional checkpoint (disabled):
# imdb_box.drop(columns="year_title").to_csv("imdb_box_merge.csv")

## 4. merging with tmdb (api)

In [17]:
# Load the TMDb table; the preview below shows it carries imdb_id plus
# tmdb_rating, tmdb_votes and popularity.
TMDB_CSV = "tmdb.csv"
tmdb = pd.read_csv(TMDB_CSV)

In [18]:
# Preview the first rows of the TMDb table
tmdb.head()

Unnamed: 0.1,Unnamed: 0,imdb_id,tmdb_rating,tmdb_votes,popularity
0,0,tt0035423,6.2,806,10.179
1,1,tt0113026,5.6,17,2.73
2,2,tt0116748,0.0,0,0.6
3,3,tt0118589,3.7,66,4.855
4,4,tt0118694,8.1,964,13.184


In [19]:
# Inner join on imdb_id: only movies present in both frames are kept
imdb_tmdb_box = pd.merge(imdb_box, tmdb, how="inner", on="imdb_id")

In [21]:
# Preview the merged frame; the helper columns (index_x, index_y, year_title,
# "Unnamed: 0") are still present at this point
imdb_tmdb_box.head()

Unnamed: 0.1,index_x,imdb_id,title,isAdult,year,minutes,genres,imdb_av_rating,imdb_num_votes,year_title,index_y,worldwide,domestic,Unnamed: 0,tmdb_rating,tmdb_votes,popularity
0,1,tt0035423,Kate & Leopold,0,2001,118.0,"Comedy,Fantasy,Romance",6.4,75863.0,2001 kate & leopold,432,76019048,47121859.0,0,6.2,806,10.179
1,17,tt0113026,The Fantasticks,0,2000,86.0,"Musical,Romance",5.6,1086.0,2000 the fantasticks,334,49666,49666.0,1,5.6,17,2.73
2,27,tt0116748,Karobaar: The Business of Love,0,2000,180.0,"Drama,Romance",4.5,211.0,2000 karobaar: the business of love,337,45200,45200.0,2,0.0,0,0.6
3,36,tt0118589,Glitter,0,2001,104.0,"Drama,Music,Romance",2.2,21033.0,2001 glitter,562,5271666,4274407.0,3,3.7,66,4.855
4,38,tt0118694,In the Mood for Love,0,2000,98.0,"Drama,Romance",8.1,111412.0,2000 in the mood for love,138,12854953,2738980.0,4,8.1,964,13.184


In [24]:
# Remove the helper columns that are not part of the final data set: the two
# carried-over indices, the year/title join key and the "Unnamed: 0" column
# that came in with tmdb.csv.
# errors="ignore" keeps the cell re-runnable; without it a second run raises
# KeyError because the columns are already gone.
imdb_tmdb_box = imdb_tmdb_box.drop(
    columns=["index_x", "year_title", "index_y", "Unnamed: 0"],
    errors="ignore",
)

In [26]:
# Display the final merged frame (pandas truncates the middle rows when rendering)
imdb_tmdb_box

Unnamed: 0,imdb_id,title,isAdult,year,minutes,genres,imdb_av_rating,imdb_num_votes,worldwide,domestic,tmdb_rating,tmdb_votes,popularity
0,tt0035423,Kate & Leopold,0,2001,118.0,"Comedy,Fantasy,Romance",6.4,75863.0,76019048,47121859.0,6.2,806,10.179
1,tt0113026,The Fantasticks,0,2000,86.0,"Musical,Romance",5.6,1086.0,49666,49666.0,5.6,17,2.730
2,tt0116748,Karobaar: The Business of Love,0,2000,180.0,"Drama,Romance",4.5,211.0,45200,45200.0,0.0,0,0.600
3,tt0118589,Glitter,0,2001,104.0,"Drama,Music,Romance",2.2,21033.0,5271666,4274407.0,3.7,66,4.855
4,tt0118694,In the Mood for Love,0,2000,98.0,"Drama,Romance",8.1,111412.0,12854953,2738980.0,8.1,964,13.184
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7963,tt9826484,Trinity Seven The Movie 2: Heavens Library & C...,0,2019,63.0,"Action,Animation,Comedy",6.8,64.0,15712,15712.0,7.1,5,2.478
7964,tt9844368,Out of Liberty,0,2019,111.0,"Drama,Western",4.5,213.0,265279,265279.0,10.0,1,6.427
7965,tt9845398,End of the Century,0,2019,84.0,Drama,7.5,526.0,64880,64880.0,6.1,6,1.348
7966,tt9896916,Pilgrim's Progress,0,2019,108.0,"Adventure,Animation,Family",5.2,294.0,3173282,1294596.0,7.0,3,2.522


In [27]:
# Write the merged data set to disk.
# index=True is the pandas default, spelled out here: the row index is saved
# as an unnamed first column, so readers get an "Unnamed: 0" column unless
# they load the file with index_col=0.
imdb_tmdb_box.to_csv("imdb_tmdb_box.csv", index=True)