# Improving Merge

Merging IMDB Core with ticket sales data has to rely on title and year match. 
After we fixed issues with movies being released in the same year under the same title, we faced a new one. 
<br><br>
There are numerous titles with slightly different names. 
Roughly 1k per data set don't match. 
<br><br>
Next goal is to analyze the mismatch and correct that. 

### 1. Importing packages and data from sql

In [89]:
import pandas as pd
import numpy as np
import psycopg2 as psycopg2
import sql_functions as sqlf

In [90]:
schema = "capstone_24_4_group1"
schema

'capstone_24_4_group1'

In [91]:
imdb_query = f'''   SELECT *
                    FROM {schema}."IMDB_data"
                    '''

eu_query = f'''   SELECT *
                    FROM {schema}."movie_data_EU"
                    '''

na_query = f'''   SELECT *
                    FROM {schema}."movie_data_NA"
                    '''

In [92]:
imdb_df = sqlf.get_dataframe(imdb_query)
display(imdb_df.head())
imdb_df.shape

Unnamed: 0,tconst,primary_title,original_title,year,runtime,num_votes,average_rating,genres_count,genre,genre2,...,acting5,directors_count,director_name,director2_name,director3_name,writers_count,writer_name,writer2_name,writer3_name,is_adult
0,tt0013274,Istoriya grazhdanskoy voyny,Istoriya grazhdanskoy voyny,2021,94,73.0,6.7,1.0,Documentary,,...,,2.0,Nikolai Izvolov,Dziga Vertov,,,,,,0
1,tt0015414,La tierra de los toros,La tierra de los toros,2000,60,17.0,5.4,,,,...,,1.0,Musidora,,,,,,,0
2,tt0035423,Kate & Leopold,Kate & Leopold,2001,118,89944.0,6.4,3.0,Comedy,Fantasy,...,Natasha Lyonne,1.0,James Mangold,,,2.0,Steven Rogers,James Mangold,,0
3,tt0062336,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,2020,70,190.0,6.5,1.0,Drama,,...,Luis Vilches,2.0,Raúl Ruiz,Valeria Sarmiento,,2.0,Raúl Ruiz,Omar Saavedra Santis,,0
4,tt0069049,The Other Side of the Wind,The Other Side of the Wind,2018,122,8143.0,6.7,1.0,Drama,,...,Norman Foster,1.0,Orson Welles,,,2.0,Orson Welles,Oja Kodar,,0


(185273, 25)

In [93]:
eu_df = sqlf.get_dataframe(eu_query)
display(eu_df.head())
eu_df.shape

Unnamed: 0,title,producing_country,year,tickets_sold_since_1996,tickets_sold
0,(500) Days of Summer,US,2009,1713086,1684771
1,(Nie)znajomi,PL,2019,685075,684833
2,(T)Raumschiff Surprise - Periode 1,DE,2004,10763531,10731881
3,1 1/2 Ritter - Auf der Suche nach der hinreiße...,DE,2008,1986168,1986168
4,1 chance sur 2,FR,1998,1295620,1238175


(4956, 5)

In [94]:
na_df = sqlf.get_dataframe(na_query)
display(na_df.head())
na_df.shape

Unnamed: 0,title,release_date,distributor,gross_sales,tickets_sold,release_year
0,(500) Days of Summer,2009-08-07,Fox Searchlight,32425665,4323422,2009
1,10 Cloverfield Lane,2016-03-11,Paramount Pictures,72082999,8333294,2016
2,10 Things I Hate About You,1999-03-31,Walt Disney,38177966,7515347,1999
3,"10,000 B.C.",2008-03-07,Warner Bros.,94784201,13201142,2008
4,101 Dalmatians,1996-11-27,Walt Disney,136189294,30691447,1996


(4965, 6)

## 2. Get not matching movies
### EU

In [95]:
eu_unmatched_df = pd.merge(eu_df,imdb_df, how="left", left_on=["title", "year"], right_on=["original_title", "year"])
eu_unmatched_df.shape

(4959, 29)

In [96]:
eu_unmatched_mask = eu_unmatched_df["tconst"].isnull()

In [97]:
eu_unmatched_df[eu_unmatched_mask]

Unnamed: 0,title,producing_country,year,tickets_sold_since_1996,tickets_sold,tconst,primary_title,original_title,runtime,num_votes,...,acting5,directors_count,director_name,director2_name,director3_name,writers_count,writer_name,writer2_name,writer3_name,is_adult
13,101 Dalmatians,US,1996,21623260,21532085,,,,,,...,,,,,,,,,,
20,13 Going On 30,US,2004,3579724,3566771,,,,,,...,,,,,,,,,,
22,15 Minutes (Fifteen Minutes),US,2001,2600419,2565397,,,,,,...,,,,,,,,,,
41,23,DE,1998,701787,693358,,,,,,...,,,,,,,,,,
47,28 Days Later,GB,2002,4252690,4066710,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4949,Çok filim hareketler bunlar,TR,2010,1223552,1223156,,,,,,...,,,,,,,,,,
4955,Účastníci zájezdu,CZ,2006,871439,830242,,,,,,...,,,,,,,,,,
4956,Śluby panieńskie,PL,2010,1001866,1000373,,,,,,...,,,,,,,,,,
4957,Świadectwo,PL,2008,1039901,1034911,,,,,,...,,,,,,,,,,


currently 922 unmatched rows

### Let's check how much lower case improves (add for EU and NA)

In [175]:
imdb_df["original_title_merge"] = imdb_df["original_title"].str.lower()
eu_df["title_merge"] = eu_df["title"].str.lower()
na_df["title_merge"] = na_df["title"].str.lower()
display(imdb_df.head())
display(eu_df.head())
display(na_df.head())

Unnamed: 0,tconst,primary_title,original_title,year,runtime,num_votes,average_rating,genres_count,genre,genre2,...,directors_count,director_name,director2_name,director3_name,writers_count,writer_name,writer2_name,writer3_name,is_adult,original_title_merge
0,tt0013274,Istoriya grazhdanskoy voyny,Istoriya grazhdanskoy voyny,2021,94,73.0,6.7,1.0,Documentary,,...,2.0,Nikolai Izvolov,Dziga Vertov,,,,,,0,istoriya grazhdanskoy voyny
1,tt0015414,La tierra de los toros,La tierra de los toros,2000,60,17.0,5.4,,,,...,1.0,Musidora,,,,,,,0,la tierra de los toros
2,tt0035423,Kate & Leopold,Kate & Leopold,2001,118,89944.0,6.4,3.0,Comedy,Fantasy,...,1.0,James Mangold,,,2.0,Steven Rogers,James Mangold,,0,kate & leopold
3,tt0062336,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,2020,70,190.0,6.5,1.0,Drama,,...,2.0,Raúl Ruiz,Valeria Sarmiento,,2.0,Raúl Ruiz,Omar Saavedra Santis,,0,el tango del viudo y su espejo deformante
4,tt0069049,The Other Side of the Wind,The Other Side of the Wind,2018,122,8143.0,6.7,1.0,Drama,,...,1.0,Orson Welles,,,2.0,Orson Welles,Oja Kodar,,0,the other side of the wind


Unnamed: 0,title,producing_country,year,tickets_sold_since_1996,tickets_sold,title_merge
0,(500) Days of Summer,US,2009,1713086,1684771,(500) days of summer
1,(Nie)znajomi,PL,2019,685075,684833,(nie)znajomi
2,(T)Raumschiff Surprise - Periode 1,DE,2004,10763531,10731881,(t)raumschiff surprise - periode 1
3,1 1/2 Ritter - Auf der Suche nach der hinreiße...,DE,2008,1986168,1986168,1 1/2 ritter - auf der suche nach der hinreiße...
4,1 chance sur 2,FR,1998,1295620,1238175,1 chance sur 2


Unnamed: 0,title,release_date,distributor,gross_sales,tickets_sold,release_year,title_merge
0,(500) Days of Summer,2009-08-07,Fox Searchlight,32425665,4323422,2009,(500) days of summer
1,10 Cloverfield Lane,2016-03-11,Paramount Pictures,72082999,8333294,2016,10 cloverfield lane
2,10 Things I Hate About You,1999-03-31,Walt Disney,38177966,7515347,1999,10 things i hate about you
3,"10,000 B.C.",2008-03-07,Warner Bros.,94784201,13201142,2008,"10,000 b.c."
4,102 Dalmatians,2000-11-22,Walt Disney,66941559,12343421,2000,102 dalmatians


In [99]:
eu_check_df = pd.merge(eu_df,imdb_df, how="left", left_on=["title_merge", "year"], right_on=["original_title_merge", "year"])

In [100]:
eu_check_mask = eu_check_df["tconst"].isnull()
eu_check_df[eu_check_mask]

Unnamed: 0,title,producing_country,year,tickets_sold_since_1996,tickets_sold,title_merge,tconst,primary_title,original_title,runtime,...,directors_count,director_name,director2_name,director3_name,writers_count,writer_name,writer2_name,writer3_name,is_adult,original_title_merge
13,101 Dalmatians,US,1996,21623260,21532085,101 dalmatians,,,,,...,,,,,,,,,,
22,15 Minutes (Fifteen Minutes),US,2001,2600419,2565397,15 minutes (fifteen minutes),,,,,...,,,,,,,,,,
41,23,DE,1998,701787,693358,23,,,,,...,,,,,,,,,,
47,28 Days Later,GB,2002,4252690,4066710,28 days later,,,,,...,,,,,,,,,,
59,"4 luni, 3 saptamani si 2 zile",RO,2007,1090696,736957,"4 luni, 3 saptamani si 2 zile",,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4930,[Rec] 2,ES,2009,1387671,1013091,[rec] 2,,,,,...,,,,,,,,,,
4955,Účastníci zájezdu,CZ,2006,871439,830242,účastníci zájezdu,,,,,...,,,,,,,,,,
4956,Śluby panieńskie,PL,2010,1001866,1000373,śluby panieńskie,,,,,...,,,,,,,,,,
4957,Świadectwo,PL,2008,1039901,1034911,świadectwo,,,,,...,,,,,,,,,,


From 922 to 788 unmatched rows
-> 134 less problems

### Check what's wrong with "101 Dalmatians"

In [101]:
imdb_df[imdb_df["tconst"] == "tt0115433"]

Unnamed: 0,tconst,primary_title,original_title,year,runtime,num_votes,average_rating,genres_count,genre,genre2,...,directors_count,director_name,director2_name,director3_name,writers_count,writer_name,writer2_name,writer3_name,is_adult,original_title_merge


hmm ... can't find the 101 ... movie ... why? 

In [102]:
imdb_df[imdb_df["original_title"].str.contains("101")].sort_values(by="original_title").head(15)

Unnamed: 0,tconst,primary_title,original_title,year,runtime,num_votes,average_rating,genres_count,genre,genre2,...,directors_count,director_name,director2_name,director3_name,writers_count,writer_name,writer2_name,writer3_name,is_adult,original_title_merge
133593,tt3668280,101 Chodhyangal?,101 Chodhyangal?,2013,107,113.0,7.0,2.0,Drama,Family,...,1.0,Sidhartha Siva,,,1.0,Sidhartha Siva,,,0,101 chodhyangal?
142949,tt4512212,101 Reasons: Liberty Lives in New Hampshire,101 Reasons: Liberty Lives in New Hampshire,2014,64,14.0,8.6,2.0,Documentary,News,...,1.0,Beau Davis,,,2.0,Beau Davis,Vince Perfetto,,0,101 reasons: liberty lives in new hampshire
7103,tt0252802,101 Rent Boys,101 Rent Boys,2000,78,384.0,6.5,1.0,Documentary,,...,2.0,Fenton Bailey,Randy Barbato,,,,,,0,101 rent boys
6084,tt0237993,101 Reykjavík,101 Reykjavík,2000,88,9967.0,6.8,3.0,Comedy,Drama,...,1.0,Baltasar Kormákur,,,4.0,Hallgrímur Helgason,Baltasar Kormákur,,0,101 reykjavík
67298,tt14358208,101 Reys,101 Reys,2020,110,20.0,7.8,1.0,Biography,,...,1.0,Akrom Shohnazarov,,,1.0,Akrom Shohnazarov,,,0,101 reys
182812,tt9429520,101 Seconds,101 Seconds,2018,81,29.0,6.2,1.0,Documentary,,...,1.0,Skye Fitzgerald,,,,,,,0,101 seconds
127625,tt3219396,101 Secrets,101 Secrets,2015,95,15.0,5.3,3.0,Adventure,Drama,...,1.0,Tophy Cho,,,1.0,Tophy Cho,,,0,101 secrets
6300,tt0241142,101 Ways (the Things a Girl Will Do to Keep He...,101 Ways (The Things a Girl Will Do to Keep He...,2000,100,162.0,5.2,1.0,Comedy,,...,1.0,Jennifer B. Katz,,,1.0,Jennifer B. Katz,,,0,101 ways (the things a girl will do to keep he...
112875,tt2545176,101 Weddings,101 Weddings,2012,145,235.0,4.7,3.0,Comedy,Drama,...,1.0,Shafi,,,2.0,Kalavoor Ravikumar,Shafi,,0,101 weddings
82210,tt1674766,101 Proposals,101 ci qiu hun,2013,120,526.0,5.4,1.0,Romance,,...,1.0,Leste Chen,,,3.0,Shinji Nojima,Peng Ren,Wei Zhang,0,101 ci qiu hun


### Check if we can find the tconst in the basic data-set

In [103]:
basic_df = pd.read_csv("Data/title.principals/title.basics.csv")
basic_df.shape

(11057208, 9)

In [104]:
basic_df[basic_df["tconst"] == "tt0115433"]

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
112776,tt0115433,movie,101 Dalmatians,101 Dalmatians,0.0,1996.0,\N,103,"Adventure,Comedy,Crime"


Is is in there ... that means we drop it unwanted at some stage during the filtering process <br><br>

First idea: Maybe we filter for year > 1996 instead of year >= 1996?

### Solution: 

It is the other way round. We decided to look at the last 25 years (1998-2023) and filtered the IMDB data accordingly. However, the EU and NA data starts at 1996. Those we never filtered for the appropriate date range ... 

Let's correct this ... 

### Filter EU and NA for Year >= 1998

In [105]:
eu_df[eu_df["year"] >= 1998].sort_values(by="year")

Unnamed: 0,title,producing_country,year,tickets_sold_since_1996,tickets_sold,title_merge
770,City of Angels,US,1998,8318763,8271916,city of angels
992,Desperate Measures,US,1998,1194818,1191026,desperate measures
3413,Rush Hour,US,1998,8120318,8007449,rush hour
2251,Lautrec,"FR, ES",1998,650948,560307,lautrec
4748,Virus,"US, GB, JP, DE, FR",1998,1719742,1715116,virus
...,...,...,...,...,...,...
97,A Haunting in Venice,US,2023,6170413,6170413,a haunting in venice
3858,Thanksgiving,"US, CA, AU",2023,1006977,1006977,thanksgiving
2967,"O psie, który jezdzil koleja",PL,2023,730994,730994,"o psie, który jezdzil koleja"
3828,Taylor Swift: The Eras Tour,US,2023,2120352,2120352,taylor swift: the eras tour


In [106]:
eu_df = eu_df[eu_df["year"] >= 1998].reset_index(drop=True)
eu_df

Unnamed: 0,title,producing_country,year,tickets_sold_since_1996,tickets_sold,title_merge
0,(500) Days of Summer,US,2009,1713086,1684771,(500) days of summer
1,(Nie)znajomi,PL,2019,685075,684833,(nie)znajomi
2,(T)Raumschiff Surprise - Periode 1,DE,2004,10763531,10731881,(t)raumschiff surprise - periode 1
3,1 1/2 Ritter - Auf der Suche nach der hinreiße...,DE,2008,1986168,1986168,1 1/2 ritter - auf der suche nach der hinreiße...
4,1 chance sur 2,FR,1998,1295620,1238175,1 chance sur 2
...,...,...,...,...,...,...
4536,Ölümlü Dünya 2,TR,2023,1476943,1476943,ölümlü dünya 2
4537,Účastníci zájezdu,CZ,2006,871439,830242,účastníci zájezdu
4538,Śluby panieńskie,PL,2010,1001866,1000373,śluby panieńskie
4539,Świadectwo,PL,2008,1039901,1034911,świadectwo


In [107]:
na_df[na_df["release_year"] >= 1998].sort_values(by="release_year")


Unnamed: 0,title,release_date,distributor,gross_sales,tickets_sold,release_year
2214,Little Voice,1998-12-04,Miramax,3714954,731290,1998
1688,Hilary and Jackie,1998-12-30,October Films,4739909,933052,1998
406,Babe: Pig in the City,1998-11-25,Universal,18319860,3870373,1998
4107,The Man in the Iron Mask,1998-03-13,MGM,56968169,12146731,1998
989,Deep Impact,1998-05-08,Paramount Pictures,140464664,29949821,1998
...,...,...,...,...,...,...
2714,Oppenheimer,2023-07-21,Universal,326101370,30250590,2023
4758,Waitress: The Musical,2023-12-07,Bleecker Street,5402148,501126,2023
593,Blue Beetle,2023-08-18,Warner Bros.,72541501,6729267,2023
4116,The Marvels,2023-11-10,Walt Disney,84479155,7836656,2023


In [108]:
na_df = na_df[na_df["release_year"] >= 1998].reset_index(drop=True)
na_df

Unnamed: 0,title,release_date,distributor,gross_sales,tickets_sold,release_year
0,(500) Days of Summer,2009-08-07,Fox Searchlight,32425665,4323422,2009
1,10 Cloverfield Lane,2016-03-11,Paramount Pictures,72082999,8333294,2016
2,10 Things I Hate About You,1999-03-31,Walt Disney,38177966,7515347,1999
3,"10,000 B.C.",2008-03-07,Warner Bros.,94784201,13201142,2008
4,102 Dalmatians,2000-11-22,Walt Disney,66941559,12343421,2000
...,...,...,...,...,...,...
4539,earth,2009-04-22,Walt Disney,32011576,4268210,2009
4540,jackass forever,2022-02-04,Paramount Pictures,57743451,5483709,2022
4541,mother!,2017-09-15,Paramount Pictures,17800004,1984392,2017
4542,xXx,2002-08-09,Sony Pictures,141930000,24428571,2002


### Check Merge with reducced EU Data (4541 rows)

In [109]:
eu_check_df = pd.merge(eu_df,imdb_df, how="left", left_on=["title_merge", "year"], right_on=["original_title_merge", "year"])

In [110]:
eu_check_mask = eu_check_df["tconst"].isnull()
eu_check_df[eu_check_mask]

Unnamed: 0,title,producing_country,year,tickets_sold_since_1996,tickets_sold,title_merge,tconst,primary_title,original_title,runtime,...,directors_count,director_name,director2_name,director3_name,writers_count,writer_name,writer2_name,writer3_name,is_adult,original_title_merge
21,15 Minutes (Fifteen Minutes),US,2001,2600419,2565397,15 minutes (fifteen minutes),,,,,...,,,,,,,,,,
40,23,DE,1998,701787,693358,23,,,,,...,,,,,,,,,,
46,28 Days Later,GB,2002,4252690,4066710,28 days later,,,,,...,,,,,,,,,,
58,"4 luni, 3 saptamani si 2 zile",RO,2007,1090696,736957,"4 luni, 3 saptamani si 2 zile",,,,,...,,,,,,,,,,
65,5X2 cinq fois deux,FR,2004,1150178,814942,5x2 cinq fois deux,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4515,[Rec] 2,ES,2009,1387671,1013091,[rec] 2,,,,,...,,,,,,,,,,
4540,Účastníci zájezdu,CZ,2006,871439,830242,účastníci zájezdu,,,,,...,,,,,,,,,,
4541,Śluby panieńskie,PL,2010,1001866,1000373,śluby panieńskie,,,,,...,,,,,,,,,,
4542,Świadectwo,PL,2008,1039901,1034911,świadectwo,,,,,...,,,,,,,,,,


We are down to 373 unmatched rows (from 788)
-> improved by 415

### Check Problem with 15Minutes

In [111]:
imdb_df[imdb_df["tconst"] == "tt0179626"]

Unnamed: 0,tconst,primary_title,original_title,year,runtime,num_votes,average_rating,genres_count,genre,genre2,...,directors_count,director_name,director2_name,director3_name,writers_count,writer_name,writer2_name,writer3_name,is_adult,original_title_merge
2979,tt0179626,15 Minutes,15 Minutes,2001,120,52238.0,6.1,3.0,Action,Crime,...,1.0,John Herzfeld,,,1.0,John Herzfeld,,,0,15 minutes


ok the EU data contains the numbers written out in brackets. The IMDB does not. <br> <br>

Does that happen for other movies, too? 

In [112]:
bracket_mask = eu_check_df["title_merge"].str.contains("\(")
eu_check_df[eu_check_mask & bracket_mask]

Unnamed: 0,title,producing_country,year,tickets_sold_since_1996,tickets_sold,title_merge,tconst,primary_title,original_title,runtime,...,directors_count,director_name,director2_name,director3_name,writers_count,writer_name,writer2_name,writer3_name,is_adult,original_title_merge
21,15 Minutes (Fifteen Minutes),US,2001,2600419,2565397,15 minutes (fifteen minutes),,,,,...,,,,,,,,,,
411,Beast (US),"US, IS, JP",2022,1078911,1078729,beast (us),,,,,...,,,,,,,,,,
812,"Dangerous Beauty (The Honest Courtesan, A Dest...",US,1998,929520,928888,"dangerous beauty (the honest courtesan, a dest...",,,,,...,,,,,,,,,,
2809,Paparazzi (FR),FR,1998,994869,988887,paparazzi (fr),,,,,...,,,,,,,,,,
2810,Paparazzi (IT),IT,1998,1604573,1604573,paparazzi (it),,,,,...,,,,,,,,,,


Nop, unique problem ... but Paparazzi also does not match!! <br><br>

Anyway ... Correct 15 Minutes first

In [113]:
eu_df.loc[eu_df["title_merge"] == "15 minutes (fifteen minutes)", "title_merge"]

21    15 minutes (fifteen minutes)
Name: title_merge, dtype: object

In [114]:
eu_df.loc[eu_df["title_merge"] == "15 minutes (fifteen minutes)", "title_merge"] = "15 minutes"
eu_df.loc[eu_df["title_merge"] == "15 minutes"]

Unnamed: 0,title,producing_country,year,tickets_sold_since_1996,tickets_sold,title_merge
21,15 Minutes (Fifteen Minutes),US,2001,2600419,2565397,15 minutes


### repeat with new eu_df and check out Paparazzi problems

In [115]:
eu_check_df = pd.merge(eu_df,imdb_df, how="left", left_on=["title_merge", "year"], right_on=["original_title_merge", "year"])

In [116]:
eu_check_mask = eu_check_df["tconst"].isnull()
eu_check_df[eu_check_mask]

Unnamed: 0,title,producing_country,year,tickets_sold_since_1996,tickets_sold,title_merge,tconst,primary_title,original_title,runtime,...,directors_count,director_name,director2_name,director3_name,writers_count,writer_name,writer2_name,writer3_name,is_adult,original_title_merge
40,23,DE,1998,701787,693358,23,,,,,...,,,,,,,,,,
46,28 Days Later,GB,2002,4252690,4066710,28 days later,,,,,...,,,,,,,,,,
58,"4 luni, 3 saptamani si 2 zile",RO,2007,1090696,736957,"4 luni, 3 saptamani si 2 zile",,,,,...,,,,,,,,,,
65,5X2 cinq fois deux,FR,2004,1150178,814942,5x2 cinq fois deux,,,,,...,,,,,,,,,,
67,666 - Traue keinem mit dem du schläfst,DE,2002,677829,677829,666 - traue keinem mit dem du schläfst,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4515,[Rec] 2,ES,2009,1387671,1013091,[rec] 2,,,,,...,,,,,,,,,,
4540,Účastníci zájezdu,CZ,2006,871439,830242,účastníci zájezdu,,,,,...,,,,,,,,,,
4541,Śluby panieńskie,PL,2010,1001866,1000373,śluby panieńskie,,,,,...,,,,,,,,,,
4542,Świadectwo,PL,2008,1039901,1034911,świadectwo,,,,,...,,,,,,,,,,


In [117]:
imdb_df[imdb_df["original_title"] == "Paparazzi"]

Unnamed: 0,tconst,primary_title,original_title,year,runtime,num_votes,average_rating,genres_count,genre,genre2,...,directors_count,director_name,director2_name,director3_name,writers_count,writer_name,writer2_name,writer3_name,is_adult,original_title_merge
885,tt0133314,Paparazzi (FR),Paparazzi,1998,111,1091.0,5.4,2.0,Comedy,Romance,...,1.0,Alain Berbérian,,,6.0,Alain Berbérian,Jean-François Halin,,0,paparazzi
2710,tt0174105,Paparazzi (IT),Paparazzi,1998,100,1156.0,4.0,1.0,Comedy,,...,1.0,Neri Parenti,,,,,,,0,paparazzi
14129,tt0338325,Paparazzi,Paparazzi,2004,84,15776.0,5.7,3.0,Action,Crime,...,1.0,Paul Abascal,,,1.0,Forry Smith,,,0,paparazzi
151004,tt5303564,Paparazzi,Paparazzi,2015,110,54.0,5.4,3.0,Action,Drama,...,1.0,Saad Hendawy,,,1.0,Ahmed Abdel Fattah,,,0,paparazzi


ok, we changed the primary but not the original title ... <br><br>

### Make Beast and Paparazzi correct name in original_title, too.

Paparazzi

In [118]:
imdb_df.loc[imdb_df["original_title"] == "Paparazzi", "primary_title"]

885       Paparazzi (FR)
2710      Paparazzi (IT)
14129          Paparazzi
151004         Paparazzi
Name: primary_title, dtype: object

In [119]:
imdb_df.loc[imdb_df["original_title"] == "Paparazzi", "original_title"] = imdb_df.loc[imdb_df["original_title"] == "Paparazzi", "primary_title"]

In [120]:
imdb_df[imdb_df["original_title"].str.contains("Paparazzi")]

Unnamed: 0,tconst,primary_title,original_title,year,runtime,num_votes,average_rating,genres_count,genre,genre2,...,directors_count,director_name,director2_name,director3_name,writers_count,writer_name,writer2_name,writer3_name,is_adult,original_title_merge
885,tt0133314,Paparazzi (FR),Paparazzi (FR),1998,111,1091.0,5.4,2.0,Comedy,Romance,...,1.0,Alain Berbérian,,,6.0,Alain Berbérian,Jean-François Halin,,0,paparazzi
2710,tt0174105,Paparazzi (IT),Paparazzi (IT),1998,100,1156.0,4.0,1.0,Comedy,,...,1.0,Neri Parenti,,,,,,,0,paparazzi
14129,tt0338325,Paparazzi,Paparazzi,2004,84,15776.0,5.7,3.0,Action,Crime,...,1.0,Paul Abascal,,,1.0,Forry Smith,,,0,paparazzi
82059,tt1671678,Paparazzi: Full Throttle LA,Paparazzi: Full Throttle LA,2010,62,15.0,6.5,1.0,Documentary,,...,1.0,Daniel Ramos,,,1.0,Daniel Ramos,,,0,paparazzi: full throttle la
88847,tt1836097,Paparazzi Eye in the Dark,Paparazzi Eye in the Dark,2011,142,9.0,6.8,1.0,Mystery,,...,1.0,Bayo Akinfemi,,,1.0,Kojo Edu Ansah,,,0,paparazzi eye in the dark
151004,tt5303564,Paparazzi,Paparazzi,2015,110,54.0,5.4,3.0,Action,Drama,...,1.0,Saad Hendawy,,,1.0,Ahmed Abdel Fattah,,,0,paparazzi


Beast

In [121]:
imdb_df[imdb_df["original_title"] == "Beast"]

Unnamed: 0,tconst,primary_title,original_title,year,runtime,num_votes,average_rating,genres_count,genre,genre2,...,directors_count,director_name,director2_name,director3_name,writers_count,writer_name,writer2_name,writer3_name,is_adult,original_title_merge
45079,tt11301946,Beast (IN),Beast,2022,155,36815.0,5.2,3.0,Action,Comedy,...,1.0,Nelson Dilipkumar,,,1.0,Nelson Dilipkumar,,,0,beast
58623,tt13223398,Beast (US),Beast,2022,93,43487.0,5.6,3.0,Action,Adventure,...,1.0,Baltasar Kormákur,,,2.0,Jaime Primak Sullivan,Ryan Engle,,0,beast
66379,tt1423333,Beast,Beast,2007,85,14.0,7.0,1.0,Horror,,...,1.0,Jack Bennett,,,1.0,Jack Bennett,,,0,beast
77189,tt1572501,Beast,Beast,2011,83,609.0,5.6,2.0,Drama,Thriller,...,1.0,Christoffer Boe,,,1.0,Christoffer Boe,,,0,beast
140426,tt4251006,Beast,Beast,2015,94,71.0,6.6,3.0,Crime,Drama,...,2.0,Sam McKeith,Tom McKeith,,3.0,Will Howarth,Sam McKeith,Tom McKeith,0,beast
141507,tt4359322,Beast,Beast,2009,87,10.0,5.5,1.0,Horror,,...,1.0,Chris Jupp,,,2.0,Chris Jupp,Michael J. Murphy,,0,beast
153906,tt5628302,Beast,Beast,2017,107,16190.0,6.8,3.0,Crime,Drama,...,1.0,Michael Pearce,,,1.0,Michael Pearce,,,0,beast
162345,tt6463468,Beast,Beast,2018,60,30.0,5.8,2.0,Adventure,Drama,...,1.0,Ben Strang,,,1.0,Ben Strang,,,0,beast


In [122]:
imdb_df.loc[imdb_df["original_title"] == "Beast", "original_title"] = imdb_df.loc[imdb_df["original_title"] == "Beast", "primary_title"]

In [124]:
imdb_df[(imdb_df["original_title"].str.startswith("Beast")) & (imdb_df["year"] == 2022)]

Unnamed: 0,tconst,primary_title,original_title,year,runtime,num_votes,average_rating,genres_count,genre,genre2,...,directors_count,director_name,director2_name,director3_name,writers_count,writer_name,writer2_name,writer3_name,is_adult,original_title_merge
45079,tt11301946,Beast (IN),Beast (IN),2022,155,36815.0,5.2,3.0,Action,Comedy,...,1.0,Nelson Dilipkumar,,,1.0,Nelson Dilipkumar,,,0,beast
58623,tt13223398,Beast (US),Beast (US),2022,93,43487.0,5.6,3.0,Action,Adventure,...,1.0,Baltasar Kormákur,,,2.0,Jaime Primak Sullivan,Ryan Engle,,0,beast
99668,tt21352688,Beast Mode On,Beast Mode On,2022,85,52.0,6.4,3.0,Biography,Documentary,...,2.0,Julian Alexander Oliver,Najia Khaan,,4.0,Adebayo Akinfenwa,Dele Akinfenwa,,0,beast mode on


Recreate the merging column

In [126]:
imdb_df["original_title_merge"] = imdb_df["original_title"].str.lower()

### Merge again and recheck with Beast and Paparazzi Done

In [127]:
eu_check_df = pd.merge(eu_df,imdb_df, how="left", left_on=["title_merge", "year"], right_on=["original_title_merge", "year"])

In [128]:
eu_check_mask = eu_check_df["tconst"].isnull()
eu_check_df[eu_check_mask]

Unnamed: 0,title,producing_country,year,tickets_sold_since_1996,tickets_sold,title_merge,tconst,primary_title,original_title,runtime,...,directors_count,director_name,director2_name,director3_name,writers_count,writer_name,writer2_name,writer3_name,is_adult,original_title_merge
40,23,DE,1998,701787,693358,23,,,,,...,,,,,,,,,,
46,28 Days Later,GB,2002,4252690,4066710,28 days later,,,,,...,,,,,,,,,,
58,"4 luni, 3 saptamani si 2 zile",RO,2007,1090696,736957,"4 luni, 3 saptamani si 2 zile",,,,,...,,,,,,,,,,
65,5X2 cinq fois deux,FR,2004,1150178,814942,5x2 cinq fois deux,,,,,...,,,,,,,,,,
67,666 - Traue keinem mit dem du schläfst,DE,2002,677829,677829,666 - traue keinem mit dem du schläfst,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4515,[Rec] 2,ES,2009,1387671,1013091,[rec] 2,,,,,...,,,,,,,,,,
4540,Účastníci zájezdu,CZ,2006,871439,830242,účastníci zájezdu,,,,,...,,,,,,,,,,
4541,Śluby panieńskie,PL,2010,1001866,1000373,śluby panieńskie,,,,,...,,,,,,,,,,
4542,Świadectwo,PL,2008,1039901,1034911,świadectwo,,,,,...,,,,,,,,,,


In [131]:
imdb_df[(imdb_df["original_title"] == "Paparazzi") & eu_check_mask]

Unnamed: 0,tconst,primary_title,original_title,year,runtime,num_votes,average_rating,genres_count,genre,genre2,...,directors_count,director_name,director2_name,director3_name,writers_count,writer_name,writer2_name,writer3_name,is_adult,original_title_merge


Down from 372 to 369 (-3 for Paparazzi and Beast)

What the heck is wrong with the turkish titles though?

In [135]:
imdb_df.loc[imdb_df["tconst"] == "tt0795488", "original_title_merge"]

29482    úcastníci zájezdu
Name: original_title_merge, dtype: object

In [146]:
eu_check_df.loc[eu_check_df["title"] == "Účastníci zájezdu", "title_merge"]

4540    účastníci zájezdu
Name: title_merge, dtype: object

In [150]:
imdb_df.loc[imdb_df["tconst"] == "tt1720223", "original_title_merge"]

84070    sluby panienskie
Name: original_title_merge, dtype: object

In [151]:
eu_check_df.loc[eu_check_df["title"] == "Śluby panieńskie", "title_merge"]

4541    śluby panieńskie
Name: title_merge, dtype: object

In [152]:
imdb_df.loc[imdb_df["tconst"] == "tt1627942", "original_title_merge"]

80070    zeny v pokusení
Name: original_title_merge, dtype: object

In [154]:
eu_check_df.loc[eu_check_df["title"] == "Ženy v pokušení", "title_merge"]

4543    ženy v pokušení
Name: title_merge, dtype: object

č, ś, ń, ž are all normal character in the IMDB data

### Change all polish/special charcters in IMDB, EU and NA to standard

polish characters: ą, ć, ę, ł, ń, ó, ś, ź, ż

In [1]:
from unidecode import unidecode

In [2]:
# test new function:
eu_check_df.loc[eu_check_df["title"] == "Ženy v pokušení", "title_merge"]

NameError: name 'eu_check_df' is not defined

In [155]:
polish_dict = {"ą": "a",
               "ć": "c",
               "ę": "ę",
               "ł": "x",
               "ń": "n",
               "ó": "o",
               "ś": "s",
               "ź": "z",
               "ż": "z"}

In [176]:
for key, value in polish_dict.items():
    print(key, value)
    imdb_df["original_title_merge"] = imdb_df["original_title_merge"].str.replace(key, value)
    eu_df["title_merge"] = eu_df["title_merge"].str.replace(key, value)
    na_df["title_merge"] = na_df["title_merge"].str.replace(key, value)

ą a
ć c
ę ę
ł x
ń n
ó o
ś s
ź z
ż z


check for improvements

In [177]:
eu_check_df = pd.merge(eu_df,imdb_df, how="left", left_on=["title_merge", "year"], right_on=["original_title_merge", "year"])

In [178]:
eu_check_mask = eu_check_df["tconst"].isnull()
eu_check_df[eu_check_mask]

Unnamed: 0,title,producing_country,year,tickets_sold_since_1996,tickets_sold,title_merge,tconst,primary_title,original_title,runtime,...,directors_count,director_name,director2_name,director3_name,writers_count,writer_name,writer2_name,writer3_name,is_adult,original_title_merge
21,15 Minutes (Fifteen Minutes),US,2001,2600419,2565397,15 minutes (fifteen minutes),,,,,...,,,,,,,,,,
40,23,DE,1998,701787,693358,23,,,,,...,,,,,,,,,,
46,28 Days Later,GB,2002,4252690,4066710,28 days later,,,,,...,,,,,,,,,,
58,"4 luni, 3 saptamani si 2 zile",RO,2007,1090696,736957,"4 luni, 3 saptamani si 2 zile",,,,,...,,,,,,,,,,
65,5X2 cinq fois deux,FR,2004,1150178,814942,5x2 cinq fois deux,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4514,[REC]³ Génesis,ES,2011,828887,774431,[rec]³ génesis,,,,,...,,,,,,,,,,
4515,[Rec] 2,ES,2009,1387671,1013091,[rec] 2,,,,,...,,,,,,,,,,
4540,Účastníci zájezdu,CZ,2006,871439,830242,účastníci zájezdu,,,,,...,,,,,,,,,,
4542,Świadectwo,PL,2008,1039901,1034911,swiadectwo,,,,,...,,,,,,,,,,


In [179]:
imdb_df[imdb_df["tconst"] == "tt1869689"]

Unnamed: 0,tconst,primary_title,original_title,year,runtime,num_votes,average_rating,genres_count,genre,genre2,...,directors_count,director_name,director2_name,director3_name,writers_count,writer_name,writer2_name,writer3_name,is_adult,original_title_merge
