# Combine only most critical steps of previous merging book into one.

In [1]:
# import all packages
import pandas as pd
import numpy as np
import psycopg2 as psycopg2
import sql_functions as sqlf
import string as string
from unidecode import unidecode
from num2words import num2words

### Import data

In [2]:
schema = "capstone_24_4_group1"
schema

'capstone_24_4_group1'

In [3]:
imdb_query = f'''   SELECT *
                    FROM {schema}."IMDB_data"
                    '''
eu_query = f'''   SELECT *
                    FROM {schema}."movie_data_EU"
                    '''
na_query = f'''   SELECT *
                    FROM {schema}."movie_data_NA"
                    '''
aka_query = f'''   SELECT *
                    FROM {schema}."imdb_akas_data"
                    '''

In [4]:
imdb_df = sqlf.get_dataframe(imdb_query)
eu_df = sqlf.get_dataframe(eu_query)
na_df = sqlf.get_dataframe(na_query)
aka_df = sqlf.get_dataframe(aka_query)

In [5]:
na_df.columns = ['title', 'release_date', 'distributor', 'gross_sales', 'tickets_sold', 'year']

In [6]:
# filter for years
eu_df = eu_df[eu_df["year"] >= 1998].reset_index(drop=True)
na_df = na_df[na_df["year"] >= 1998].reset_index(drop=True)

In [7]:
# Lower and change non-latin characters
imdb_df["original_title_merge"] = imdb_df["original_title"].str.lower().apply(unidecode)
eu_df["title_merge"] = eu_df["title"].str.lower().apply(unidecode)
na_df["title_merge"] = na_df["title"].str.lower().apply(unidecode)
imdb_df["primary_title_merge"] = imdb_df["primary_title"].str.lower().apply(unidecode)

In [8]:
# remove punctuations
imdb_df["original_title_merge"] = imdb_df["original_title_merge"].str.translate(str.maketrans("","",string.punctuation))
imdb_df["primary_title_merge"] = imdb_df["primary_title_merge"].str.translate(str.maketrans("","",string.punctuation))
eu_df["title_merge"] = eu_df["title_merge"].str.translate(str.maketrans("","",string.punctuation))
na_df["title_merge"] = na_df["title_merge"].str.translate(str.maketrans("","",string.punctuation))

adjust AKA_df

In [9]:
# lower, non-latin and special characters for aka_df
for column in aka_df.iloc[:,1:]:
    aka_df.loc[:,column] = aka_df[column].str.lower()
    aka_df.loc[:,column] = aka_df[column].astype(str).apply(unidecode)
    aka_df.loc[:,column] = aka_df.loc[:,column].str.translate(str.maketrans("","",string.punctuation))

In [10]:
# Merge aka with imdb
imdb_aka_df = pd.merge(imdb_df, aka_df, how="left", on="tconst")

### Remove double Spaces

In [11]:
for column in imdb_aka_df.iloc[:,-12:-2]:
    display(imdb_aka_df[column].str.contains("  ").sum())
display(eu_df[eu_df["title_merge"].str.contains("  ")].shape)
na_df[na_df["title_merge"].str.contains("  ")].shape

4809

4517

741

7110

338

1452

1489

3479

312

474

(156, 6)

(53, 7)

In [12]:
for column in imdb_aka_df.iloc[:,-12:-2]:
    imdb_aka_df.loc[:,column] = imdb_aka_df[column].str.replace("  ", " ")
    imdb_aka_df.loc[:,column] = imdb_aka_df[column].str.replace("  ", " ")
eu_df.loc[:,"title_merge"] = eu_df["title_merge"].str.replace("  ", " ")
na_df.loc[:,"title_merge"] = na_df["title_merge"].str.replace("  ", " ")

In [13]:
for column in imdb_aka_df.iloc[:,-12:-2]:
    display(imdb_aka_df[column].str.contains("  ").sum())
display(eu_df[eu_df["title_merge"].str.contains("  ")].shape)
na_df[na_df["title_merge"].str.contains("  ")].shape

0

0

0

0

0

0

0

0

0

0

(2, 6)

(0, 7)

### Remove Roman and Latin Numbers

In [14]:
testing_dict = {r'(^ii | ii | ii$)': "two",
                r'(^iii | iii | iii$)': "three",
                r'(^iv | iv | iv$)': "four",
                r'(^v | v | v$)': "five",
                r'(^vi | vi | vi$)': "six",
                r'(^vii | vii | vii$)': "seven",
                r'(^viii | viii | viii$)': "eight",
                r'(^ix | ix | ix$)': "nine",
                r'(^x | x | x$)': "ten",}

In [15]:
imdb_aka_df[imdb_aka_df["original_title"].str.contains("Jurassic Park")]

Unnamed: 0,tconst,primary_title,original_title,year,runtime,num_votes,average_rating,genres_count,genre,genre2,...,CA,DE,ES,FR,GB,IT,NL,PL,TR,ALTER
890,tt0119567,The Lost World: Jurassic Park,The Lost World: Jurassic Park,1997,129,449430.0,6.6,3.0,Action,Adventure,...,the lost world jurassic park,vergessene welt jurassic park,el mundo perdido jurassic park,le monde perdu,the lost world jurassic park,il mondo perduto jurassic park,the lost world jurassic park,park jurajski ii,jurassic park 2 kayip dunya,jurassic park ii
3807,tt0163025,Jurassic Park III,Jurassic Park III,2001,92,344970.0,5.9,3.0,Action,Adventure,...,le parc jurassique iii,jurassic park iii,jurassic park iii parque jurasico iii,jurassic park iii,jurassic park iii,jurassic park iii,jurassic park iii,jurassic park iii,jurassic park 3,jurassic park 3
141839,tt4130956,Jurassic Park: Operation Rebirth,Jurassic Park: Operation Rebirth,2014,70,106.0,6.7,1.0,Thriller,,...,jurassic park operation rebirth,,,,jurassic park operation rebirth,,,,,


In [16]:
list_titles = ['original_title_merge', 'primary_title_merge', 'CA','DE', 'ES', 'FR', 'GB', 'IT', 'NL', 'PL', 'TR', 'ALTER']

for title in list_titles:
    for key, value in testing_dict.items():
        imdb_aka_df.loc[:,title] = imdb_aka_df[title].str.replace(key, value, regex=True)

for key, value in testing_dict.items():
        eu_df.loc[:,"title_merge"] = eu_df["title_merge"].str.replace(key, value, regex=True)

for key, value in testing_dict.items():
        na_df.loc[:,"title_merge"] = na_df["title_merge"].str.replace(key, value, regex=True)

In [17]:
imdb_aka_df[imdb_aka_df["original_title"].str.contains("Jurassic Park")]

Unnamed: 0,tconst,primary_title,original_title,year,runtime,num_votes,average_rating,genres_count,genre,genre2,...,CA,DE,ES,FR,GB,IT,NL,PL,TR,ALTER
890,tt0119567,The Lost World: Jurassic Park,The Lost World: Jurassic Park,1997,129,449430.0,6.6,3.0,Action,Adventure,...,the lost world jurassic park,vergessene welt jurassic park,el mundo perdido jurassic park,le monde perdu,the lost world jurassic park,il mondo perduto jurassic park,the lost world jurassic park,park jurajskitwo,jurassic park 2 kayip dunya,jurassic parktwo
3807,tt0163025,Jurassic Park III,Jurassic Park III,2001,92,344970.0,5.9,3.0,Action,Adventure,...,le parc jurassiquethree,jurassic parkthree,jurassic parkthreeparque jurasicothree,jurassic parkthree,jurassic parkthree,jurassic parkthree,jurassic parkthree,jurassic parkthree,jurassic park 3,jurassic park 3
141839,tt4130956,Jurassic Park: Operation Rebirth,Jurassic Park: Operation Rebirth,2014,70,106.0,6.7,1.0,Thriller,,...,jurassic park operation rebirth,,,,jurassic park operation rebirth,,,,,


In [18]:
list_titles = ['original_title_merge', 'primary_title_merge','CA', 'DE', 'ES', 'FR', 'GB', 'IT', 'NL', 'PL', 'TR', 'ALTER']

for title in list_titles:
    imdb_aka_df.loc[:,title] = imdb_aka_df[title].str.replace(r'\d', lambda x: num2words(int(x.group())), regex=True)

eu_df.loc[:,"title_merge"] = eu_df["title_merge"].str.replace(r'\d', lambda x: num2words(int(x.group())), regex=True)

na_df.loc[:,"title_merge"] = na_df["title_merge"].str.replace(r'\d', lambda x: num2words(int(x.group())), regex=True)

Drop Oscar "Movies"

In [19]:
na_df.drop(na_df[na_df["distributor"].str.contains("Shorts")].index, inplace=True)

### THE FUNCTION

In [20]:
def ultimate_merge_func(data, imdb_base, number_of_columns=6, short=False):
    '''
    Merges our Tickets Sold Dataframes First on each title for year and year +/- 1

    Input:
        ticket_data ... either eu or na depending on what we are testing
        base_data = imdb_df
        number_of_columns = 6 ... How many columns does the df with the ticket data have
        short = False ... if true, shorten the titles to merge to ... 25?

    Output:
        Returns: Dataframe with all matches from both columns
        Prints: Unmatched rows
    '''
    ticket_data = data.copy()
    base_data = imdb_base.copy()

    list_titles = ['original_title_merge', 'primary_title_merge','CA', 'DE', 'ES', 'FR', 'GB', 'IT', 'NL', 'PL', 'TR', "ALTER"]
    counter = 0

    # remove all spaces from all titles
    #imdb
    for column in base_data.iloc[:,-12:-2]:
        base_data.loc[:,column] = base_data[column].str.replace(" ", "")
        base_data.loc[:,column] = base_data[column].str.replace(" ", "")
    # ticket data
    ticket_data.loc[:,"title_merge"] = ticket_data["title_merge"].str.replace(" ", "")

    # add two year plus and minus as extra rows
    base_data["year_plus"] = base_data["year"] + 1
    base_data["year_minus"] = base_data["year"] + -1

    list_years = ["year_minus", "year", "year_plus"]

    if short == True:
        for title in list_titles:
            base_data.loc[:,title] = base_data[title].str[:18]
        ticket_data.loc[:,"title_merge"] = ticket_data["title_merge"].str[:18]

    for title in list_titles:
        if counter == 0:
            check_df = pd.merge(ticket_data,base_data, how="left", left_on=["title_merge", "year"], right_on=[title, "year"])

            
            # set-up third df for further calculations
            check_mask = check_df["tconst"].isnull()
            third_merge_df = check_df[check_mask].copy()

            # drop columns from first merge
            third_merge_df.drop(columns = third_merge_df.iloc[:,number_of_columns:], inplace=True)
            
            # 3_plus merge (inner)
            third_merge_plus_df = third_merge_df.merge(base_data, how="inner", left_on=["title_merge", "year"], right_on=[title, "year_plus"])
            # drop columns from first merge table that matched on 3_plus merge (inner)
            third_merge_plus_mask = check_df["title"].isin(list(third_merge_plus_df["title"]))
            check_df.drop(check_df[third_merge_plus_mask].index, inplace=True)
            # add fitting rows from 3_plus merge (inner) to first table
            check_df = pd.concat([check_df, third_merge_plus_df])
            check_df.reset_index(drop=True, inplace=True)
            check_df.drop(columns = "year_x", inplace=True)

            # 3_minus merge (inner)
            third_merge_minus_df = third_merge_df.merge(base_data, how="inner", left_on=["title_merge", "year"], right_on=[title, "year_minus"])
            # drop columns from first merge table that matched on 3_minus merge (inner)
            third_merge_minus_mask = check_df["title"].isin(list(third_merge_minus_df["title"]))
            check_df.drop(check_df[third_merge_minus_mask].index, inplace=True)
            # add fitting rows from 3_plus merge (inner) to first table
            check_df = pd.concat([check_df, third_merge_minus_df])
            check_df.reset_index(drop=True, inplace=True)
            check_df.drop(columns = "year_x", inplace=True)

            counter += 1

        for year in list_years:
            # set-up third df for further calculations
            check_mask = check_df["tconst"].isnull()
            third_merge_df = check_df[check_mask].copy()

            # drop columns from first merge
            third_merge_df.drop(columns = third_merge_df.iloc[:,number_of_columns:], inplace=True)
            
            # merge again
            third_merge_plus_df = third_merge_df.merge(base_data, how="inner", left_on=["title_merge", "year"], right_on=[title, year])
            # drop columns from first merge table that matched on 3_plus merge (inner)
            third_merge_plus_mask = check_df["title"].isin(list(third_merge_plus_df["title"]))
            check_df.drop(check_df[third_merge_plus_mask].index, inplace=True)
            # add fitting rows from 3_plus merge (inner) to first table
            check_df = pd.concat([check_df, third_merge_plus_df])
            check_df.reset_index(drop=True, inplace=True)
        counter += 1

    # drop new year columns from final table
    check_df.drop(columns = check_df.iloc[:,-4:], inplace=True)

    # show unmatched rows
    check_mask = check_df["tconst"].isnull()
    display(check_df[check_mask])
    display(counter)
    
    return check_df

### Check if it works as expected

In [21]:
eu_check = ultimate_merge_func(eu_df, imdb_aka_df)

Unnamed: 0,title,producing_country,year,tickets_sold_since_1996,tickets_sold,title_merge,tconst,primary_title,original_title,runtime,...,CA,DE,ES,FR,GB,IT,NL,PL,TR,ALTER
21,15 Minutes (Fifteen Minutes),US,2001.0,2600419,2565397,onefiveminutesfifteenminutes,,,,,...,,,,,,,,,,
65,5X2 cinq fois deux,FR,2004.0,1150178,814942,fivextwocinqfoisdeux,,,,,...,,,,,,,,,,
293,Arthur et la guerre des deux mondes,FR,2010.0,3838378,3363498,arthuretlaguerredesdeuxmondes,,,,,...,,,,,,,,,,
296,Artificial Intelligence: AI,US,2001.0,8073605,8041431,artificialintelligenceai,,,,,...,,,,,,,,,,
320,Atatürk 1881 - 1919,TR,2023.0,1732649,1732649,ataturkoneeighteightoneonenineonenine,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3997,Tinker Bell and the Pirate Fairy,US,2014.0,5489166,5472379,tinkerbellandthepiratefairy,,,,,...,,,,,,,,,,
4193,Wallace & Gromit in The Curse of the Were-Rabbit,"GBinc, US",2005.0,14014825,13251997,wallacegromitinthecurseofthewererabbit,,,,,...,,,,,,,,,,
4204,Warum Männer nicht zuhören und Frauen schlecht...,DE,2007.0,1452342,1068475,warummannernichtzuhorenundfrauenschlechtereinp...,,,,,...,,,,,,,,,,
4322,Zeny v behu,CZ,2019.0,1705959,1675569,zenyfivebehu,,,,,...,,,,,,,,,,


13

In [22]:
na_check = ultimate_merge_func(na_df, imdb_aka_df, number_of_columns=7, short=True)

Unnamed: 0,title,release_date,distributor,gross_sales,tickets_sold,year,title_merge,tconst,primary_title,original_title,...,CA,DE,ES,FR,GB,IT,NL,PL,TR,ALTER
31,21 and Over,2013-03-01,Relativity,25682380,3158964,2013.0,twooneandover,,,,...,,,,,,,,,,
54,63 Up,2019-11-27,BritBox,183940,20037,2019.0,sixthreeup,,,,...,,,,,,,,,,
69,A Common Thread,2002-11-29,Odeon Films,5058187,838836,2002.0,acommonthread,,,,...,,,,,,,,,,
107,A Rescue of Little Eggs,2021-08-27,Lionsgate,927154,91166,2021.0,arescueoflittleegg,,,,...,,,,,,,,,,
114,A Stir of Echoes,1999-09-10,Artisan,21133087,4160056,1999.0,astirofechoes,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3970,Y Tu Mama Tambien (And Your…,2002-03-15,IFC Films,13649881,2349377,2002.0,ytumamatambienandy,,,,...,,,,,,,,,,
3983,You're Next,2013-08-23,Lionsgate,18494006,2274785,2013.0,yourenext,,,,...,,,,,,,,,,
3989,"Yours, Mine and Ours",2005-11-23,Paramount Pictures,50733384,7914724,2005.0,yoursmineandours,,,,...,,,,,,,,,,
3991,Yu-Gi-Oh,2004-08-13,Warner Bros.,19762690,3182397,2004.0,yugioh,,,,...,,,,,,,,,,


13

### Seperate NA into upper and lower

In [23]:
na_upper_df = na_df.sort_values(by="release_date").iloc[:2250].copy()
na_upper_df

Unnamed: 0,title,release_date,distributor,gross_sales,tickets_sold,year,title_merge
1224,Firestorm,1998-01-09,20th Century Fox,8123860,1732166,1998,firestorm
1470,Hard Rain,1998-01-16,Paramount Pictures,19870567,4236794,1998,hard rain
1164,Fallen,1998-01-16,Warner Bros.,25310938,5396788,1998,fallen
1441,Half Baked,1998-01-16,Universal,17394881,3708929,1998,half baked
3086,Star Kid,1998-01-16,Trimark,7015240,1495786,1998,star kid
...,...,...,...,...,...,...,...
4041,The Town,2010-09-17,Warner Bros.,92173235,11682286,2010,the town
1061,Easy A,2010-09-17,Sony Pictures,58401464,7401960,2010,easy a
4510,You Will Meet a Tall Dark S…,2010-09-22,Sony Pictures Cla…,3229586,409326,2010,you will meet a tall dark s
4505,You Again,2010-09-24,Walt Disney,25702053,3257547,2010,you again


In [24]:
na_lower_df = na_df.sort_values(by="release_date").iloc[2250:].copy()
na_lower_df

Unnamed: 0,title,release_date,distributor,gross_sales,tickets_sold,year,title_merge
1961,Legend of the Guardians: Th…,2010-09-24,Warner Bros.,55549823,7040535,2010,legend of the guardians th
4342,Waiting for Superman,2010-09-24,Paramount Vantage,6417135,813325,2010,waiting for superman
675,Case 39,2010-10-01,Paramount Vantage,13261851,1680843,2010,case threenine
2769,Robot,2010-10-01,B4U Movies,2276427,288520,2010,robot
1971,Let Me In,2010-10-01,Overture Films,12134935,1538015,2010,let me in
...,...,...,...,...,...,...,...
307,Anyone But You,2023-12-22,Sony Pictures,24837385,2304025,2023,anyone but you
2196,Migration,2023-12-22,Universal,54103955,5018919,2023,migration
3379,The Color Purple,2023-12-25,Warner Bros.,44047642,4086052,2023,the color purple
1197,Ferrari,2023-12-25,Neon,10778480,999858,2023,ferrari


### Work on lower half

In [25]:
test = ultimate_merge_func(na_lower_df, imdb_aka_df, number_of_columns=7, short=True)
# ultimate_merge_func(na_upper_df, imdb_aka_df, number_of_columns=7, short=True)

Unnamed: 0,title,release_date,distributor,gross_sales,tickets_sold,year,title_merge,tconst,primary_title,original_title,...,CA,DE,ES,FR,GB,IT,NL,PL,TR,ALTER
58,Gnomeo and Juliet,2011-02-11,Walt Disney,99967670,12606263,2011.0,gnomeoandjuliet,,,,...,,,,,,,,,,
86,Born to be Wild 3D,2011-04-08,Warner Bros.,32387013,4075936,2011.0,borntobewildthreed,,,,...,,,,,,,,,,
88,Atlas Shrugged: Part 1,2011-04-15,Rocky Mountain Pi…,4752353,599287,2011.0,atlasshruggedparto,,,,...,,,,,,,,,,
124,Cowboys and Aliens,2011-07-29,Universal,100368560,12656817,2011.0,cowboysandaliens,,,,...,,,,,,,,,,
135,Spy Kids: All the Time in t…,2011-08-19,Weinstein/Dimension,38536376,4859568,2011.0,spykidsallthetimei,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1800,The Chosen Season 3: Episod…,2022-11-18,Fathom Events,14612089,1387662,2022.0,thechosenseasonthr,,,,...,,,,,,,,,,
1814,Met Opera: The Hours,2022-12-10,Fathom Events,791374,75154,2022.0,metoperathehours,,,,...,,,,,,,,,,
1832,The Chosen Season 3 Finale,2023-02-02,Fathom Events,5525069,512529,2023.0,thechosenseasonthr,,,,...,,,,,,,,,,
1865,Big George Foreman: The Mir…,2023-04-28,Sony Pictures,5426772,503411,2023.0,biggeorgeforemanth,,,,...,,,,,,,,,,


13

In [26]:
test[test["tconst"].isnull()].head(10)

Unnamed: 0,title,release_date,distributor,gross_sales,tickets_sold,year,title_merge,tconst,primary_title,original_title,...,CA,DE,ES,FR,GB,IT,NL,PL,TR,ALTER
58,Gnomeo and Juliet,2011-02-11,Walt Disney,99967670,12606263,2011.0,gnomeoandjuliet,,,,...,,,,,,,,,,
86,Born to be Wild 3D,2011-04-08,Warner Bros.,32387013,4075936,2011.0,borntobewildthreed,,,,...,,,,,,,,,,
88,Atlas Shrugged: Part 1,2011-04-15,Rocky Mountain Pi…,4752353,599287,2011.0,atlasshruggedparto,,,,...,,,,,,,,,,
124,Cowboys and Aliens,2011-07-29,Universal,100368560,12656817,2011.0,cowboysandaliens,,,,...,,,,,,,,,,
135,Spy Kids: All the Time in t…,2011-08-19,Weinstein/Dimension,38536376,4859568,2011.0,spykidsallthetimei,,,,...,,,,,,,,,,
159,Flying Monsters 3D,2011-10-06,National Geograph…,4176870,524732,2011.0,flyingmonstersthre,,,,...,,,,,,,,,,
195,George Balanchine's The Nut…,2011-12-13,NCM Fathom,2119994,267338,2011.0,georgebalanchinest,,,,...,,,,,,,,,,
206,Extremely Loud and Incredib…,2011-12-25,Warner Bros.,31495464,3956716,2011.0,extremelyloudandin,,,,...,,,,,,,,,,
221,The Secret World of Arrietty,2012-02-17,Walt Disney,19192510,2411119,2012.0,thesecretworldofar,,,,...,,,,,,,,,,
225,Tyler Perry's Good Deeds,2012-02-24,Lionsgate,35025791,4400225,2012.0,tylerperrysgooddee,,,,...,,,,,,,,,,


In [27]:
pd.set_option('display.max_columns', 40)

In [28]:
imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt4495098"]

Unnamed: 0,tconst,primary_title,original_title,year,runtime,num_votes,average_rating,genres_count,genre,genre2,genre3,acting,acting2,acting3,acting4,acting5,directors_count,director_name,director2_name,director3_name,writers_count,writer_name,writer2_name,writer3_name,is_adult,original_title_merge,primary_title_merge,CA,DE,ES,FR,GB,IT,NL,PL,TR,ALTER
145614,tt4495098,Gran Turismo,Gran Turismo,2023,134,109255.0,7.1,3.0,Action,Adventure,Drama,David Harbour,Orlando Bloom,Archie Madekwe,Takehiro Hira,Darren Barnet,1.0,Neill Blomkamp,,,3.0,Jason Hall,Zach Baylin,Alex Tse,0,gran turismo,gran turismo,gran turismo,gran turismo,gran turismo,gran turismo,gran turismo,gran turismo la storia di un sogno impossibile,,,gran turismo,


In [29]:
basics_df[basics_df["tconst"] == "tt26998517"]

NameError: name 'basics_df' is not defined

### Change all na_lower to correct name

### First 10

In [44]:
na_lower_df.loc[na_lower_df["title"] == "Gnomeo and Juliet", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt0377981", "primary_title_merge"].values[0]

In [54]:
na_lower_df.drop(na_lower_df[na_lower_df["title"] == "Born to be Wild 3D"].index, inplace=True)

In [60]:
na_lower_df.loc[na_lower_df["title"] == "Atlas Shrugged: Part 1", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt0480239", "primary_title_merge"].values[0]

In [62]:
na_lower_df.loc[na_lower_df["title"] == "Cowboys and Aliens", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt0409847", "primary_title_merge"].values[0]

In [66]:
na_lower_df.loc[na_lower_df["title"] == "Spy Kids: All the Time in t…", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt1517489", "primary_title_merge"].values[0]

In [68]:
na_lower_df.drop(na_lower_df[na_lower_df["title"] == "Flying Monsters 3D"].index, inplace=True)

In [71]:
na_lower_df.drop(na_lower_df[na_lower_df["title"] == "George Balanchine's The Nut…"].index, inplace=True)

In [73]:
na_lower_df.loc[na_lower_df["title"] == "Extremely Loud and Incredib…", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt0477302", "primary_title_merge"].values[0]

In [76]:
na_lower_df.loc[na_lower_df["title"] == "The Secret World of Arrietty", "year"] = 2010

In [78]:
na_lower_df.loc[na_lower_df["title"] == "Tyler Perry's Good Deeds", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt1885265", "primary_title_merge"].values[0]

### Second 10

In [82]:
na_lower_df.loc[na_lower_df["title"] == "Dr. Seuss' The Lorax", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt1482459", "primary_title_merge"].values[0]

In [84]:
na_lower_df.drop(na_lower_df[na_lower_df["title"] == "To the Arctic 3D"].index, inplace=True)

In [88]:
na_lower_df.loc[na_lower_df["title"] == "For Greater Glory", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt1566501", "primary_title_merge"].values[0]

In [90]:
na_lower_df.loc[na_lower_df["title"] == "Tyler Perry's Madea's Witne…", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt2215285", "primary_title_merge"].values[0]

In [92]:
na_lower_df.loc[na_lower_df["title"] == "Celeste and Jesse Forever", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt1405365", "primary_title_merge"].values[0]

In [94]:
na_lower_df.loc[na_lower_df["title"] == "Robot and Frank", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt1990314", "primary_title_merge"].values[0]

In [96]:
na_lower_df.loc[na_lower_df["title"] == "Atlas Shrugged: Part II", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt1985017", "primary_title_merge"].values[0]

In [97]:
na_lower_df.drop(na_lower_df[na_lower_df["title"] == "The Met: Live in HD - Aida"].index, inplace=True)

In [99]:
na_lower_df.loc[na_lower_df["title"] == "21 and Over", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt1711425", "primary_title_merge"].values[0]

In [101]:
na_lower_df.loc[na_lower_df["title"] == "The Hangover 3", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt1951261", "primary_title_merge"].values[0]

### Third 10

In [105]:
na_lower_df.loc[na_lower_df["title"] == "Fast and Furious 6", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt1905041", "primary_title_merge"].values[0]

In [107]:
na_lower_df.loc[na_lower_df["title"] == "Disney Planes", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt1691917", "primary_title_merge"].values[0]

In [109]:
na_lower_df.loc[na_lower_df["title"] == "Lee Daniels' The Butler", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt1327773", "primary_title_merge"].values[0]

In [112]:
na_lower_df.loc[na_lower_df["title"] == "You're Next", "year"] = 2011

In [115]:
na_lower_df.drop(na_lower_df[na_lower_df["title"] == "Jerusalem"].index, inplace=True)

In [118]:
na_lower_df.loc[na_lower_df["title"] == "Battle of the Year 3D", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt1532958", "primary_title_merge"].values[0]

In [120]:
na_lower_df.drop(na_lower_df[na_lower_df["title"] == "Journey to the South Pacific"].index, inplace=True)

In [122]:
na_lower_df.loc[na_lower_df["title"] == "Tyler Perry's The Single Mo…", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt2465140", "primary_title_merge"].values[0]

In [124]:
na_lower_df.drop(na_lower_df[na_lower_df["title"] == "Island of Lemurs: Madagascar"].index, inplace=True)

In [126]:
na_lower_df.loc[na_lower_df["title"] == "America: Imagine a World Wi…", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt2785390", "primary_title_merge"].values[0]

### Fourth 10 Films

In [130]:
na_lower_df.loc[na_lower_df["title"] == "Planes: Fire and Rescue", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt2980706", "primary_title_merge"].values[0]

In [132]:
na_lower_df.loc[na_lower_df["title"] == "The Fluffy Movie", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt3532608", "primary_title_merge"].values[0]

In [134]:
na_lower_df.loc[na_lower_df["title"] == "The Divergent Serires: Insu…", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt2908446", "primary_title_merge"].values[0]

In [136]:
na_lower_df.loc[na_lower_df["title"] == "Shaun the Sheep", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt2872750", "primary_title_merge"].values[0]

In [138]:
na_lower_df.loc[na_lower_df["title"] == "The Green Inferno", "year"] = 2013

In [140]:
na_lower_df.loc[na_lower_df["title"] == "Star Wars Ep. VII: The Forc…", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt2488496", "primary_title_merge"].values[0]

In [142]:
na_lower_df.loc[na_lower_df["title"] == "Ratchet and Clank", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt2865120", "primary_title_merge"].values[0]

In [144]:
na_lower_df.loc[na_lower_df["title"] == "The Conjuring 2: The Enfiel…", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt3065204", "primary_title_merge"].values[0]

In [147]:
na_lower_df.drop(na_lower_df[na_lower_df["title"] == "The Queen of Katwe"].index, inplace=True)

In [149]:
na_lower_df.loc[na_lower_df["title"] == "Tyler Perry’s Boo! A Madea …", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt5325452", "primary_title_merge"].values[0]

### Fifth 10 Films

In [152]:
na_lower_df.drop(na_lower_df[na_lower_df["title"] == "Mayweather vs. McGregor"].index, inplace=True)

In [154]:
na_lower_df.loc[na_lower_df["title"] == "Victoria and Abdul", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt5816682", "primary_title_merge"].values[0]

In [157]:
na_lower_df.loc[na_lower_df["title"] == "Star Wars Ep. VIII: The Las…", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt2527336", "primary_title_merge"].values[0]

In [161]:
na_lower_df.drop(na_lower_df[na_lower_df["title"] == "Pandas"].index, inplace=True)

In [163]:
na_lower_df.loc[na_lower_df["title"] == "The Old Man and the Gun", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt2837574", "primary_title_merge"].values[0]

In [165]:
na_lower_df.loc[na_lower_df["title"] == "The Upside", "year"] = 2017

In [167]:
na_lower_df.loc[na_lower_df["title"] == "Tyler Perry’s A Madea Famil…", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt7054636", "primary_title_merge"].values[0]

In [169]:
na_lower_df.loc[na_lower_df["title"] == "Pain & Glory", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt8291806", "primary_title_merge"].values[0]

In [171]:
na_lower_df.loc[na_lower_df["title"] == "The Current War: Director’s…", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt2140507", "primary_title_merge"].values[0]

In [174]:
na_lower_df.drop(na_lower_df[na_lower_df["title"] == "63 Up"].index, inplace=True)

### Sixth 10 Films

In [178]:
na_lower_df.loc[na_lower_df["title"] == "The Current War: Director’s…", "year"] = 2017

In [180]:
na_lower_df.loc[na_lower_df["title"] == "IP Man: The Finale", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt2076298", "primary_title_merge"].values[0]

In [182]:
na_lower_df.loc[na_lower_df["title"] == "Thank You For Everything", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt11349958", "primary_title_merge"].values[0]

In [183]:
na_lower_df.drop(na_lower_df[na_lower_df["title"] == "Met Opera: Wozzeck"].index, inplace=True)

In [185]:
na_lower_df.loc[na_lower_df["title"] == "The Doors: Break on Thru — …", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt7903550", "primary_title_merge"].values[0]

In [187]:
na_lower_df.loc[na_lower_df["title"] == "The Times of Bill Cunningham", "year"] = 2018

In [189]:
na_lower_df.loc[na_lower_df["title"] == "Burden", "year"] = 2018

In [190]:
na_lower_df.drop(na_lower_df[na_lower_df["title"] == "Met Opera — Agrippina"].index, inplace=True)

In [192]:
na_lower_df.loc[na_lower_df["title"] == "Followed", "year"] = 2018

In [194]:
na_lower_df.loc[na_lower_df["title"] == "Murder in the Woods", "year"] = 2017

### Seventh 10 Films

In [200]:
na_lower_df.loc[na_lower_df["title"] == "The Doors: Break on Thru — …", "year"] = 2018

In [202]:
na_lower_df.loc[na_lower_df["title"] == "Train to Busan Presents: Pe…", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt8850222", "primary_title_merge"].values[0]

In [204]:
na_lower_df.loc[na_lower_df["title"] == "Legend of Deification", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt11177804", "primary_title_merge"].values[0]

In [206]:
na_lower_df.loc[na_lower_df["title"] == "True to the Game 2: Gena’s …", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt11237714", "primary_title_merge"].values[0]

In [208]:
na_lower_df.loc[na_lower_df["title"] == "Missão: Vingança", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt10310140", "primary_title_merge"].values[0]

In [211]:
na_lower_df.loc[na_lower_df["title"] == "Always & Forever", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt7544954", "primary_title_merge"].values[0]

In [213]:
na_lower_df.loc[na_lower_df["title"] == "American Skin", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt10195452", "primary_title_merge"].values[0]

In [214]:
na_lower_df.loc[na_lower_df["title"] == "American Skin", "year"] = 2019

In [216]:
na_lower_df.loc[na_lower_df["title"] == "Our Friend", "year"] = 2019

In [218]:
na_lower_df.drop(na_lower_df[na_lower_df["title"] == "Earwig and the Witch"].index, inplace=True)

In [220]:
na_lower_df.loc[na_lower_df["title"] == "City of Lies", "year"] = 2018

### Eight 10 Films

In [224]:
na_lower_df.loc[na_lower_df["title"] == "Demon Slayer The Movie: Mug…", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt11032374", "primary_title_merge"].values[0]

In [227]:
na_lower_df.drop(na_lower_df[na_lower_df["title"] == "La piscine"].index, inplace=True)

In [229]:
na_lower_df.loc[na_lower_df["title"] == "Profile", "year"] = 2018

In [231]:
na_lower_df.loc[na_lower_df["title"] == "A Rescue of Little Eggs", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt8049994", "primary_title_merge"].values[0]

In [232]:
na_lower_df.drop(na_lower_df[na_lower_df["title"] == "Met Opera: Fire Shut Up in …"].index, inplace=True)

In [234]:
na_lower_df.loc[na_lower_df["title"] == "American Underdog: The Kurt…", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt11729298", "primary_title_merge"].values[0]

In [235]:
na_lower_df.drop(na_lower_df[na_lower_df["title"] == "The Servant"].index, inplace=True)

In [237]:
na_lower_df.loc[na_lower_df["title"] == "Y Como Es El", "year"] = 2020

In [239]:
na_lower_df.loc[na_lower_df["title"] == "Ante Sundaraniki", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt13528564", "primary_title_merge"].values[0]

In [240]:
na_lower_df.drop(na_lower_df[na_lower_df["title"] == "The Chosen Season 3: Episod…"].index, inplace=True)

### Last movies

In [243]:
na_lower_df.drop(na_lower_df[na_lower_df["title"] == "Met Opera: The Hours"].index, inplace=True)

In [244]:
na_lower_df.drop(na_lower_df[na_lower_df["title"] == "The Chosen Season 3 Finale"].index, inplace=True)

In [246]:
na_lower_df.loc[na_lower_df["title"] == "Big George Foreman: The Mir…", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt12226632", "primary_title_merge"].values[0]

In [248]:
na_lower_df.loc[na_lower_df["title"] == "Gran Turismo: Based on a Tr…", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt4495098", "primary_title_merge"].values[0]

### Upload lower NAs to SQL

In [251]:
table_name = 'na_merged_lower'
engine = sqlf.get_engine()
engine

Engine(postgresql://user:***@host/database)

In [252]:
schema

'capstone_24_4_group1'

In [253]:
# # Write records stored in a dataframe to SQL database
# if engine != None:
#     try:
#         test.to_sql(table_name, # Name of SQL table
#                         con=engine, # Engine or connection
#                         if_exists='replace', # Drop the table before inserting new values 
#                         schema=schema, # your class schema
#                         index=False, # Write DataFrame index as a column
#                         chunksize=5000, # Specify the number of rows in each batch to be written at a time
#                         method='multi') # Pass multiple values in a single INSERT clause
#         print(f"The {table_name} table was imported successfully.")
#     # Error handling
#     except (Exception, psycopg2.DatabaseError) as error:
#         print(error)
#         engine = None
# else:
#     print("shit")

The na_merged_lower table was imported successfully.


In [258]:
imdb_aka_df.head()

Unnamed: 0,tconst,primary_title,original_title,year,runtime,num_votes,average_rating,genres_count,genre,genre2,genre3,acting,acting2,acting3,acting4,acting5,directors_count,director_name,director2_name,director3_name,writers_count,writer_name,writer2_name,writer3_name,is_adult,original_title_merge,primary_title_merge,CA,DE,ES,FR,GB,IT,NL,PL,TR,ALTER
0,tt0013274,Istoriya grazhdanskoy voyny,Istoriya grazhdanskoy voyny,2021,94,73.0,6.7,1.0,Documentary,,,,,,,,2.0,Nikolai Izvolov,Dziga Vertov,,,,,,0,istoriya grazhdanskoy voyny,istoriya grazhdanskoy voyny,,,,histoire de la guerre civile,,,,,,
1,tt0015414,La tierra de los toros,La tierra de los toros,2000,60,17.0,5.4,,,,,,,,,,1.0,Musidora,,,,,,,0,la tierra de los toros,la tierra de los toros,,,la tierra de los toros,la terre des taureaux,,,,,,
2,tt0035423,Kate & Leopold,Kate & Leopold,2001,118,89944.0,6.4,3.0,Comedy,Fantasy,Romance,Meg Ryan,Hugh Jackman,Liev Schreiber,Breckin Meyer,Natasha Lyonne,1.0,James Mangold,,,2.0,Steven Rogers,James Mangold,,0,kate leopold,kate leopold,kate et leopold,kate und leopold,la kate i en leopold,kate et leopold,kate leopold,kate and leopold,,kate i leopold,buyulu cift,
3,tt0062336,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,2020,70,190.0,6.5,1.0,Drama,,,Rubén Sotoconil,Claudia Paz,Luis Alarcón,Shenda Román,Luis Vilches,2.0,Raúl Ruiz,Valeria Sarmiento,,2.0,Raúl Ruiz,Omar Saavedra Santis,,0,el tango del viudo y su espejo deformante,the tango of the widower and its distorting mi...,,,,el tango del viudo y su espejo deformante,the tango of the widower and its distorting mi...,,,,,
4,tt0069049,The Other Side of the Wind,The Other Side of the Wind,2018,122,8143.0,6.7,1.0,Drama,,,John Huston,Oja Kodar,Peter Bogdanovich,Susan Strasberg,Norman Foster,1.0,Orson Welles,,,2.0,Orson Welles,Oja Kodar,,0,the other side of the wind,the other side of the wind,the other side of the wind,the other side of the wind,al otro lado del viento,de lautre cote du vent,the other side of the wind,laltra faccia del vento,,druga strona wiatru,,


# Import all Ready Data and combine for final Set to work and scrape with

In [30]:
imdb_df.head(2)

Unnamed: 0,tconst,primary_title,original_title,year,runtime,num_votes,average_rating,genres_count,genre,genre2,genre3,acting,acting2,acting3,acting4,acting5,directors_count,director_name,director2_name,director3_name,writers_count,writer_name,writer2_name,writer3_name,is_adult,original_title_merge,primary_title_merge
0,tt0013274,Istoriya grazhdanskoy voyny,Istoriya grazhdanskoy voyny,2021,94,73.0,6.7,1.0,Documentary,,,,,,,,2.0,Nikolai Izvolov,Dziga Vertov,,,,,,0,istoriya grazhdanskoy voyny,istoriya grazhdanskoy voyny
1,tt0015414,La tierra de los toros,La tierra de los toros,2000,60,17.0,5.4,,,,,,,,,,1.0,Musidora,,,,,,,0,la tierra de los toros,la tierra de los toros


In [31]:
eu_df.head(2)

Unnamed: 0,title,producing_country,year,tickets_sold_since_1996,tickets_sold,title_merge
0,(500) Days of Summer,US,2009,1713086,1684771,fivezerozero days of summer
1,(Nie)znajomi,PL,2019,685075,684833,nieznajomi


In [32]:
na_df.head(2)

Unnamed: 0,title,release_date,distributor,gross_sales,tickets_sold,year,title_merge
0,(500) Days of Summer,2009-08-07,Fox Searchlight,32425665,4323422,2009,fivezerozero days of summer
1,10 Cloverfield Lane,2016-03-11,Paramount Pictures,72082999,8333294,2016,onezero cloverfield lane


import merged data sets

In [57]:
eu_merged = f'''   SELECT *
                    FROM {schema}."eu_merged"
                    '''
na_merged = f'''   SELECT *
                    FROM {schema}."na_merged"
                    '''

In [58]:
eu_merged = sqlf.get_dataframe(eu_merged)
na_merged = sqlf.get_dataframe(na_merged)

In [59]:
eu_merged.columns

Index(['title', 'producing_country', 'year', 'tickets_sold_since_1996',
       'tickets_sold', 'title_merge', 'tconst', 'primary_title',
       'original_title', 'runtime', 'num_votes', 'average_rating',
       'genres_count', 'genre', 'genre2', 'genre3', 'acting', 'acting2',
       'acting3', 'acting4', 'acting5', 'directors_count', 'director_name',
       'director2_name', 'director3_name', 'writers_count', 'writer_name',
       'writer2_name', 'writer3_name', 'is_adult', 'original_title_merge',
       'primary_title_merge', 'CA', 'DE', 'ES', 'FR', 'GB', 'IT', 'NL', 'PL',
       'TR', 'ALTER'],
      dtype='object')

In [60]:
eu_merged.rename(columns={"producing_country": "country" , 
                          "tickets_sold_since_1996": "EU_since_1996", 
                          "tickets_sold": "EU_tickets_sold"},
                          inplace=True)

In [61]:
eu_merged.head(2)

Unnamed: 0,title,country,year,EU_since_1996,EU_tickets_sold,title_merge,tconst,primary_title,original_title,runtime,num_votes,average_rating,genres_count,genre,genre2,genre3,acting,acting2,acting3,acting4,...,director_name,director2_name,director3_name,writers_count,writer_name,writer2_name,writer3_name,is_adult,original_title_merge,primary_title_merge,CA,DE,ES,FR,GB,IT,NL,PL,TR,ALTER
0,(500) Days of Summer,US,2009.0,1713086,1684771,fivezerozerodaysofsummer,tt1022603,500 Days of Summer,(500) Days of Summer,95,569177.0,7.7,3.0,Comedy,Drama,Romance,Zooey Deschanel,Joseph Gordon-Levitt,Geoffrey Arend,Chloë Grace Moretz,...,Marc Webb,,,2.0,Scott Neustadter,Michael H. Weber,,0.0,fivezerozerodaysofsummer,fivezerozerodaysofsummer,fivezerozerodaysofsummer,fivezerozerodaysofsummer,fivezerozerodiasjuntos,fivezerozerojoursensemble,fivezerozerodaysofsummer,fivezerozerogiorniinsieme,fivezerozerodaysofsummer,fivezerozerodnimilosci,askin fivezerozero gunu,
1,(Nie)znajomi,PL,2019.0,685075,684833,nieznajomi,tt10518924,(Nie)znajomi,(Nie)znajomi,103,1085.0,6.8,2.0,Comedy,Drama,,Maja Ostaszewska,Lukasz Simlat,Aleksandra Domanska,Michal Zurawski,...,Tadeusz Sliwa,,,10.0,Filippo Bologna,Paolo Costella,,0.0,nieznajomi,nieznajomi,,nieznajomi,,,nieznajomi,,,nieznajomi,,


In [62]:
na_merged.rename(columns={"gross_sales": "NA_gross_sales", 
                          "tickets_sold": "NA_tickets_sold"},
                          inplace=True)

In [63]:
na_merged.head(2)

Unnamed: 0,title,release_date,distributor,NA_gross_sales,NA_tickets_sold,year,title_merge,tconst,primary_title,original_title,runtime,num_votes,average_rating,genres_count,genre,genre2,genre3,acting,acting2,acting3,...,director_name,director2_name,director3_name,writers_count,writer_name,writer2_name,writer3_name,is_adult,original_title_merge,primary_title_merge,CA,DE,ES,FR,GB,IT,NL,PL,TR,ALTER
0,"10,000 B.C.",2008-03-07,Warner Bros.,94784201,13201142,2008.0,onezerozerozerozer,tt0443649,"10,000 BC","10,000 BC",109,136209.0,5.1,3.0,Action,Adventure,Drama,Camilla Belle,Steven Strait,Marco Khan,...,Roland Emmerich,,,2.0,Roland Emmerich,Harald Kloser,,0.0,onezerozerozerozer,onezerozerozerozer,onezerozerozerozer,onezerozerozerozer,onezerozerozerozer,onezerozerozerozer,onezerozerozerozer,onezerozerozerozer,onezerozerozerozer,onezerozerozerozer,mo onezerozerozero,
1,102 Dalmatians,2000-11-22,Walt Disney,66941559,12343421,2000.0,onezerotwodalmatia,tt0211181,102 Dalmatians,102 Dalmatians,100,39983.0,4.8,3.0,Adventure,Comedy,Family,Glenn Close,Gérard Depardieu,Ioan Gruffudd,...,Kevin Lima,,,5.0,Dodie Smith,Kristen Buckley,,0.0,onezerotwodalmatia,onezerotwodalmatia,onezerotwodalmatia,onezerotwodalmatin,onezerotwodalmatas,onezerotwodalmatie,onehundredandtwoda,lacaricadeionezero,onezerotwoechtedal,onezerotwodalmatyn,onezerotwo dalmacy,


### merging

In [64]:
# add EU
imdb_tickets_df = pd.merge(imdb_df, eu_merged[["country", "EU_since_1996", "EU_tickets_sold", "tconst"]], how="left", on="tconst")

In [65]:
display(imdb_df.shape)
imdb_tickets_df.shape

(188163, 27)

(188163, 30)

In [66]:
print(imdb_tickets_df[imdb_tickets_df["EU_tickets_sold"].notna()].shape)
print(eu_merged.shape)

(4533, 30)
(4533, 42)


### Fixing remaining NorthAmerican problems

In [77]:
pd.set_option("display.max_columns", 50)

In [98]:
na_merged[na_merged["tconst"].duplicated(keep=False)].sort_values(by="tconst")

Unnamed: 0,title,release_date,distributor,NA_gross_sales,NA_tickets_sold,year,title_merge,tconst,primary_title,original_title,runtime,num_votes,average_rating,genres_count,genre,genre2,genre3,acting,acting2,acting3,acting4,acting5,directors_count,director_name,director2_name,director3_name,writers_count,writer_name,writer2_name,writer3_name,is_adult,original_title_merge,primary_title_merge,CA,DE,ES,FR,GB,IT,NL,PL,TR,ALTER


In [68]:
# drop two nines
wrong_nine_one_mask = (na_merged["title"] == "9") & (na_merged["primary_title"] == "Nine")
na_merged[wrong_nine_one_mask]

Unnamed: 0,title,release_date,distributor,NA_gross_sales,NA_tickets_sold,year,title_merge,tconst,primary_title,original_title,runtime,num_votes,average_rating,genres_count,genre,genre2,genre3,acting,acting2,acting3,...,director_name,director2_name,director3_name,writers_count,writer_name,writer2_name,writer3_name,is_adult,original_title_merge,primary_title_merge,CA,DE,ES,FR,GB,IT,NL,PL,TR,ALTER
64,9,2009-09-09,Focus Features,31749894,4233319,2009.0,nine,tt0875034,Nine,Nine,118,45461.0,5.8,3.0,Drama,Musical,Romance,Daniel Day-Lewis,Marion Cotillard,Penélope Cruz,...,Rob Marshall,,,5.0,Michael Tolkin,Anthony Minghella,,0.0,nine,nine,neuf,ninediefrauenmeine,nine,nine,nine,nine,nine,ninedziewiec,nine,


In [69]:
wrong_nine_one_mask_two = (na_merged["title"] == "Nine") & (na_merged["primary_title"] == "9")
na_merged[wrong_nine_one_mask_two]

Unnamed: 0,title,release_date,distributor,NA_gross_sales,NA_tickets_sold,year,title_merge,tconst,primary_title,original_title,runtime,num_votes,average_rating,genres_count,genre,genre2,genre3,acting,acting2,acting3,...,director_name,director2_name,director3_name,writers_count,writer_name,writer2_name,writer3_name,is_adult,original_title_merge,primary_title_merge,CA,DE,ES,FR,GB,IT,NL,PL,TR,ALTER
2344,Nine,2009-12-25,Weinstein Co.,19676965,2584505,2009.0,nine,tt0472033,9,9,79,149442.0,7.0,3.0,Action,Adventure,Animation,Elijah Wood,Jennifer Connelly,Crispin Glover,...,Shane Acker,,,3.0,Pamela Pettler,Shane Acker,Ben Gluck,0.0,nine,nine,numeronine,nine,numeronine,numeronine,nine,nine,nine,nine,nine,


In [70]:
na_merged.drop(na_merged[wrong_nine_one_mask].index, inplace=True)

In [71]:
na_merged.drop(na_merged[wrong_nine_one_mask_two].index, inplace=True)

  na_merged.drop(na_merged[wrong_nine_one_mask_two].index, inplace=True)


In [72]:
# drop wrong paranormal activity
wrong_paranormal_mask = (na_merged["title"] == "Paranormal Activity") & (na_merged["primary_title"] == "Paranormal Activity 2")
na_merged[wrong_paranormal_mask]

Unnamed: 0,title,release_date,distributor,NA_gross_sales,NA_tickets_sold,year,title_merge,tconst,primary_title,original_title,runtime,num_votes,average_rating,genres_count,genre,genre2,genre3,acting,acting2,acting3,...,director_name,director2_name,director3_name,writers_count,writer_name,writer2_name,writer3_name,is_adult,original_title_merge,primary_title_merge,CA,DE,ES,FR,GB,IT,NL,PL,TR,ALTER
2474,Paranormal Activity,2009-09-25,Paramount Pictures,107854596,14380613,,paranormalactivity,tt1536044,Paranormal Activity 2,Paranormal Activity 2,91,110867.0,5.7,2.0,Horror,Mystery,,Katie Featherston,Micah Sloat,Molly Ephraim,...,Tod Williams,,,4.0,Michael R. Perry,Christopher Landon,,0.0,paranormalactivity,paranormalactivity,activiteparanormal,paranormalactivity,paranormalactivity,paranormalactivity,paranormalactivity,paranormalactivity,,,paranormal aktivit,


In [73]:
na_merged.drop(na_merged[wrong_paranormal_mask].index, inplace=True)

In [80]:
# drop wrong The Divergent
wrong_divergent_mask = (na_merged["title"] == "The Divergent Series: Alleg…") & (na_merged["primary_title"] == "The Divergent Series: Insurgent")
na_merged[wrong_divergent_mask]

Unnamed: 0,title,release_date,distributor,NA_gross_sales,NA_tickets_sold,year,title_merge,tconst,primary_title,original_title,runtime,num_votes,average_rating,genres_count,genre,genre2,genre3,acting,acting2,acting3,acting4,acting5,directors_count,director_name,director2_name,director3_name,writers_count,writer_name,writer2_name,writer3_name,is_adult,original_title_merge,primary_title_merge,CA,DE,ES,FR,GB,IT,NL,PL,TR,ALTER
3398,The Divergent Series: Alleg…,2016-03-18,Lionsgate,66184051,7651335,,thedivergentseries,tt2908446,The Divergent Series: Insurgent,Insurgent,119,258469.0,6.2,3.0,Action,Adventure,Sci-Fi,Shailene Woodley,Ansel Elgort,Theo James,Kate Winslet,Jai Courtney,1.0,Robert Schwentke,,,4.0,Brian Duffield,Akiva Goldsman,,0.0,insurgent,thedivergentseries,,,,,,,,,,


In [81]:
na_merged.drop(na_merged[wrong_divergent_mask].index, inplace=True)

Sum over duobled released movies

In [92]:
# sum LOR sales across both release dates
correct_lor_mask = (na_merged["title"] == "The Lord of the Rings: The …") & (na_merged["release_date"] == "2003-12-12")
smaller_lor_mask = (na_merged["title"] == "The Lord of the Rings: The …") & (na_merged["release_date"] == "2003-12-17")
display(na_merged[correct_lor_mask])
na_merged[smaller_lor_mask]

Unnamed: 0,title,release_date,distributor,NA_gross_sales,NA_tickets_sold,year,title_merge,tconst,primary_title,original_title,runtime,num_votes,average_rating,genres_count,genre,genre2,genre3,acting,acting2,acting3,acting4,acting5,directors_count,director_name,director2_name,director3_name,writers_count,writer_name,writer2_name,writer3_name,is_adult,original_title_merge,primary_title_merge,CA,DE,ES,FR,GB,IT,NL,PL,TR,ALTER
3689,The Lord of the Rings: The …,2003-12-12,New Line,457686954,75399422,2003.0,thelordoftheringst,tt0167260,The Lord of the Rings: The Return of the King,The Lord of the Rings: The Return of the King,201,2009614.0,9.0,3.0,Action,Adventure,Drama,Elijah Wood,Viggo Mortensen,Ian McKellen,Orlando Bloom,Noel Appleby,1.0,Peter Jackson,,,4.0,J.R.R. Tolkien,Fran Walsh,,0.0,thelordoftheringst,thelordoftheringst,thelordoftheringst,derherrderringedie,elsenordelosanillo,leseigneurdesannea,thereturnoftheking,ilsignoredeglianel,indebanvanderingde,wladcapierscienipo,yuzuklerin efendis,


Unnamed: 0,title,release_date,distributor,NA_gross_sales,NA_tickets_sold,year,title_merge,tconst,primary_title,original_title,runtime,num_votes,average_rating,genres_count,genre,genre2,genre3,acting,acting2,acting3,acting4,acting5,directors_count,director_name,director2_name,director3_name,writers_count,writer_name,writer2_name,writer3_name,is_adult,original_title_merge,primary_title_merge,CA,DE,ES,FR,GB,IT,NL,PL,TR,ALTER
3690,The Lord of the Rings: The …,2003-12-17,New Line,378203410,62218072,2003.0,thelordoftheringst,tt0167260,The Lord of the Rings: The Return of the King,The Lord of the Rings: The Return of the King,201,2009614.0,9.0,3.0,Action,Adventure,Drama,Elijah Wood,Viggo Mortensen,Ian McKellen,Orlando Bloom,Noel Appleby,1.0,Peter Jackson,,,4.0,J.R.R. Tolkien,Fran Walsh,,0.0,thelordoftheringst,thelordoftheringst,thelordoftheringst,derherrderringedie,elsenordelosanillo,leseigneurdesannea,thereturnoftheking,ilsignoredeglianel,indebanvanderingde,wladcapierscienipo,yuzuklerin efendis,


In [90]:
na_merged.loc[correct_lor_mask, "NA_gross_sales"] =  (na_merged.loc[smaller_lor_mask, "NA_gross_sales"].values + na_merged.loc[correct_lor_mask, "NA_gross_sales"].values)[0]

In [91]:
na_merged.loc[correct_lor_mask, "NA_tickets_sold"] = (na_merged.loc[smaller_lor_mask, "NA_tickets_sold"].values + na_merged.loc[correct_lor_mask, "NA_tickets_sold"].values)[0]

In [93]:
na_merged.drop(na_merged[smaller_lor_mask].index, inplace=True)

In [95]:
# sum Twilight sales across both release dates
correct_twi_mask = (na_merged["title"] == "The Twilight Saga: Eclipse") & (na_merged["release_date"] == "2010-06-30")
smaller_twi_mask = (na_merged["title"] == "The Twilight Saga: Twilight…") & (na_merged["release_date"] == "2010-06-29")
display(na_merged[correct_twi_mask])
na_merged[smaller_twi_mask]

Unnamed: 0,title,release_date,distributor,NA_gross_sales,NA_tickets_sold,year,title_merge,tconst,primary_title,original_title,runtime,num_votes,average_rating,genres_count,genre,genre2,genre3,acting,acting2,acting3,acting4,acting5,directors_count,director_name,director2_name,director3_name,writers_count,writer_name,writer2_name,writer3_name,is_adult,original_title_merge,primary_title_merge,CA,DE,ES,FR,GB,IT,NL,PL,TR,ALTER
4014,The Twilight Saga: Eclipse,2010-06-30,Summit Entertainment,300531751,38090209,2010.0,thetwilightsagaecl,tt1325004,The Twilight Saga: Eclipse,The Twilight Saga: Eclipse,124,263704.0,5.1,3.0,Action,Adventure,Drama,Kristen Stewart,Robert Pattinson,Taylor Lautner,Xavier Samuel,Billy Burke,1.0,David Slade,,,2.0,Melissa Rosenberg,Stephenie Meyer,,0.0,thetwilightsagaecl,thetwilightsagaecl,thetwilightsagaecl,eclipsebisszumaben,eclipsi,twilightchapitreth,thetwilightsagaecl,thetwilightsagaecl,thetwilightsagaecl,sagazmierzchzacmie,alacakaranlik efsa,


Unnamed: 0,title,release_date,distributor,NA_gross_sales,NA_tickets_sold,year,title_merge,tconst,primary_title,original_title,runtime,num_votes,average_rating,genres_count,genre,genre2,genre3,acting,acting2,acting3,acting4,acting5,directors_count,director_name,director2_name,director3_name,writers_count,writer_name,writer2_name,writer3_name,is_adult,original_title_merge,primary_title_merge,CA,DE,ES,FR,GB,IT,NL,PL,TR,ALTER
4016,The Twilight Saga: Twilight…,2010-06-29,Summit Entertainment,2385237,302311,2010.0,thetwilightsagaecl,tt1325004,The Twilight Saga: Eclipse,The Twilight Saga: Eclipse,124,263704.0,5.1,3.0,Action,Adventure,Drama,Kristen Stewart,Robert Pattinson,Taylor Lautner,Xavier Samuel,Billy Burke,1.0,David Slade,,,2.0,Melissa Rosenberg,Stephenie Meyer,,0.0,thetwilightsagaecl,thetwilightsagaecl,thetwilightsagaecl,eclipsebisszumaben,eclipsi,twilightchapitreth,thetwilightsagaecl,thetwilightsagaecl,thetwilightsagaecl,sagazmierzchzacmie,alacakaranlik efsa,


In [96]:
na_merged.loc[correct_twi_mask, "NA_gross_sales"] =  (na_merged.loc[smaller_twi_mask, "NA_gross_sales"].values + na_merged.loc[correct_twi_mask, "NA_gross_sales"].values)[0]
na_merged.loc[correct_twi_mask, "NA_tickets_sold"] =  (na_merged.loc[smaller_twi_mask, "NA_tickets_sold"].values + na_merged.loc[correct_twi_mask, "NA_tickets_sold"].values)[0]

In [97]:
na_merged.drop(na_merged[smaller_twi_mask].index, inplace=True)

### After fixing, merging NA, too

In [99]:
imdb_tickets_df.head()

Unnamed: 0,tconst,primary_title,original_title,year,runtime,num_votes,average_rating,genres_count,genre,genre2,genre3,acting,acting2,acting3,acting4,acting5,directors_count,director_name,director2_name,director3_name,writers_count,writer_name,writer2_name,writer3_name,is_adult,original_title_merge,primary_title_merge,country,EU_since_1996,EU_tickets_sold
0,tt0013274,Istoriya grazhdanskoy voyny,Istoriya grazhdanskoy voyny,2021,94,73.0,6.7,1.0,Documentary,,,,,,,,2.0,Nikolai Izvolov,Dziga Vertov,,,,,,0,istoriya grazhdanskoy voyny,istoriya grazhdanskoy voyny,,,
1,tt0015414,La tierra de los toros,La tierra de los toros,2000,60,17.0,5.4,,,,,,,,,,1.0,Musidora,,,,,,,0,la tierra de los toros,la tierra de los toros,,,
2,tt0035423,Kate & Leopold,Kate & Leopold,2001,118,89944.0,6.4,3.0,Comedy,Fantasy,Romance,Meg Ryan,Hugh Jackman,Liev Schreiber,Breckin Meyer,Natasha Lyonne,1.0,James Mangold,,,2.0,Steven Rogers,James Mangold,,0,kate leopold,kate leopold,US,2497656.0,2481644.0
3,tt0062336,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,2020,70,190.0,6.5,1.0,Drama,,,Rubén Sotoconil,Claudia Paz,Luis Alarcón,Shenda Román,Luis Vilches,2.0,Raúl Ruiz,Valeria Sarmiento,,2.0,Raúl Ruiz,Omar Saavedra Santis,,0,el tango del viudo y su espejo deformante,the tango of the widower and its distorting mi...,,,
4,tt0069049,The Other Side of the Wind,The Other Side of the Wind,2018,122,8143.0,6.7,1.0,Drama,,,John Huston,Oja Kodar,Peter Bogdanovich,Susan Strasberg,Norman Foster,1.0,Orson Welles,,,2.0,Orson Welles,Oja Kodar,,0,the other side of the wind,the other side of the wind,,,


In [100]:
# add NA
imdb_tickets_df = pd.merge(imdb_tickets_df, na_merged[["distributor", "NA_gross_sales", "NA_tickets_sold", "tconst"]], how="left", on="tconst")

In [101]:
display(imdb_df.shape)
imdb_tickets_df.shape

(188163, 27)

(188163, 33)

In [102]:
imdb_tickets_df["tconst"].duplicated(keep=False).sum()

0

In [108]:
tickets_data_mask = (imdb_tickets_df["EU_tickets_sold"].notnull()) | (imdb_tickets_df["NA_tickets_sold"].notnull())
imdb_tickets_df[tickets_data_mask]

Unnamed: 0,tconst,primary_title,original_title,year,runtime,num_votes,average_rating,genres_count,genre,genre2,genre3,acting,acting2,acting3,acting4,acting5,directors_count,director_name,director2_name,director3_name,writers_count,writer_name,writer2_name,writer3_name,is_adult,original_title_merge,primary_title_merge,country,EU_since_1996,EU_tickets_sold,distributor,NA_gross_sales,NA_tickets_sold
2,tt0035423,Kate & Leopold,Kate & Leopold,2001,118,89944.0,6.4,3.0,Comedy,Fantasy,Romance,Meg Ryan,Hugh Jackman,Liev Schreiber,Breckin Meyer,Natasha Lyonne,1.0,James Mangold,,,2.0,Steven Rogers,James Mangold,,0,kate leopold,kate leopold,US,2497656.0,2481644.0,Miramax,47095453.0,8245453.0
235,tt0117786,Mr. Nice Guy,Yat goh ho yan,1997,88,29154.0,6.2,3.0,Action,Adventure,Comedy,Jackie Chan,Richard Norton,Miki Lee,Karen McLymont,Gabrielle Fitzpatrick,1.0,Sammo Kam-Bo Hung,,,2.0,Fibe Ma,Edward Tang,,0,yat goh ho yan,mr nice guy,,,,New Line,12716953.0,2711503.0
264,tt0118301,Dead Man on Campus,Dead Man on Campus,1998,96,16474.0,6.0,1.0,Comedy,,,Tom Everett Scott,Mark-Paul Gosselaar,Poppy Montgomery,Lochlyn Munro,Randy Pearlstein,1.0,Alan Cohn,,,4.0,Anthony Abrams,Adam Larson Broder,,0,dead man on campus,dead man on campus,,,,Paramount Pictures,15064948.0,3212142.0
290,tt0118564,Affliction,Affliction,1997,114,19572.0,6.9,3.0,Drama,Mystery,Thriller,Nick Nolte,Sissy Spacek,James Coburn,Brigid Tierney,Holmes Osborne,1.0,Paul Schrader,,,2.0,Russell Banks,Paul Schrader,,0,affliction,affliction,,,,Lionsgate,6238175.0,1227987.0
307,tt0118589,Glitter,Glitter,2001,104,24170.0,2.4,3.0,Drama,Music,Romance,Mariah Carey,Eric Benét,Max Beesley,Da Brat,Tia Texada,1.0,Vondie Curtis-Hall,,,2.0,Cheryl L. West,Kate Lanier,,0,glitter,glitter,,,,20th Century Fox,4273372.0,755012.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187999,tt9883996,Dream Horse,Dream Horse,2020,113,7116.0,6.9,3.0,Biography,Comedy,Drama,Toni Collette,Owen Teale,Alan David,Lynda Baron,Damian Lewis,1.0,Euros Lyn,,,1.0,Neil McKay,,,0,dream horse,dream horse,,,,Bleecker Street,2913328.0,286462.0
188069,tt9896876,India Sweets and Spices,India Sweets and Spices,2021,101,868.0,6.1,1.0,Comedy,,,Sophia Ali,Rish Shah,Manisha Koirala,Adil Hussain,Christina Burdette,1.0,Geeta Malik,,,1.0,Geeta Malik,,,0,india sweets and spices,india sweets and spices,,,,Bleecker Street,288714.0,28388.0
188124,tt9907782,The Cursed,Eight for Silver,2021,111,18160.0,6.2,3.0,Fantasy,Horror,Mystery,Boyd Holbrook,Kelly Reilly,Alistair Petrie,Roxane Duran,Nigel Betts,1.0,Sean Ellis,,,1.0,Sean Ellis,,,0,eight for silver,the cursed,,,,LD Entertainment,4588389.0,435744.0
188125,tt9908390,Le lion,Le lion,2020,95,1443.0,5.5,1.0,Comedy,,,Dany Boon,Philippe Katerine,Anne Serra,Samuel Jouy,Sophie Verbeeck,1.0,Ludovic Colbeau-Justin,,,2.0,Alexandre Coquelle,Matthieu Le Naour,,0,le lion,le lion,"FR, BE",512213.0,511253.0,,,


In [109]:
imdb_tickets_df = imdb_tickets_df.loc[tickets_data_mask,:]
imdb_tickets_df.shape

(6341, 33)

In [110]:
imdb_tickets_df.head()

Unnamed: 0,tconst,primary_title,original_title,year,runtime,num_votes,average_rating,genres_count,genre,genre2,genre3,acting,acting2,acting3,acting4,acting5,directors_count,director_name,director2_name,director3_name,writers_count,writer_name,writer2_name,writer3_name,is_adult,original_title_merge,primary_title_merge,country,EU_since_1996,EU_tickets_sold,distributor,NA_gross_sales,NA_tickets_sold
2,tt0035423,Kate & Leopold,Kate & Leopold,2001,118,89944.0,6.4,3.0,Comedy,Fantasy,Romance,Meg Ryan,Hugh Jackman,Liev Schreiber,Breckin Meyer,Natasha Lyonne,1.0,James Mangold,,,2.0,Steven Rogers,James Mangold,,0,kate leopold,kate leopold,US,2497656.0,2481644.0,Miramax,47095453.0,8245453.0
235,tt0117786,Mr. Nice Guy,Yat goh ho yan,1997,88,29154.0,6.2,3.0,Action,Adventure,Comedy,Jackie Chan,Richard Norton,Miki Lee,Karen McLymont,Gabrielle Fitzpatrick,1.0,Sammo Kam-Bo Hung,,,2.0,Fibe Ma,Edward Tang,,0,yat goh ho yan,mr nice guy,,,,New Line,12716953.0,2711503.0
264,tt0118301,Dead Man on Campus,Dead Man on Campus,1998,96,16474.0,6.0,1.0,Comedy,,,Tom Everett Scott,Mark-Paul Gosselaar,Poppy Montgomery,Lochlyn Munro,Randy Pearlstein,1.0,Alan Cohn,,,4.0,Anthony Abrams,Adam Larson Broder,,0,dead man on campus,dead man on campus,,,,Paramount Pictures,15064948.0,3212142.0
290,tt0118564,Affliction,Affliction,1997,114,19572.0,6.9,3.0,Drama,Mystery,Thriller,Nick Nolte,Sissy Spacek,James Coburn,Brigid Tierney,Holmes Osborne,1.0,Paul Schrader,,,2.0,Russell Banks,Paul Schrader,,0,affliction,affliction,,,,Lionsgate,6238175.0,1227987.0
307,tt0118589,Glitter,Glitter,2001,104,24170.0,2.4,3.0,Drama,Music,Romance,Mariah Carey,Eric Benét,Max Beesley,Da Brat,Tia Texada,1.0,Vondie Curtis-Hall,,,2.0,Cheryl L. West,Kate Lanier,,0,glitter,glitter,,,,20th Century Fox,4273372.0,755012.0


In [111]:
imdb_tickets_df.drop(columns=["original_title_merge", "primary_title_merge"], inplace=True)
imdb_tickets_df.head()

Unnamed: 0,tconst,primary_title,original_title,year,runtime,num_votes,average_rating,genres_count,genre,genre2,genre3,acting,acting2,acting3,acting4,acting5,directors_count,director_name,director2_name,director3_name,writers_count,writer_name,writer2_name,writer3_name,is_adult,country,EU_since_1996,EU_tickets_sold,distributor,NA_gross_sales,NA_tickets_sold
2,tt0035423,Kate & Leopold,Kate & Leopold,2001,118,89944.0,6.4,3.0,Comedy,Fantasy,Romance,Meg Ryan,Hugh Jackman,Liev Schreiber,Breckin Meyer,Natasha Lyonne,1.0,James Mangold,,,2.0,Steven Rogers,James Mangold,,0,US,2497656.0,2481644.0,Miramax,47095453.0,8245453.0
235,tt0117786,Mr. Nice Guy,Yat goh ho yan,1997,88,29154.0,6.2,3.0,Action,Adventure,Comedy,Jackie Chan,Richard Norton,Miki Lee,Karen McLymont,Gabrielle Fitzpatrick,1.0,Sammo Kam-Bo Hung,,,2.0,Fibe Ma,Edward Tang,,0,,,,New Line,12716953.0,2711503.0
264,tt0118301,Dead Man on Campus,Dead Man on Campus,1998,96,16474.0,6.0,1.0,Comedy,,,Tom Everett Scott,Mark-Paul Gosselaar,Poppy Montgomery,Lochlyn Munro,Randy Pearlstein,1.0,Alan Cohn,,,4.0,Anthony Abrams,Adam Larson Broder,,0,,,,Paramount Pictures,15064948.0,3212142.0
290,tt0118564,Affliction,Affliction,1997,114,19572.0,6.9,3.0,Drama,Mystery,Thriller,Nick Nolte,Sissy Spacek,James Coburn,Brigid Tierney,Holmes Osborne,1.0,Paul Schrader,,,2.0,Russell Banks,Paul Schrader,,0,,,,Lionsgate,6238175.0,1227987.0
307,tt0118589,Glitter,Glitter,2001,104,24170.0,2.4,3.0,Drama,Music,Romance,Mariah Carey,Eric Benét,Max Beesley,Da Brat,Tia Texada,1.0,Vondie Curtis-Hall,,,2.0,Cheryl L. West,Kate Lanier,,0,,,,20th Century Fox,4273372.0,755012.0


In [None]:
# Correction of dtypes in imdb_df
# TO INTEGER
imdb_tickets_df[["runtime","num_votes", "genres_count", "directors_count", "writers_count", "EU_since_1996", "EU_tickets_sold", "NA_tickets_sold"]] = imdb_tickets_df[["runtime","num_votes", "genres_count", "directors_count", "writers_count", "EU_since_1996", "EU_tickets_sold", "NA_tickets_sold"]].astype("Int64")
imdb_tickets_df.info()
# TO BOOLEAN
imdb_tickets_df["is_adult"] = imdb_tickets_df["is_adult"].astype(bool)
## As there are just false values in the column we drop it
imdb_tickets_df.drop(columns="is_adult", inplace = True)

### Upload final tickets DF

In [112]:
table_name = 'IMDB_tickets_data'
engine = sqlf.get_engine()
engine

Engine(postgresql://user:***@host/database)

In [113]:
schema

'capstone_24_4_group1'

In [114]:
# Write records stored in a dataframe to SQL database
if engine != None:
    try:
        imdb_tickets_df.to_sql(table_name, # Name of SQL table
                        con=engine, # Engine or connection
                        if_exists='replace', # Drop the table before inserting new values 
                        schema=schema, # your class schema
                        index=False, # Write DataFrame index as a column
                        chunksize=5000, # Specify the number of rows in each batch to be written at a time
                        method='multi') # Pass multiple values in a single INSERT clause
        print(f"The {table_name} table was imported successfully.")
    # Error handling
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
        engine = None
else:
    print("shit")

The IMDB_tickets_data table was imported successfully.


### Export t_cost_list as csv for usage in R

In [123]:
scraping_t_list = imdb_tickets_df["tconst"]
scraping_t_list

2         tt0035423
235       tt0117786
264       tt0118301
290       tt0118564
307       tt0118589
            ...    
187999    tt9883996
188069    tt9896876
188124    tt9907782
188125    tt9908390
188136    tt9911196
Name: tconst, Length: 6341, dtype: object

In [118]:
scraping_t_list.to_csv("Data/title.principals/scraping_t_list.csv")

AttributeError: 'numpy.ndarray' object has no attribute 'to_csv'

In [124]:
box_office_data = pd.read_csv("Data/title.principals/box_office_data.csv")

In [125]:
box_office_data.head()

Unnamed: 0,tconst,region,value,release_group
0,tt0035423,Domestic,"$47,121,859",2.0
1,tt0035423,Germany,"$4,482,954",2.0
2,tt0035423,Italy,"$2,050,485",2.0
3,tt0035423,Spain,"$1,194,549",2.0
4,tt0035423,Czech Republic,"$732,897",2.0


In [126]:
table_name = 'box_office_data'

In [127]:
# Write records stored in a dataframe to SQL database
if engine != None:
    try:
        box_office_data.to_sql(table_name, # Name of SQL table
                        con=engine, # Engine or connection
                        if_exists='replace', # Drop the table before inserting new values 
                        schema=schema, # your class schema
                        index=False, # Write DataFrame index as a column
                        chunksize=5000, # Specify the number of rows in each batch to be written at a time
                        method='multi') # Pass multiple values in a single INSERT clause
        print(f"The {table_name} table was imported successfully.")
    # Error handling
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
        engine = None
else:
    print("shit")

The box_office_data table was imported successfully.
