# Combine only most critical steps of previous merging book into one.

In [1]:
# import all packages
import pandas as pd
import numpy as np
import psycopg2 as psycopg2
import sql_functions as sqlf
import string as string
from unidecode import unidecode
from num2words import num2words

### Import data

In [2]:
schema = "capstone_24_4_group1"
schema

'capstone_24_4_group1'

In [3]:
imdb_query = f'''   SELECT *
                    FROM {schema}."IMDB_data"
                    '''
eu_query = f'''   SELECT *
                    FROM {schema}."movie_data_EU"
                    '''
na_query = f'''   SELECT *
                    FROM {schema}."movie_data_NA"
                    '''
aka_query = f'''   SELECT *
                    FROM {schema}."imdb_akas_data"
                    '''

In [6]:
imdb_df = sqlf.get_dataframe(imdb_query)
eu_df = sqlf.get_dataframe(eu_query)
na_df = sqlf.get_dataframe(na_query)
aka_df = sqlf.get_dataframe(aka_query)

In [7]:
na_df.columns = ['title', 'release_date', 'distributor', 'gross_sales', 'tickets_sold', 'year']

In [8]:
# filter for years
eu_df = eu_df[eu_df["year"] >= 1998].reset_index(drop=True)
na_df = na_df[na_df["year"] >= 1998].reset_index(drop=True)

In [10]:
# Lower and change non-latin characters
imdb_df["original_title_merge"] = imdb_df["original_title"].str.lower().apply(unidecode)
eu_df["title_merge"] = eu_df["title"].str.lower().apply(unidecode)
na_df["title_merge"] = na_df["title"].str.lower().apply(unidecode)
imdb_df["primary_title_merge"] = imdb_df["primary_title"].str.lower().apply(unidecode)

In [11]:
# remove punctuations
imdb_df["original_title_merge"] = imdb_df["original_title_merge"].str.translate(str.maketrans("","",string.punctuation))
imdb_df["primary_title_merge"] = imdb_df["primary_title_merge"].str.translate(str.maketrans("","",string.punctuation))
eu_df["title_merge"] = eu_df["title_merge"].str.translate(str.maketrans("","",string.punctuation))
na_df["title_merge"] = na_df["title_merge"].str.translate(str.maketrans("","",string.punctuation))

adjust AKA_df

In [12]:
# lower, non-latin and special characters for aka_df
for column in aka_df.iloc[:,1:]:
    aka_df.loc[:,column] = aka_df[column].str.lower()
    aka_df.loc[:,column] = aka_df[column].astype(str).apply(unidecode)
    aka_df.loc[:,column] = aka_df.loc[:,column].str.translate(str.maketrans("","",string.punctuation))

In [13]:
# Merge aka with imdb
imdb_aka_df = pd.merge(imdb_df, aka_df, how="left", on="tconst")

### Remove double Spaces

In [14]:
for column in imdb_aka_df.iloc[:,-12:-2]:
    display(imdb_aka_df[column].str.contains("  ").sum())
display(eu_df[eu_df["title_merge"].str.contains("  ")].shape)
na_df[na_df["title_merge"].str.contains("  ")].shape

4809

4517

741

7110

338

1452

1489

3479

312

474

(156, 6)

(53, 7)

In [15]:
for column in imdb_aka_df.iloc[:,-12:-2]:
    imdb_aka_df.loc[:,column] = imdb_aka_df[column].str.replace("  ", " ")
    imdb_aka_df.loc[:,column] = imdb_aka_df[column].str.replace("  ", " ")
eu_df.loc[:,"title_merge"] = eu_df["title_merge"].str.replace("  ", " ")
na_df.loc[:,"title_merge"] = na_df["title_merge"].str.replace("  ", " ")

In [16]:
for column in imdb_aka_df.iloc[:,-12:-2]:
    display(imdb_aka_df[column].str.contains("  ").sum())
display(eu_df[eu_df["title_merge"].str.contains("  ")].shape)
na_df[na_df["title_merge"].str.contains("  ")].shape

0

0

0

0

0

0

0

0

0

0

(2, 6)

(0, 7)

### Remove Roman and Latin Numbers

In [17]:
testing_dict = {r'(^ii | ii | ii$)': "two",
                r'(^iii | iii | iii$)': "three",
                r'(^iv | iv | iv$)': "four",
                r'(^v | v | v$)': "five",
                r'(^vi | vi | vi$)': "six",
                r'(^vii | vii | vii$)': "seven",
                r'(^viii | viii | viii$)': "eight",
                r'(^ix | ix | ix$)': "nine",
                r'(^x | x | x$)': "ten",}

In [18]:
imdb_aka_df[imdb_aka_df["original_title"].str.contains("Jurassic Park")]

Unnamed: 0,tconst,primary_title,original_title,year,runtime,num_votes,average_rating,genres_count,genre,genre2,...,CA,DE,ES,FR,GB,IT,NL,PL,TR,ALTER
890,tt0119567,The Lost World: Jurassic Park,The Lost World: Jurassic Park,1997,129,449430.0,6.6,3.0,Action,Adventure,...,the lost world jurassic park,vergessene welt jurassic park,el mundo perdido jurassic park,le monde perdu,the lost world jurassic park,il mondo perduto jurassic park,the lost world jurassic park,park jurajski ii,jurassic park 2 kayip dunya,jurassic park ii
3807,tt0163025,Jurassic Park III,Jurassic Park III,2001,92,344970.0,5.9,3.0,Action,Adventure,...,le parc jurassique iii,jurassic park iii,jurassic park iii parque jurasico iii,jurassic park iii,jurassic park iii,jurassic park iii,jurassic park iii,jurassic park iii,jurassic park 3,jurassic park 3
141839,tt4130956,Jurassic Park: Operation Rebirth,Jurassic Park: Operation Rebirth,2014,70,106.0,6.7,1.0,Thriller,,...,jurassic park operation rebirth,,,,jurassic park operation rebirth,,,,,


In [19]:
list_titles = ['original_title_merge', 'primary_title_merge', 'CA','DE', 'ES', 'FR', 'GB', 'IT', 'NL', 'PL', 'TR', 'ALTER']

for title in list_titles:
    for key, value in testing_dict.items():
        imdb_aka_df.loc[:,title] = imdb_aka_df[title].str.replace(key, value, regex=True)

for key, value in testing_dict.items():
        eu_df.loc[:,"title_merge"] = eu_df["title_merge"].str.replace(key, value, regex=True)

for key, value in testing_dict.items():
        na_df.loc[:,"title_merge"] = na_df["title_merge"].str.replace(key, value, regex=True)

In [20]:
imdb_aka_df[imdb_aka_df["original_title"].str.contains("Jurassic Park")]

Unnamed: 0,tconst,primary_title,original_title,year,runtime,num_votes,average_rating,genres_count,genre,genre2,...,CA,DE,ES,FR,GB,IT,NL,PL,TR,ALTER
890,tt0119567,The Lost World: Jurassic Park,The Lost World: Jurassic Park,1997,129,449430.0,6.6,3.0,Action,Adventure,...,the lost world jurassic park,vergessene welt jurassic park,el mundo perdido jurassic park,le monde perdu,the lost world jurassic park,il mondo perduto jurassic park,the lost world jurassic park,park jurajskitwo,jurassic park 2 kayip dunya,jurassic parktwo
3807,tt0163025,Jurassic Park III,Jurassic Park III,2001,92,344970.0,5.9,3.0,Action,Adventure,...,le parc jurassiquethree,jurassic parkthree,jurassic parkthreeparque jurasicothree,jurassic parkthree,jurassic parkthree,jurassic parkthree,jurassic parkthree,jurassic parkthree,jurassic park 3,jurassic park 3
141839,tt4130956,Jurassic Park: Operation Rebirth,Jurassic Park: Operation Rebirth,2014,70,106.0,6.7,1.0,Thriller,,...,jurassic park operation rebirth,,,,jurassic park operation rebirth,,,,,


In [21]:
list_titles = ['original_title_merge', 'primary_title_merge','CA', 'DE', 'ES', 'FR', 'GB', 'IT', 'NL', 'PL', 'TR', 'ALTER']

for title in list_titles:
    imdb_aka_df.loc[:,title] = imdb_aka_df[title].str.replace(r'\d', lambda x: num2words(int(x.group())), regex=True)

eu_df.loc[:,"title_merge"] = eu_df["title_merge"].str.replace(r'\d', lambda x: num2words(int(x.group())), regex=True)

na_df.loc[:,"title_merge"] = na_df["title_merge"].str.replace(r'\d', lambda x: num2words(int(x.group())), regex=True)

Drop Oscar "Movies"

In [22]:
na_df.drop(na_df[na_df["distributor"].str.contains("Shorts")].index, inplace=True)

# THE FUNCTION

In [23]:
def ultimate_merge_func(data, imdb_base, number_of_columns=6, short=False):
    '''
    Merges our Tickets Sold Dataframes First on each title for year and year +/- 1

    Input:
        ticket_data ... either eu or na depending on what we are testing
        base_data = imdb_df
        number_of_columns = 6 ... How many columns does the df with the ticket data have
        short = False ... if true, shorten the titles to merge to ... 25?

    Output:
        Returns: Dataframe with all matches from both columns
        Prints: Unmatched rows
    '''
    ticket_data = data.copy()
    base_data = imdb_base.copy()

    list_titles = ['original_title_merge', 'primary_title_merge','CA', 'DE', 'ES', 'FR', 'GB', 'IT', 'NL', 'PL', 'TR', "ALTER"]
    counter = 0

    # remove all spaces from all titles
    #imdb
    for column in base_data.iloc[:,-12:-2]:
        base_data.loc[:,column] = base_data[column].str.replace(" ", "")
        base_data.loc[:,column] = base_data[column].str.replace(" ", "")
    # ticket data
    ticket_data.loc[:,"title_merge"] = ticket_data["title_merge"].str.replace(" ", "")

    # add two year plus and minus as extra rows
    base_data["year_plus"] = base_data["year"] + 1
    base_data["year_minus"] = base_data["year"] + -1

    list_years = ["year_minus", "year", "year_plus"]

    if short == True:
        for title in list_titles:
            base_data.loc[:,title] = base_data[title].str[:18]
        ticket_data.loc[:,"title_merge"] = ticket_data["title_merge"].str[:18]

    for title in list_titles:
        if counter == 0:
            check_df = pd.merge(ticket_data,base_data, how="left", left_on=["title_merge", "year"], right_on=[title, "year"])

            
            # set-up third df for further calculations
            check_mask = check_df["tconst"].isnull()
            third_merge_df = check_df[check_mask].copy()

            # drop columns from first merge
            third_merge_df.drop(columns = third_merge_df.iloc[:,number_of_columns:], inplace=True)
            
            # 3_plus merge (inner)
            third_merge_plus_df = third_merge_df.merge(base_data, how="inner", left_on=["title_merge", "year"], right_on=[title, "year_plus"])
            # drop columns from first merge table that matched on 3_plus merge (inner)
            third_merge_plus_mask = check_df["title"].isin(list(third_merge_plus_df["title"]))
            check_df.drop(check_df[third_merge_plus_mask].index, inplace=True)
            # add fitting rows from 3_plus merge (inner) to first table
            check_df = pd.concat([check_df, third_merge_plus_df])
            check_df.reset_index(drop=True, inplace=True)
            check_df.drop(columns = "year_x", inplace=True)

            # 3_minus merge (inner)
            third_merge_minus_df = third_merge_df.merge(base_data, how="inner", left_on=["title_merge", "year"], right_on=[title, "year_minus"])
            # drop columns from first merge table that matched on 3_minus merge (inner)
            third_merge_minus_mask = check_df["title"].isin(list(third_merge_minus_df["title"]))
            check_df.drop(check_df[third_merge_minus_mask].index, inplace=True)
            # add fitting rows from 3_plus merge (inner) to first table
            check_df = pd.concat([check_df, third_merge_minus_df])
            check_df.reset_index(drop=True, inplace=True)
            check_df.drop(columns = "year_x", inplace=True)

            counter += 1

        for year in list_years:
            # set-up third df for further calculations
            check_mask = check_df["tconst"].isnull()
            third_merge_df = check_df[check_mask].copy()

            # drop columns from first merge
            third_merge_df.drop(columns = third_merge_df.iloc[:,number_of_columns:], inplace=True)
            
            # merge again
            third_merge_plus_df = third_merge_df.merge(base_data, how="inner", left_on=["title_merge", "year"], right_on=[title, year])
            # drop columns from first merge table that matched on 3_plus merge (inner)
            third_merge_plus_mask = check_df["title"].isin(list(third_merge_plus_df["title"]))
            check_df.drop(check_df[third_merge_plus_mask].index, inplace=True)
            # add fitting rows from 3_plus merge (inner) to first table
            check_df = pd.concat([check_df, third_merge_plus_df])
            check_df.reset_index(drop=True, inplace=True)
        counter += 1

    # drop new year columns from final table
    check_df.drop(columns = check_df.iloc[:,-4:], inplace=True)

    # show unmatched rows
    check_mask = check_df["tconst"].isnull()
    display(check_df[check_mask])
    display(counter)
    
    return check_df

### Check if it works as expected

In [24]:
eu_check = ultimate_merge_func(eu_df, imdb_aka_df)

Unnamed: 0,title,producing_country,year,tickets_sold_since_1996,tickets_sold,title_merge,tconst,primary_title,original_title,runtime,...,CA,DE,ES,FR,GB,IT,NL,PL,TR,ALTER
21,15 Minutes (Fifteen Minutes),US,2001.0,2600419,2565397,onefiveminutesfifteenminutes,,,,,...,,,,,,,,,,
65,5X2 cinq fois deux,FR,2004.0,1150178,814942,fivextwocinqfoisdeux,,,,,...,,,,,,,,,,
293,Arthur et la guerre des deux mondes,FR,2010.0,3838378,3363498,arthuretlaguerredesdeuxmondes,,,,,...,,,,,,,,,,
296,Artificial Intelligence: AI,US,2001.0,8073605,8041431,artificialintelligenceai,,,,,...,,,,,,,,,,
320,Atatürk 1881 - 1919,TR,2023.0,1732649,1732649,ataturkoneeighteightoneonenineonenine,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3997,Tinker Bell and the Pirate Fairy,US,2014.0,5489166,5472379,tinkerbellandthepiratefairy,,,,,...,,,,,,,,,,
4193,Wallace & Gromit in The Curse of the Were-Rabbit,"GBinc, US",2005.0,14014825,13251997,wallacegromitinthecurseofthewererabbit,,,,,...,,,,,,,,,,
4204,Warum Männer nicht zuhören und Frauen schlecht...,DE,2007.0,1452342,1068475,warummannernichtzuhorenundfrauenschlechtereinp...,,,,,...,,,,,,,,,,
4322,Zeny v behu,CZ,2019.0,1705959,1675569,zenyfivebehu,,,,,...,,,,,,,,,,


13

In [25]:
na_check = ultimate_merge_func(na_df, imdb_aka_df, number_of_columns=7, short=True)

Unnamed: 0,title,release_date,distributor,gross_sales,tickets_sold,year,title_merge,tconst,primary_title,original_title,...,CA,DE,ES,FR,GB,IT,NL,PL,TR,ALTER
31,21 and Over,2013-03-01,Relativity,25682380,3158964,2013.0,twooneandover,,,,...,,,,,,,,,,
54,63 Up,2019-11-27,BritBox,183940,20037,2019.0,sixthreeup,,,,...,,,,,,,,,,
69,A Common Thread,2002-11-29,Odeon Films,5058187,838836,2002.0,acommonthread,,,,...,,,,,,,,,,
107,A Rescue of Little Eggs,2021-08-27,Lionsgate,927154,91166,2021.0,arescueoflittleegg,,,,...,,,,,,,,,,
114,A Stir of Echoes,1999-09-10,Artisan,21133087,4160056,1999.0,astirofechoes,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3970,Y Tu Mama Tambien (And Your…,2002-03-15,IFC Films,13649881,2349377,2002.0,ytumamatambienandy,,,,...,,,,,,,,,,
3983,You're Next,2013-08-23,Lionsgate,18494006,2274785,2013.0,yourenext,,,,...,,,,,,,,,,
3989,"Yours, Mine and Ours",2005-11-23,Paramount Pictures,50733384,7914724,2005.0,yoursmineandours,,,,...,,,,,,,,,,
3991,Yu-Gi-Oh,2004-08-13,Warner Bros.,19762690,3182397,2004.0,yugioh,,,,...,,,,,,,,,,


13

### Seperate NA into upper and lower

In [33]:
na_upper_df = na_df.sort_values(by="release_date").iloc[:2250].copy()
na_upper_df

Unnamed: 0,title,release_date,distributor,gross_sales,tickets_sold,year,title_merge
1224,Firestorm,1998-01-09,20th Century Fox,8123860,1732166,1998,firestorm
1470,Hard Rain,1998-01-16,Paramount Pictures,19870567,4236794,1998,hard rain
1164,Fallen,1998-01-16,Warner Bros.,25310938,5396788,1998,fallen
1441,Half Baked,1998-01-16,Universal,17394881,3708929,1998,half baked
3086,Star Kid,1998-01-16,Trimark,7015240,1495786,1998,star kid
...,...,...,...,...,...,...,...
4041,The Town,2010-09-17,Warner Bros.,92173235,11682286,2010,the town
1061,Easy A,2010-09-17,Sony Pictures,58401464,7401960,2010,easy a
4510,You Will Meet a Tall Dark S…,2010-09-22,Sony Pictures Cla…,3229586,409326,2010,you will meet a tall dark s
4505,You Again,2010-09-24,Walt Disney,25702053,3257547,2010,you again


In [34]:
na_lower_df = na_df.sort_values(by="release_date").iloc[2250:].copy()
na_lower_df

Unnamed: 0,title,release_date,distributor,gross_sales,tickets_sold,year,title_merge
1961,Legend of the Guardians: Th…,2010-09-24,Warner Bros.,55549823,7040535,2010,legend of the guardians th
4342,Waiting for Superman,2010-09-24,Paramount Vantage,6417135,813325,2010,waiting for superman
675,Case 39,2010-10-01,Paramount Vantage,13261851,1680843,2010,case threenine
2769,Robot,2010-10-01,B4U Movies,2276427,288520,2010,robot
1971,Let Me In,2010-10-01,Overture Films,12134935,1538015,2010,let me in
...,...,...,...,...,...,...,...
307,Anyone But You,2023-12-22,Sony Pictures,24837385,2304025,2023,anyone but you
2196,Migration,2023-12-22,Universal,54103955,5018919,2023,migration
3379,The Color Purple,2023-12-25,Warner Bros.,44047642,4086052,2023,the color purple
1197,Ferrari,2023-12-25,Neon,10778480,999858,2023,ferrari


### Work on upper half

In [249]:
test = ultimate_merge_func(na_lower_df, imdb_aka_df, number_of_columns=7, short=True)
# ultimate_merge_func(na_upper_df, imdb_aka_df, number_of_columns=7, short=True)

Unnamed: 0,title,release_date,distributor,gross_sales,tickets_sold,year,title_merge,tconst,primary_title,original_title,runtime,num_votes,average_rating,genres_count,genre,genre2,genre3,acting,acting2,acting3,...,director_name,director2_name,director3_name,writers_count,writer_name,writer2_name,writer3_name,is_adult,original_title_merge,primary_title_merge,CA,DE,ES,FR,GB,IT,NL,PL,TR,ALTER


13

In [250]:
test[test["tconst"].isnull()].head(10)

Unnamed: 0,title,release_date,distributor,gross_sales,tickets_sold,year,title_merge,tconst,primary_title,original_title,runtime,num_votes,average_rating,genres_count,genre,genre2,genre3,acting,acting2,acting3,...,director_name,director2_name,director3_name,writers_count,writer_name,writer2_name,writer3_name,is_adult,original_title_merge,primary_title_merge,CA,DE,ES,FR,GB,IT,NL,PL,TR,ALTER


In [58]:
pd.set_option('display.max_columns', 40)

In [247]:
imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt4495098"]

Unnamed: 0,tconst,primary_title,original_title,year,runtime,num_votes,average_rating,genres_count,genre,genre2,genre3,acting,acting2,acting3,acting4,acting5,directors_count,director_name,director2_name,director3_name,writers_count,writer_name,writer2_name,writer3_name,is_adult,original_title_merge,primary_title_merge,CA,DE,ES,FR,GB,IT,NL,PL,TR,ALTER
145614,tt4495098,Gran Turismo,Gran Turismo,2023,134,109255.0,7.1,3.0,Action,Adventure,Drama,David Harbour,Orlando Bloom,Archie Madekwe,Takehiro Hira,Darren Barnet,1.0,Neill Blomkamp,,,3.0,Jason Hall,Zach Baylin,Alex Tse,0,gran turismo,gran turismo,gran turismo,gran turismo,gran turismo,gran turismo,gran turismo,gran turismo la storia di un sogno impossibile,,,gran turismo,


In [226]:
basics_df[basics_df["tconst"] == "tt26998517"]

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
6406021,tt26998517,short,La Piscine,La Piscine,0.0,2022.0,\N,14,"Comedy,Short"


### Change all na_lower to correct name

### First 10

In [44]:
na_lower_df.loc[na_lower_df["title"] == "Gnomeo and Juliet", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt0377981", "primary_title_merge"].values[0]

In [54]:
na_lower_df.drop(na_lower_df[na_lower_df["title"] == "Born to be Wild 3D"].index, inplace=True)

In [60]:
na_lower_df.loc[na_lower_df["title"] == "Atlas Shrugged: Part 1", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt0480239", "primary_title_merge"].values[0]

In [62]:
na_lower_df.loc[na_lower_df["title"] == "Cowboys and Aliens", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt0409847", "primary_title_merge"].values[0]

In [66]:
na_lower_df.loc[na_lower_df["title"] == "Spy Kids: All the Time in t…", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt1517489", "primary_title_merge"].values[0]

In [68]:
na_lower_df.drop(na_lower_df[na_lower_df["title"] == "Flying Monsters 3D"].index, inplace=True)

In [71]:
na_lower_df.drop(na_lower_df[na_lower_df["title"] == "George Balanchine's The Nut…"].index, inplace=True)

In [73]:
na_lower_df.loc[na_lower_df["title"] == "Extremely Loud and Incredib…", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt0477302", "primary_title_merge"].values[0]

In [76]:
na_lower_df.loc[na_lower_df["title"] == "The Secret World of Arrietty", "year"] = 2010

In [78]:
na_lower_df.loc[na_lower_df["title"] == "Tyler Perry's Good Deeds", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt1885265", "primary_title_merge"].values[0]

### Second 10

In [82]:
na_lower_df.loc[na_lower_df["title"] == "Dr. Seuss' The Lorax", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt1482459", "primary_title_merge"].values[0]

In [84]:
na_lower_df.drop(na_lower_df[na_lower_df["title"] == "To the Arctic 3D"].index, inplace=True)

In [88]:
na_lower_df.loc[na_lower_df["title"] == "For Greater Glory", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt1566501", "primary_title_merge"].values[0]

In [90]:
na_lower_df.loc[na_lower_df["title"] == "Tyler Perry's Madea's Witne…", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt2215285", "primary_title_merge"].values[0]

In [92]:
na_lower_df.loc[na_lower_df["title"] == "Celeste and Jesse Forever", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt1405365", "primary_title_merge"].values[0]

In [94]:
na_lower_df.loc[na_lower_df["title"] == "Robot and Frank", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt1990314", "primary_title_merge"].values[0]

In [96]:
na_lower_df.loc[na_lower_df["title"] == "Atlas Shrugged: Part II", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt1985017", "primary_title_merge"].values[0]

In [97]:
na_lower_df.drop(na_lower_df[na_lower_df["title"] == "The Met: Live in HD - Aida"].index, inplace=True)

In [99]:
na_lower_df.loc[na_lower_df["title"] == "21 and Over", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt1711425", "primary_title_merge"].values[0]

In [101]:
na_lower_df.loc[na_lower_df["title"] == "The Hangover 3", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt1951261", "primary_title_merge"].values[0]

### Third 10

In [105]:
na_lower_df.loc[na_lower_df["title"] == "Fast and Furious 6", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt1905041", "primary_title_merge"].values[0]

In [107]:
na_lower_df.loc[na_lower_df["title"] == "Disney Planes", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt1691917", "primary_title_merge"].values[0]

In [109]:
na_lower_df.loc[na_lower_df["title"] == "Lee Daniels' The Butler", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt1327773", "primary_title_merge"].values[0]

In [112]:
na_lower_df.loc[na_lower_df["title"] == "You're Next", "year"] = 2011

In [115]:
na_lower_df.drop(na_lower_df[na_lower_df["title"] == "Jerusalem"].index, inplace=True)

In [118]:
na_lower_df.loc[na_lower_df["title"] == "Battle of the Year 3D", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt1532958", "primary_title_merge"].values[0]

In [120]:
na_lower_df.drop(na_lower_df[na_lower_df["title"] == "Journey to the South Pacific"].index, inplace=True)

In [122]:
na_lower_df.loc[na_lower_df["title"] == "Tyler Perry's The Single Mo…", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt2465140", "primary_title_merge"].values[0]

In [124]:
na_lower_df.drop(na_lower_df[na_lower_df["title"] == "Island of Lemurs: Madagascar"].index, inplace=True)

In [126]:
na_lower_df.loc[na_lower_df["title"] == "America: Imagine a World Wi…", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt2785390", "primary_title_merge"].values[0]

### Fourth 10 Films

In [130]:
na_lower_df.loc[na_lower_df["title"] == "Planes: Fire and Rescue", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt2980706", "primary_title_merge"].values[0]

In [132]:
na_lower_df.loc[na_lower_df["title"] == "The Fluffy Movie", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt3532608", "primary_title_merge"].values[0]

In [134]:
na_lower_df.loc[na_lower_df["title"] == "The Divergent Serires: Insu…", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt2908446", "primary_title_merge"].values[0]

In [136]:
na_lower_df.loc[na_lower_df["title"] == "Shaun the Sheep", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt2872750", "primary_title_merge"].values[0]

In [138]:
na_lower_df.loc[na_lower_df["title"] == "The Green Inferno", "year"] = 2013

In [140]:
na_lower_df.loc[na_lower_df["title"] == "Star Wars Ep. VII: The Forc…", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt2488496", "primary_title_merge"].values[0]

In [142]:
na_lower_df.loc[na_lower_df["title"] == "Ratchet and Clank", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt2865120", "primary_title_merge"].values[0]

In [144]:
na_lower_df.loc[na_lower_df["title"] == "The Conjuring 2: The Enfiel…", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt3065204", "primary_title_merge"].values[0]

In [147]:
na_lower_df.drop(na_lower_df[na_lower_df["title"] == "The Queen of Katwe"].index, inplace=True)

In [149]:
na_lower_df.loc[na_lower_df["title"] == "Tyler Perry’s Boo! A Madea …", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt5325452", "primary_title_merge"].values[0]

### Fifth 10 Films

In [152]:
na_lower_df.drop(na_lower_df[na_lower_df["title"] == "Mayweather vs. McGregor"].index, inplace=True)

In [154]:
na_lower_df.loc[na_lower_df["title"] == "Victoria and Abdul", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt5816682", "primary_title_merge"].values[0]

In [157]:
na_lower_df.loc[na_lower_df["title"] == "Star Wars Ep. VIII: The Las…", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt2527336", "primary_title_merge"].values[0]

In [161]:
na_lower_df.drop(na_lower_df[na_lower_df["title"] == "Pandas"].index, inplace=True)

In [163]:
na_lower_df.loc[na_lower_df["title"] == "The Old Man and the Gun", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt2837574", "primary_title_merge"].values[0]

In [165]:
na_lower_df.loc[na_lower_df["title"] == "The Upside", "year"] = 2017

In [167]:
na_lower_df.loc[na_lower_df["title"] == "Tyler Perry’s A Madea Famil…", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt7054636", "primary_title_merge"].values[0]

In [169]:
na_lower_df.loc[na_lower_df["title"] == "Pain & Glory", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt8291806", "primary_title_merge"].values[0]

In [171]:
na_lower_df.loc[na_lower_df["title"] == "The Current War: Director’s…", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt2140507", "primary_title_merge"].values[0]

In [174]:
na_lower_df.drop(na_lower_df[na_lower_df["title"] == "63 Up"].index, inplace=True)

### Sixth 10 Films

In [178]:
na_lower_df.loc[na_lower_df["title"] == "The Current War: Director’s…", "year"] = 2017

In [180]:
na_lower_df.loc[na_lower_df["title"] == "IP Man: The Finale", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt2076298", "primary_title_merge"].values[0]

In [182]:
na_lower_df.loc[na_lower_df["title"] == "Thank You For Everything", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt11349958", "primary_title_merge"].values[0]

In [183]:
na_lower_df.drop(na_lower_df[na_lower_df["title"] == "Met Opera: Wozzeck"].index, inplace=True)

In [185]:
na_lower_df.loc[na_lower_df["title"] == "The Doors: Break on Thru — …", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt7903550", "primary_title_merge"].values[0]

In [187]:
na_lower_df.loc[na_lower_df["title"] == "The Times of Bill Cunningham", "year"] = 2018

In [189]:
na_lower_df.loc[na_lower_df["title"] == "Burden", "year"] = 2018

In [190]:
na_lower_df.drop(na_lower_df[na_lower_df["title"] == "Met Opera — Agrippina"].index, inplace=True)

In [192]:
na_lower_df.loc[na_lower_df["title"] == "Followed", "year"] = 2018

In [194]:
na_lower_df.loc[na_lower_df["title"] == "Murder in the Woods", "year"] = 2017

### Seventh 10 Films

In [200]:
na_lower_df.loc[na_lower_df["title"] == "The Doors: Break on Thru — …", "year"] = 2018

In [202]:
na_lower_df.loc[na_lower_df["title"] == "Train to Busan Presents: Pe…", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt8850222", "primary_title_merge"].values[0]

In [204]:
na_lower_df.loc[na_lower_df["title"] == "Legend of Deification", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt11177804", "primary_title_merge"].values[0]

In [206]:
na_lower_df.loc[na_lower_df["title"] == "True to the Game 2: Gena’s …", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt11237714", "primary_title_merge"].values[0]

In [208]:
na_lower_df.loc[na_lower_df["title"] == "Missão: Vingança", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt10310140", "primary_title_merge"].values[0]

In [211]:
na_lower_df.loc[na_lower_df["title"] == "Always & Forever", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt7544954", "primary_title_merge"].values[0]

In [213]:
na_lower_df.loc[na_lower_df["title"] == "American Skin", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt10195452", "primary_title_merge"].values[0]

In [214]:
na_lower_df.loc[na_lower_df["title"] == "American Skin", "year"] = 2019

In [216]:
na_lower_df.loc[na_lower_df["title"] == "Our Friend", "year"] = 2019

In [218]:
na_lower_df.drop(na_lower_df[na_lower_df["title"] == "Earwig and the Witch"].index, inplace=True)

In [220]:
na_lower_df.loc[na_lower_df["title"] == "City of Lies", "year"] = 2018

### Eight 10 Films

In [224]:
na_lower_df.loc[na_lower_df["title"] == "Demon Slayer The Movie: Mug…", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt11032374", "primary_title_merge"].values[0]

In [227]:
na_lower_df.drop(na_lower_df[na_lower_df["title"] == "La piscine"].index, inplace=True)

In [229]:
na_lower_df.loc[na_lower_df["title"] == "Profile", "year"] = 2018

In [231]:
na_lower_df.loc[na_lower_df["title"] == "A Rescue of Little Eggs", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt8049994", "primary_title_merge"].values[0]

In [232]:
na_lower_df.drop(na_lower_df[na_lower_df["title"] == "Met Opera: Fire Shut Up in …"].index, inplace=True)

In [234]:
na_lower_df.loc[na_lower_df["title"] == "American Underdog: The Kurt…", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt11729298", "primary_title_merge"].values[0]

In [235]:
na_lower_df.drop(na_lower_df[na_lower_df["title"] == "The Servant"].index, inplace=True)

In [237]:
na_lower_df.loc[na_lower_df["title"] == "Y Como Es El", "year"] = 2020

In [239]:
na_lower_df.loc[na_lower_df["title"] == "Ante Sundaraniki", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt13528564", "primary_title_merge"].values[0]

In [240]:
na_lower_df.drop(na_lower_df[na_lower_df["title"] == "The Chosen Season 3: Episod…"].index, inplace=True)

### Last movies

In [243]:
na_lower_df.drop(na_lower_df[na_lower_df["title"] == "Met Opera: The Hours"].index, inplace=True)

In [244]:
na_lower_df.drop(na_lower_df[na_lower_df["title"] == "The Chosen Season 3 Finale"].index, inplace=True)

In [246]:
na_lower_df.loc[na_lower_df["title"] == "Big George Foreman: The Mir…", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt12226632", "primary_title_merge"].values[0]

In [248]:
na_lower_df.loc[na_lower_df["title"] == "Gran Turismo: Based on a Tr…", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt4495098", "primary_title_merge"].values[0]

### Upload lower NAs to SQL

In [251]:
table_name = 'na_merged_lower'
engine = sqlf.get_engine()
engine

Engine(postgresql://user:***@host/database)

In [252]:
schema

'capstone_24_4_group1'

In [253]:
# Write records stored in a dataframe to SQL database
if engine != None:
    try:
        test.to_sql(table_name, # Name of SQL table
                        con=engine, # Engine or connection
                        if_exists='replace', # Drop the table before inserting new values 
                        schema=schema, # your class schema
                        index=False, # Write DataFrame index as a column
                        chunksize=5000, # Specify the number of rows in each batch to be written at a time
                        method='multi') # Pass multiple values in a single INSERT clause
        print(f"The {table_name} table was imported successfully.")
    # Error handling
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
        engine = None
else:
    print("shit")

The na_merged_lower table was imported successfully.


In [258]:
imdb_aka_df.head()

Unnamed: 0,tconst,primary_title,original_title,year,runtime,num_votes,average_rating,genres_count,genre,genre2,genre3,acting,acting2,acting3,acting4,acting5,directors_count,director_name,director2_name,director3_name,writers_count,writer_name,writer2_name,writer3_name,is_adult,original_title_merge,primary_title_merge,CA,DE,ES,FR,GB,IT,NL,PL,TR,ALTER
0,tt0013274,Istoriya grazhdanskoy voyny,Istoriya grazhdanskoy voyny,2021,94,73.0,6.7,1.0,Documentary,,,,,,,,2.0,Nikolai Izvolov,Dziga Vertov,,,,,,0,istoriya grazhdanskoy voyny,istoriya grazhdanskoy voyny,,,,histoire de la guerre civile,,,,,,
1,tt0015414,La tierra de los toros,La tierra de los toros,2000,60,17.0,5.4,,,,,,,,,,1.0,Musidora,,,,,,,0,la tierra de los toros,la tierra de los toros,,,la tierra de los toros,la terre des taureaux,,,,,,
2,tt0035423,Kate & Leopold,Kate & Leopold,2001,118,89944.0,6.4,3.0,Comedy,Fantasy,Romance,Meg Ryan,Hugh Jackman,Liev Schreiber,Breckin Meyer,Natasha Lyonne,1.0,James Mangold,,,2.0,Steven Rogers,James Mangold,,0,kate leopold,kate leopold,kate et leopold,kate und leopold,la kate i en leopold,kate et leopold,kate leopold,kate and leopold,,kate i leopold,buyulu cift,
3,tt0062336,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,2020,70,190.0,6.5,1.0,Drama,,,Rubén Sotoconil,Claudia Paz,Luis Alarcón,Shenda Román,Luis Vilches,2.0,Raúl Ruiz,Valeria Sarmiento,,2.0,Raúl Ruiz,Omar Saavedra Santis,,0,el tango del viudo y su espejo deformante,the tango of the widower and its distorting mi...,,,,el tango del viudo y su espejo deformante,the tango of the widower and its distorting mi...,,,,,
4,tt0069049,The Other Side of the Wind,The Other Side of the Wind,2018,122,8143.0,6.7,1.0,Drama,,,John Huston,Oja Kodar,Peter Bogdanovich,Susan Strasberg,Norman Foster,1.0,Orson Welles,,,2.0,Orson Welles,Oja Kodar,,0,the other side of the wind,the other side of the wind,the other side of the wind,the other side of the wind,al otro lado del viento,de lautre cote du vent,the other side of the wind,laltra faccia del vento,,druga strona wiatru,,
