# Combine only most critical steps of previous merging book into one.

In [3]:
# import all packages
import pandas as pd
import numpy as np
import psycopg2 as psycopg2
import sql_functions as sqlf
from unidecode import unidecode
import string as string
from num2words import num2words

### Import data

In [4]:
schema = "capstone_24_4_group1"
schema

'capstone_24_4_group1'

In [5]:
imdb_query = f'''   SELECT *
                    FROM {schema}."IMDB_data"
                    '''
eu_query = f'''   SELECT *
                    FROM {schema}."movie_data_EU"
                    '''
na_query = f'''   SELECT *
                    FROM {schema}."movie_data_NA"
                    '''
aka_query = f'''   SELECT *
                    FROM {schema}."imdb_akas_data"
                    '''

In [6]:
imdb_df = sqlf.get_dataframe(imdb_query)
eu_df = sqlf.get_dataframe(eu_query)
na_df = sqlf.get_dataframe(na_query)
aka_df = sqlf.get_dataframe(aka_query)

In [7]:
na_df.columns = ['title', 'release_date', 'distributor', 'gross_sales', 'tickets_sold', 'year']

In [8]:
# filter for years
eu_df = eu_df[eu_df["year"] >= 1998].reset_index(drop=True)
na_df = na_df[na_df["year"] >= 1998].reset_index(drop=True)

In [9]:
# Lower and change non-latin characters
imdb_df["original_title_merge"] = imdb_df["original_title"].str.lower().apply(unidecode)
eu_df["title_merge"] = eu_df["title"].str.lower().apply(unidecode)
na_df["title_merge"] = na_df["title"].str.lower().apply(unidecode)
imdb_df["primary_title_merge"] = imdb_df["primary_title"].str.lower().apply(unidecode)

In [10]:
# remove punctuations
imdb_df["original_title_merge"] = imdb_df["original_title_merge"].str.translate(str.maketrans("","",string.punctuation))
imdb_df["primary_title_merge"] = imdb_df["primary_title_merge"].str.translate(str.maketrans("","",string.punctuation))
eu_df["title_merge"] = eu_df["title_merge"].str.translate(str.maketrans("","",string.punctuation))
na_df["title_merge"] = na_df["title_merge"].str.translate(str.maketrans("","",string.punctuation))

adjust AKA_df

In [11]:
# lower, non-latin and special characters for aka_df
for column in aka_df.iloc[:,1:]:
    aka_df.loc[:,column] = aka_df[column].str.lower()
    aka_df.loc[:,column] = aka_df[column].astype(str).apply(unidecode)
    aka_df.loc[:,column] = aka_df.loc[:,column].str.translate(str.maketrans("","",string.punctuation))

In [12]:
# Merge aka with imdb
imdb_aka_df = pd.merge(imdb_df, aka_df, how="left", on="tconst")

### Remove double Spaces

In [13]:
for column in imdb_aka_df.iloc[:,-12:-2]:
    display(imdb_aka_df[column].str.contains("  ").sum())
display(eu_df[eu_df["title_merge"].str.contains("  ")].shape)
na_df[na_df["title_merge"].str.contains("  ")].shape

4809

4517

741

7110

338

1452

1489

3479

312

474

(156, 6)

(53, 7)

In [14]:
for column in imdb_aka_df.iloc[:,-12:-2]:
    imdb_aka_df.loc[:,column] = imdb_aka_df[column].str.replace("  ", " ")
    imdb_aka_df.loc[:,column] = imdb_aka_df[column].str.replace("  ", " ")
eu_df.loc[:,"title_merge"] = eu_df["title_merge"].str.replace("  ", " ")
na_df.loc[:,"title_merge"] = na_df["title_merge"].str.replace("  ", " ")

In [15]:
for column in imdb_aka_df.iloc[:,-12:-2]:
    display(imdb_aka_df[column].str.contains("  ").sum())
display(eu_df[eu_df["title_merge"].str.contains("  ")].shape)
na_df[na_df["title_merge"].str.contains("  ")].shape

0

0

0

0

0

0

0

0

0

0

(2, 6)

(0, 7)

### Remove Roman and Latin Numbers

In [16]:
testing_dict = {r'(^ii | ii | ii$)': "two",
                r'(^iii | iii | iii$)': "three",
                r'(^iv | iv | iv$)': "four",
                r'(^v | v | v$)': "five",
                r'(^vi | vi | vi$)': "six",
                r'(^vii | vii | vii$)': "seven",
                r'(^viii | viii | viii$)': "eight",
                r'(^ix | ix | ix$)': "nine",
                r'(^x | x | x$)': "ten",}

In [17]:
imdb_aka_df[imdb_aka_df["original_title"].str.contains("Jurassic Park")]

Unnamed: 0,tconst,primary_title,original_title,year,runtime,num_votes,average_rating,genres_count,genre,genre2,...,CA,DE,ES,FR,GB,IT,NL,PL,TR,ALTER
890,tt0119567,The Lost World: Jurassic Park,The Lost World: Jurassic Park,1997,129,449430.0,6.6,3.0,Action,Adventure,...,the lost world jurassic park,vergessene welt jurassic park,el mundo perdido jurassic park,le monde perdu,the lost world jurassic park,il mondo perduto jurassic park,the lost world jurassic park,park jurajski ii,jurassic park 2 kayip dunya,jurassic park ii
3807,tt0163025,Jurassic Park III,Jurassic Park III,2001,92,344970.0,5.9,3.0,Action,Adventure,...,le parc jurassique iii,jurassic park iii,jurassic park iii parque jurasico iii,jurassic park iii,jurassic park iii,jurassic park iii,jurassic park iii,jurassic park iii,jurassic park 3,jurassic park 3
141839,tt4130956,Jurassic Park: Operation Rebirth,Jurassic Park: Operation Rebirth,2014,70,106.0,6.7,1.0,Thriller,,...,jurassic park operation rebirth,,,,jurassic park operation rebirth,,,,,


In [18]:
list_titles = ['original_title_merge', 'primary_title_merge', 'CA','DE', 'ES', 'FR', 'GB', 'IT', 'NL', 'PL', 'TR', 'ALTER']

for title in list_titles:
    for key, value in testing_dict.items():
        imdb_aka_df.loc[:,title] = imdb_aka_df[title].str.replace(key, value, regex=True)

for key, value in testing_dict.items():
        eu_df.loc[:,"title_merge"] = eu_df["title_merge"].str.replace(key, value, regex=True)

for key, value in testing_dict.items():
        na_df.loc[:,"title_merge"] = na_df["title_merge"].str.replace(key, value, regex=True)

In [19]:
imdb_aka_df[imdb_aka_df["original_title"].str.contains("Jurassic Park")]

Unnamed: 0,tconst,primary_title,original_title,year,runtime,num_votes,average_rating,genres_count,genre,genre2,...,CA,DE,ES,FR,GB,IT,NL,PL,TR,ALTER
890,tt0119567,The Lost World: Jurassic Park,The Lost World: Jurassic Park,1997,129,449430.0,6.6,3.0,Action,Adventure,...,the lost world jurassic park,vergessene welt jurassic park,el mundo perdido jurassic park,le monde perdu,the lost world jurassic park,il mondo perduto jurassic park,the lost world jurassic park,park jurajskitwo,jurassic park 2 kayip dunya,jurassic parktwo
3807,tt0163025,Jurassic Park III,Jurassic Park III,2001,92,344970.0,5.9,3.0,Action,Adventure,...,le parc jurassiquethree,jurassic parkthree,jurassic parkthreeparque jurasicothree,jurassic parkthree,jurassic parkthree,jurassic parkthree,jurassic parkthree,jurassic parkthree,jurassic park 3,jurassic park 3
141839,tt4130956,Jurassic Park: Operation Rebirth,Jurassic Park: Operation Rebirth,2014,70,106.0,6.7,1.0,Thriller,,...,jurassic park operation rebirth,,,,jurassic park operation rebirth,,,,,


In [20]:
list_titles = ['original_title_merge', 'primary_title_merge','CA', 'DE', 'ES', 'FR', 'GB', 'IT', 'NL', 'PL', 'TR', 'ALTER']

for title in list_titles:
    imdb_aka_df.loc[:,title] = imdb_aka_df[title].str.replace(r'\d', lambda x: num2words(int(x.group())), regex=True)

eu_df.loc[:,"title_merge"] = eu_df["title_merge"].str.replace(r'\d', lambda x: num2words(int(x.group())), regex=True)

na_df.loc[:,"title_merge"] = na_df["title_merge"].str.replace(r'\d', lambda x: num2words(int(x.group())), regex=True)

Drop Oscar "Movies"

In [21]:
na_df.drop(na_df[na_df["distributor"].str.contains("Shorts")].index, inplace=True)

# THE FUNCTION

In [22]:
def ultimate_merge_func(data, imdb_base, number_of_columns=6, short=False):
    '''
    Merges our Tickets Sold Dataframes First on each title for year and year +/- 1

    Input:
        ticket_data ... either eu or na depending on what we are testing
        base_data = imdb_df
        number_of_columns = 6 ... How many columns does the df with the ticket data have
        short = False ... if true, shorten the titles to merge to ... 25?

    Output:
        Returns: Dataframe with all matches from both columns
        Prints: Unmatched rows
    '''
    ticket_data = data.copy()
    base_data = imdb_base.copy()

    list_titles = ['original_title_merge', 'primary_title_merge','CA', 'DE', 'ES', 'FR', 'GB', 'IT', 'NL', 'PL', 'TR', "ALTER"]
    counter = 0

    # remove all spaces from all titles
    #imdb
    for column in base_data.iloc[:,-12:-2]:
        base_data.loc[:,column] = base_data[column].str.replace(" ", "")
        base_data.loc[:,column] = base_data[column].str.replace(" ", "")
    # ticket data
    ticket_data.loc[:,"title_merge"] = ticket_data["title_merge"].str.replace(" ", "")

    # add two year plus and minus as extra rows
    base_data["year_plus"] = base_data["year"] + 1
    base_data["year_minus"] = base_data["year"] + -1

    list_years = ["year_minus", "year", "year_plus"]

    if short == True:
        for title in list_titles:
            base_data.loc[:,title] = base_data[title].str[:18]
        ticket_data.loc[:,"title_merge"] = ticket_data["title_merge"].str[:18]

    for title in list_titles:
        if counter == 0:
            check_df = pd.merge(ticket_data,base_data, how="left", left_on=["title_merge", "year"], right_on=[title, "year"])

            
            # set-up third df for further calculations
            check_mask = check_df["tconst"].isnull()
            third_merge_df = check_df[check_mask].copy()

            # drop columns from first merge
            third_merge_df.drop(columns = third_merge_df.iloc[:,number_of_columns:], inplace=True)
            
            # 3_plus merge (inner)
            third_merge_plus_df = third_merge_df.merge(base_data, how="inner", left_on=["title_merge", "year"], right_on=[title, "year_plus"])
            # drop columns from first merge table that matched on 3_plus merge (inner)
            third_merge_plus_mask = check_df["title"].isin(list(third_merge_plus_df["title"]))
            check_df.drop(check_df[third_merge_plus_mask].index, inplace=True)
            # add fitting rows from 3_plus merge (inner) to first table
            check_df = pd.concat([check_df, third_merge_plus_df])
            check_df.reset_index(drop=True, inplace=True)
            check_df.drop(columns = "year_x", inplace=True)

            # 3_minus merge (inner)
            third_merge_minus_df = third_merge_df.merge(base_data, how="inner", left_on=["title_merge", "year"], right_on=[title, "year_minus"])
            # drop columns from first merge table that matched on 3_minus merge (inner)
            third_merge_minus_mask = check_df["title"].isin(list(third_merge_minus_df["title"]))
            check_df.drop(check_df[third_merge_minus_mask].index, inplace=True)
            # add fitting rows from 3_plus merge (inner) to first table
            check_df = pd.concat([check_df, third_merge_minus_df])
            check_df.reset_index(drop=True, inplace=True)
            check_df.drop(columns = "year_x", inplace=True)

            counter += 1

        for year in list_years:
            # set-up third df for further calculations
            check_mask = check_df["tconst"].isnull()
            third_merge_df = check_df[check_mask].copy()

            # drop columns from first merge
            third_merge_df.drop(columns = third_merge_df.iloc[:,number_of_columns:], inplace=True)
            
            # merge again
            third_merge_plus_df = third_merge_df.merge(base_data, how="inner", left_on=["title_merge", "year"], right_on=[title, year])
            # drop columns from first merge table that matched on 3_plus merge (inner)
            third_merge_plus_mask = check_df["title"].isin(list(third_merge_plus_df["title"]))
            check_df.drop(check_df[third_merge_plus_mask].index, inplace=True)
            # add fitting rows from 3_plus merge (inner) to first table
            check_df = pd.concat([check_df, third_merge_plus_df])
            check_df.reset_index(drop=True, inplace=True)
        counter += 1

    # drop new year columns from final table
    check_df.drop(columns = check_df.iloc[:,-4:], inplace=True)

    # show unmatched rows
    check_mask = check_df["tconst"].isnull()
    display(check_df[check_mask])
    display(counter)
    
    return check_df

### Check if it works as expected

In [23]:
eu_check = ultimate_merge_func(eu_df, imdb_aka_df)

Unnamed: 0,title,producing_country,year,tickets_sold_since_1996,tickets_sold,title_merge,tconst,primary_title,original_title,runtime,...,CA,DE,ES,FR,GB,IT,NL,PL,TR,ALTER
21,15 Minutes (Fifteen Minutes),US,2001.0,2600419,2565397,onefiveminutesfifteenminutes,,,,,...,,,,,,,,,,
65,5X2 cinq fois deux,FR,2004.0,1150178,814942,fivextwocinqfoisdeux,,,,,...,,,,,,,,,,
293,Arthur et la guerre des deux mondes,FR,2010.0,3838378,3363498,arthuretlaguerredesdeuxmondes,,,,,...,,,,,,,,,,
296,Artificial Intelligence: AI,US,2001.0,8073605,8041431,artificialintelligenceai,,,,,...,,,,,,,,,,
320,Atatürk 1881 - 1919,TR,2023.0,1732649,1732649,ataturkoneeighteightoneonenineonenine,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3997,Tinker Bell and the Pirate Fairy,US,2014.0,5489166,5472379,tinkerbellandthepiratefairy,,,,,...,,,,,,,,,,
4193,Wallace & Gromit in The Curse of the Were-Rabbit,"GBinc, US",2005.0,14014825,13251997,wallacegromitinthecurseofthewererabbit,,,,,...,,,,,,,,,,
4204,Warum Männer nicht zuhören und Frauen schlecht...,DE,2007.0,1452342,1068475,warummannernichtzuhorenundfrauenschlechtereinp...,,,,,...,,,,,,,,,,
4322,Zeny v behu,CZ,2019.0,1705959,1675569,zenyfivebehu,,,,,...,,,,,,,,,,


13

In [24]:
na_check = ultimate_merge_func(na_df, imdb_aka_df, number_of_columns=7, short=True)

Unnamed: 0,title,release_date,distributor,gross_sales,tickets_sold,year,title_merge,tconst,primary_title,original_title,...,CA,DE,ES,FR,GB,IT,NL,PL,TR,ALTER
31,21 and Over,2013-03-01,Relativity,25682380,3158964,2013.0,twooneandover,,,,...,,,,,,,,,,
54,63 Up,2019-11-27,BritBox,183940,20037,2019.0,sixthreeup,,,,...,,,,,,,,,,
69,A Common Thread,2002-11-29,Odeon Films,5058187,838836,2002.0,acommonthread,,,,...,,,,,,,,,,
107,A Rescue of Little Eggs,2021-08-27,Lionsgate,927154,91166,2021.0,arescueoflittleegg,,,,...,,,,,,,,,,
114,A Stir of Echoes,1999-09-10,Artisan,21133087,4160056,1999.0,astirofechoes,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3970,Y Tu Mama Tambien (And Your…,2002-03-15,IFC Films,13649881,2349377,2002.0,ytumamatambienandy,,,,...,,,,,,,,,,
3983,You're Next,2013-08-23,Lionsgate,18494006,2274785,2013.0,yourenext,,,,...,,,,,,,,,,
3989,"Yours, Mine and Ours",2005-11-23,Paramount Pictures,50733384,7914724,2005.0,yoursmineandours,,,,...,,,,,,,,,,
3991,Yu-Gi-Oh,2004-08-13,Warner Bros.,19762690,3182397,2004.0,yugioh,,,,...,,,,,,,,,,


13

In [25]:
na_upper_df = na_df.sort_values(by='release_date').iloc[:2250].copy()
na_upper_df

Unnamed: 0,title,release_date,distributor,gross_sales,tickets_sold,year,title_merge
1224,Firestorm,1998-01-09,20th Century Fox,8123860,1732166,1998,firestorm
1470,Hard Rain,1998-01-16,Paramount Pictures,19870567,4236794,1998,hard rain
1164,Fallen,1998-01-16,Warner Bros.,25310938,5396788,1998,fallen
1441,Half Baked,1998-01-16,Universal,17394881,3708929,1998,half baked
3086,Star Kid,1998-01-16,Trimark,7015240,1495786,1998,star kid
...,...,...,...,...,...,...,...
4041,The Town,2010-09-17,Warner Bros.,92173235,11682286,2010,the town
1061,Easy A,2010-09-17,Sony Pictures,58401464,7401960,2010,easy a
4510,You Will Meet a Tall Dark S…,2010-09-22,Sony Pictures Cla…,3229586,409326,2010,you will meet a tall dark s
4505,You Again,2010-09-24,Walt Disney,25702053,3257547,2010,you again


In [26]:
na_lower_df = na_df.sort_values(by="release_date").iloc[2250:].copy()
na_lower_df

Unnamed: 0,title,release_date,distributor,gross_sales,tickets_sold,year,title_merge
1961,Legend of the Guardians: Th…,2010-09-24,Warner Bros.,55549823,7040535,2010,legend of the guardians th
4342,Waiting for Superman,2010-09-24,Paramount Vantage,6417135,813325,2010,waiting for superman
675,Case 39,2010-10-01,Paramount Vantage,13261851,1680843,2010,case threenine
2769,Robot,2010-10-01,B4U Movies,2276427,288520,2010,robot
1971,Let Me In,2010-10-01,Overture Films,12134935,1538015,2010,let me in
...,...,...,...,...,...,...,...
307,Anyone But You,2023-12-22,Sony Pictures,24837385,2304025,2023,anyone but you
2196,Migration,2023-12-22,Universal,54103955,5018919,2023,migration
3379,The Color Purple,2023-12-25,Warner Bros.,44047642,4086052,2023,the color purple
1197,Ferrari,2023-12-25,Neon,10778480,999858,2023,ferrari


In [27]:
na_upper_test = ultimate_merge_func(na_upper_df, imdb_aka_df, number_of_columns=7, short=True)
# na_lower_test = ultimate_merge_func(na_lower_df, imdb_aka_df, number_of_columns=7, short=True)

Unnamed: 0,title,release_date,distributor,gross_sales,tickets_sold,year,title_merge,tconst,primary_title,original_title,...,CA,DE,ES,FR,GB,IT,NL,PL,TR,ALTER
22,Everest,1998-03-06,MacGillivray Free…,84941548,17815503,1998.0,everest,,,,...,,,,,,,,,,
30,Grease,1998-03-27,Paramount Pictures,28350476,6044877,1998.0,grease,,,,...,,,,,,,,,,
50,Africa's Elephant Kingdom,1998-05-08,Discovery/IMAX,11168939,2266045,1998.0,africaselephantkin,,,,...,,,,,,,,,,
70,The X Files: Fight the Future,1998-06-19,20th Century Fox,83898313,17888766,1998.0,thetenfilesfightth,,,,...,,,,,,,,,,
81,Mysteries of Egypt,1998-07-16,Sony Pictures,40382986,7798368,1998.0,mysteriesofegypt,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1999,Sea Rex 3D,2010-05-28,3D Entertainment,4465060,563059,2010.0,searexthreed,,,,...,,,,,,,,,,
2000,Prince of Persia: Sands of …,2010-05-28,Walt Disney,90759676,11503127,2010.0,princeofpersiasand,,,,...,,,,,,,,,,
2013,The Twilight Saga: Twilight…,2010-06-29,Summit Entertainment,2385237,302311,2010.0,thetwilightsagatwi,,,,...,,,,,,,,,,
2044,Alpha and Omega 3D,2010-09-17,Lionsgate,25107267,3182163,2010.0,alphaandomegathree,,,,...,,,,,,,,,,


13

In [28]:
pd.set_option('display.max_columns', 20)

In [29]:
na_upper_test[na_upper_test['tconst'].isnull()].head(11)

Unnamed: 0,title,release_date,distributor,gross_sales,tickets_sold,year,title_merge,tconst,primary_title,original_title,...,CA,DE,ES,FR,GB,IT,NL,PL,TR,ALTER
22,Everest,1998-03-06,MacGillivray Free…,84941548,17815503,1998.0,everest,,,,...,,,,,,,,,,
30,Grease,1998-03-27,Paramount Pictures,28350476,6044877,1998.0,grease,,,,...,,,,,,,,,,
50,Africa's Elephant Kingdom,1998-05-08,Discovery/IMAX,11168939,2266045,1998.0,africaselephantkin,,,,...,,,,,,,,,,
70,The X Files: Fight the Future,1998-06-19,20th Century Fox,83898313,17888766,1998.0,thetenfilesfightth,,,,...,,,,,,,,,,
81,Mysteries of Egypt,1998-07-16,Sony Pictures,40382986,7798368,1998.0,mysteriesofegypt,,,,...,,,,,,,,,,
88,Governess,1998-07-31,Sony Pictures Cla…,3794031,808961,1998.0,governess,,,,...,,,,,,,,,,
90,Halloween: H2O,1998-08-05,Miramax,55041738,11735978,1998.0,halloweenhtwoo,,,,...,,,,,,,,,,
115,The Imposters,1998-10-02,Fox Searchlight,2194875,467990,1998.0,theimposters,,,,...,,,,,,,,,,
147,Jerry Springer: Ringmaster,1998-11-25,Artisan,8761922,1868213,1998.0,jerryspringerringm,,,,...,,,,,,,,,,
182,Encounter in the Third Dime…,1999-02-26,nWave Pictures,4269548,840462,1999.0,encounterinthethir,,,,...,,,,,,,,,,


In [30]:
basics_df = pd.read_csv('Data/title.basics.csv')

In [31]:
basics_df[basics_df['tconst'] == 'tt1529567']

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
3852193,tt1529567,short,Sea Rex 3D: Journey to a Prehistoric World,Sea Rex 3D: Journey to a Prehistoric World,0.0,2010.0,\N,41,"Documentary,History,Short"


In [32]:
imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt0186589"]

Unnamed: 0,tconst,primary_title,original_title,year,runtime,num_votes,average_rating,genres_count,genre,genre2,...,CA,DE,ES,FR,GB,IT,NL,PL,TR,ALTER
5235,tt0186589,Sugar & Spice,Sugar & Spice,2001,81,15519.0,5.7,2.0,Comedy,Crime,...,pomme et cannelle,,ingenuas y peligrosas,bad girls,sugar spice,le insolite sospette sugar spice,,slodkie i ostre,,


### Dropping movies that doesn't match

In [33]:
na_upper_df.drop(na_upper_df[na_upper_df['title'] == "Sea Rex 3D"].index, inplace=True)
na_upper_df

Unnamed: 0,title,release_date,distributor,gross_sales,tickets_sold,year,title_merge
1224,Firestorm,1998-01-09,20th Century Fox,8123860,1732166,1998,firestorm
1470,Hard Rain,1998-01-16,Paramount Pictures,19870567,4236794,1998,hard rain
1164,Fallen,1998-01-16,Warner Bros.,25310938,5396788,1998,fallen
1441,Half Baked,1998-01-16,Universal,17394881,3708929,1998,half baked
3086,Star Kid,1998-01-16,Trimark,7015240,1495786,1998,star kid
...,...,...,...,...,...,...,...
4041,The Town,2010-09-17,Warner Bros.,92173235,11682286,2010,the town
1061,Easy A,2010-09-17,Sony Pictures,58401464,7401960,2010,easy a
4510,You Will Meet a Tall Dark S…,2010-09-22,Sony Pictures Cla…,3229586,409326,2010,you will meet a tall dark s
4505,You Again,2010-09-24,Walt Disney,25702053,3257547,2010,you again


In [36]:
mask = na_upper_df[na_upper_df['year'] == 2004]
mask['distributor'].unique()

array(['Miramax', 'Warner Bros.', 'Universal', 'Walt Disney',
       'IDP Distribution', 'Zeitgeist', 'New Line', 'Dreamworks SKG',
       'IFC Films', 'Paramount Pictures', 'Newmarket Films',
       'Sony Pictures', 'Captured Light', '20th Century Fox', 'MGM',
       'Fox Searchlight', 'Sony Pictures Cla…', 'Lionsgate',
       'Focus Features', 'Televisa Cine', 'Warner Independent',
       'Alliance Atlantis', 'Benji Returns', 'Focus/Rogue Pictures',
       'Magnolia Pictures', 'Yash Raj Films', 'ThinkFilm',
       'Miramax/Dimension'], dtype=object)

### Changing release year

In [447]:
na_upper_df.loc[na_upper_df["title"] == "Oceans", "year"] = 2008

### Changing the title of movies so it fits

In [468]:
imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt0186589", "primary_title_merge"].values[0]

'sugar spice'

In [469]:
na_upper_df.loc[na_upper_df["title"] == "Sugar and Spice", "title_merge"] = imdb_aka_df.loc[imdb_aka_df["tconst"] == "tt0186589", "primary_title_merge"].values[0]

### Uploading to SQL

In [473]:
from sql_functions import get_engine, schema
import psycopg2
engine = get_engine()
schema = schema()
table_name = 'na_merged_upper'

In [474]:
# Write records stored in a dataframe to SQL database
if engine!=None:
    try:
        na_upper_test.to_sql(table_name, # Name of SQL table
                        con=engine, # Engine or connection
                        if_exists='replace', # Drop the table before inserting new values 
                        schema=schema, # your class schema
                        index=False, # Write DataFrame index as a column
                        chunksize=5000, # Specify the number of rows in each batch to be written at a time
                        method='multi') # Pass multiple values in a single INSERT clause
        print(f"The {table_name} table was imported successfully.")
    # Error handling
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
        engine = None

The na_merged_upper table was imported successfully.
