In [1]:
import pandas as pd
import numpy as np

### Ticket sales in Europe

In [2]:
# Load the raw EU admissions export.
# The data directory is spelled './Data/' (capital D) in every other read and
# write of this notebook; the lowercase spelling only resolves on
# case-insensitive file systems.
df_admissions_EU = pd.read_csv('./Data/Movie_numbers_1996-2023_EU.csv')

In [None]:
# Count missing values per column of the raw EU frame.
df_admissions_EU.isnull().sum()

In [None]:
# Column dtypes and non-null counts of the raw EU frame.
df_admissions_EU.info()

In [None]:
# Peek at the first 20 raw rows.
df_admissions_EU.head(20)

In [6]:
df_admissions_EU.rename(columns={'Admissions 1996-1996' : 'tickets_sold',
                                 'Total EU27+GB since 1996' : 'tickets_sold_since_1996',
                                 'Original title' : 'title',
                                 'Production year' : 'year'}, inplace=True)
df_admissions_EU.columns = df_admissions_EU.columns.str.lower().str.replace(' ', '_')
df_admissions_EU.drop(columns=['directors'], inplace=True)

In [7]:
df_admissions_EU['tickets_sold'] = df_admissions_EU['tickets_sold'].str.replace(' ', '')
df_admissions_EU['tickets_sold_since_1996'] = df_admissions_EU['tickets_sold_since_1996'].str.replace(' ', '')

In [8]:
df_admissions_EU['tickets_sold'] = pd.to_numeric(df_admissions_EU['tickets_sold'])
df_admissions_EU['tickets_sold_since_1996'] = pd.to_numeric(df_admissions_EU['tickets_sold_since_1996'])

In [None]:
# (rows, columns) of the cleaned EU frame.
df_admissions_EU.shape

In [None]:
# Number of distinct titles in the EU data.
df_admissions_EU['title'].nunique()

In [None]:
# Collapse the EU data to one row per
# (title, producing_country, year, tickets_sold_since_1996) by summing every
# remaining column.
# NOTE(review): groupby drops rows whose key columns are missing by default
# (dropna=True) — compare with the isnull() counts of the raw frame to confirm
# no title is lost here.
df_admissions_EU_clean = df_admissions_EU.groupby(['title', 'producing_country', 'year', 'tickets_sold_since_1996']).sum().reset_index()
df_admissions_EU_clean

In [None]:
# Spot check on one title: aggregated row(s) next to the rows they were built from.
display(df_admissions_EU_clean[df_admissions_EU_clean['title'] == 'Avatar'])
display(df_admissions_EU[df_admissions_EU['title'] == 'Avatar'])

In [None]:
# First 20 rows of the aggregated EU frame.
df_admissions_EU_clean.head(20)

### Ticket sales in North America


In [None]:
# Load the raw North America export and show its first 20 rows.
df_admissions_NA = pd.read_csv('./Data/Movie_numbers_1996-2023_USA.csv')
df_admissions_NA.head(20)

In [None]:
# Column dtypes and non-null counts of the raw NA frame.
df_admissions_NA.info()

In [16]:
df_admissions_NA['Release\nDate'] = pd.to_datetime(df_admissions_NA['Release\nDate'])
df_admissions_NA['Tickets Sold'] = df_admissions_NA['Tickets Sold'].str.replace(',', '')
df_admissions_NA['Tickets Sold'] = pd.to_numeric(df_admissions_NA['Tickets Sold'])

In [None]:
df_admissions_NA['1996 Gross'] = df_admissions_NA['1996 Gross'].str.replace(',', '')
df_admissions_NA['1996 Gross'] = df_admissions_NA['1996 Gross'].str.replace('$', '')

In [18]:
df_admissions_NA['1996 Gross'] = pd.to_numeric(df_admissions_NA['1996 Gross'])

In [None]:
# Rename the columns used downstream, lower-case every header, and show the
# resulting column index.
df_admissions_NA = (
    df_admissions_NA
    .rename(columns={
        'Movie': 'title',
        'Release\nDate': 'release_date',
        'Tickets Sold': 'tickets_sold',
        '1996 Gross': 'Gross_Sales',
    })
    .rename(columns=str.lower)
)
df_admissions_NA.columns

In [None]:
# Number of distinct titles in the NA data.
df_admissions_NA['title'].nunique()

In [None]:
# Collapse the NA data to one row per (title, release_date, distributor) by
# summing the remaining columns, then discard the summed 'rank' and 'year'
# columns.
# NOTE(review): groupby drops rows with a missing key by default (dropna=True) —
# confirm that no row lacks a distributor or release date.
df_admissions_NA_clean = df_admissions_NA.groupby(['title', 'release_date', 'distributor']).sum().reset_index()
df_admissions_NA_clean.drop(columns=['rank', 'year'], inplace=True)
df_admissions_NA_clean.shape

In [None]:
# Spot check on one title: aggregated row(s) next to the rows they were built from.
display(df_admissions_NA_clean[df_admissions_NA_clean['title'] == 'Avatar'])
display(df_admissions_NA[df_admissions_NA['title'] == 'Avatar'])

In [None]:
# Titles that still occur in more than one aggregated row, most frequent first.
df_admissions_NA_clean.groupby(['title']).size().sort_values(ascending=False)

In [None]:
# Inspect the aggregated rows of one repeated title.
df_admissions_NA_clean[df_admissions_NA_clean['title'] == 'Halloween']

In [None]:
# Compare the layout of both aggregated box-office frames.
display(df_admissions_NA_clean.head())
display(df_admissions_EU_clean.head())

### IMDB DATA

In [26]:
df_movie_list_years = pd.read_csv('./Data/movie_list_years.csv')
df_akas = pd.read_csv('./Data/title.akas.csv')
df_basics = pd.read_csv('./Data/title.basics.csv')
df_crew = pd.read_csv('./Data/title.crew.csv')
df_principals = pd.read_csv('./Data/title.principals.csv')
df_ratings = pd.read_csv('./Data/title.ratings.csv')
df_names = pd.read_csv('./Data/name.basics.csv')

In [27]:
df_movie_list_years.replace('\\N', np.nan, inplace=True)
df_akas.replace('\\N', np.nan, inplace=True)
df_basics.replace('\\N', np.nan, inplace=True)
df_crew.replace('\\N', np.nan, inplace=True)
df_principals.replace('\\N', np.nan, inplace=True)
df_ratings.replace('\\N', np.nan, inplace=True)
df_names.replace('\\N', np.nan, inplace=True)

In [None]:
# Show the column layout of the ratings table.
display(df_ratings.head(1))

In [29]:
# Exploratory lookup of director names directly on the crew table.
# NOTE(review): 'directors' is split on ',' further down, so it appears to hold
# comma-separated ids; rows listing several ids cannot match a single nconst here.
df_crew_named = df_crew.merge(df_names, how='left', left_on='directors', right_on='nconst')
df_crew_named.drop(columns=['nconst', 'birthYear', 'deathYear', 'primaryProfession'], inplace=True)

In [30]:
df_crew_named.rename(columns={'primaryName' : 'directors_name',
                              'knownForTitles' : 'known_for_directing'}, inplace=True)

In [None]:
# Number of commas in 'writers' per row, largest first.
df_crew_named['writers'].str.count(',').sort_values(ascending=False)

In [None]:
# Inspect a single row by position.
# NOTE(review): presumably the row surfaced by the count above — the hard-coded
# position depends on the exact input file; confirm before relying on it.
df_crew_named.iloc[427635]

### IMDB Final / Merging

In [33]:
# Start the final IMDB table: the movie list with the raw crew columns attached
# (left join on tconst, so every listed movie is kept).
df_imdb_final = df_movie_list_years.merge(df_crew, how='left', on='tconst')

In [None]:
df_imdb_final.shape

In [None]:
# How many different comma counts occur in 'directors'.
df_imdb_final['directors'].str.count(',').nunique()

In [None]:
# Number of listed directors per movie (commas + 1) and the share of each count
# in percent of all rows.
df_imdb_final['directors_count'] = df_imdb_final['directors'].str.count(',') + 1
df_imdb_final['directors_count'].value_counts()/len(df_imdb_final) * 100

In [37]:
df_imdb_final[['director', 'director2', 'director3']] = df_imdb_final['directors'].str.split(',', n=2, expand=True)
df_imdb_final.drop(columns='directors', inplace=True)

In [None]:
df_imdb_final['writers_count'] = df_imdb_final['writers'].str.count(',') + 1
df_imdb_final['writers_count'].value_counts()/len(df_imdb_final[df_imdb_final['writers_count'].notnull()]) * 100

In [39]:
df_imdb_final[['writer', 'writer2', 'writer3']] = df_imdb_final['writers'].str.split(',', n=2, expand=True)
df_imdb_final.drop(columns='writers', inplace=True)

In [None]:
df_imdb_final

### Set option

In [41]:
pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', 10)

In [42]:
df_imdb_final = df_imdb_final.merge(df_names[['nconst', 'primaryName']], how='left', left_on='director', right_on='nconst')
df_imdb_final.drop(columns='nconst', inplace=True)
df_imdb_final.rename(columns={'primaryName' : 'director_name'}, inplace=True)
df_imdb_final = df_imdb_final.merge(df_names[['nconst', 'primaryName']], how='left', left_on='director2', right_on='nconst')
df_imdb_final.drop(columns='nconst', inplace=True)
df_imdb_final.rename(columns={'primaryName' : 'director2_name'}, inplace=True)
df_imdb_final = df_imdb_final.merge(df_names[['nconst', 'primaryName']], how='left', left_on='director3', right_on='nconst')
df_imdb_final.drop(columns='nconst', inplace=True)
df_imdb_final.rename(columns={'primaryName' : 'director3_name'}, inplace=True)

In [43]:
df_imdb_final = df_imdb_final.merge(df_names[['nconst', 'primaryName']], how='left', left_on='writer', right_on='nconst')
df_imdb_final.drop(columns='nconst', inplace=True)
df_imdb_final.rename(columns={'primaryName' : 'writer_name'}, inplace=True)
df_imdb_final = df_imdb_final.merge(df_names[['nconst', 'primaryName']], how='left', left_on='writer2', right_on='nconst')
df_imdb_final.drop(columns='nconst', inplace=True)
df_imdb_final.rename(columns={'primaryName' : 'writer2_name'}, inplace=True)
df_imdb_final = df_imdb_final.merge(df_names[['nconst', 'primaryName']], how='left', left_on='writer3', right_on='nconst')
df_imdb_final.drop(columns='nconst', inplace=True)
df_imdb_final.rename(columns={'primaryName' : 'writer3_name'}, inplace=True)

In [None]:
df_imdb_final

### Principals

In [None]:
# Boolean flag per credit: is it an acting credit?
df_principals['category'].isin(['actor', 'actress'])

In [None]:
# Distinct values of the 'ordering' column.
df_principals['ordering'].unique()

In [None]:
# Number of titles that have at least one principal credit.
df_principals['tconst'].nunique()

In [48]:
# Not executed — kept for reference: share of acting credits per ordering,
# relative to all titles.
# df_principals[df_principals['category'].isin(['actor', 'actress'])].groupby('ordering').size() / (df_principals['tconst'].nunique()) * 100

In [49]:
# Not executed — kept for reference: same share, relative to titles that have
# an acting credit.
# df_principals[df_principals['category'].isin(['actor', 'actress'])].groupby('ordering').size() / (df_principals.loc[df_principals['category'].isin(['actor', 'actress']), ['tconst']].nunique()[0]) * 100

In [50]:
# Work on a copy so the raw principals table stays untouched.
df_principals_filtered = df_principals.copy()

In [51]:
# Keep acting credits only.
df_principals_filtered = df_principals_filtered[df_principals_filtered['category'].isin(['actor', 'actress'])]

In [None]:
df_principals_filtered['category'].unique()

In [53]:
# Keep credits with an ordering value of at most 5. The filter runs after the
# category filter, so a title can retain fewer than five acting credits.
df_principals_filtered = df_principals_filtered[df_principals_filtered['ordering'] <= 5]

In [None]:
df_principals_filtered['ordering'].unique()

In [None]:
# Number of titles left after both filters.
df_principals_filtered['tconst'].nunique()

In [56]:
df_principals_filtered = df_principals_filtered.merge(df_names[['nconst', 'primaryName']], how='left', on='nconst')

In [57]:
df_principals_filtered.rename(columns={'primaryName' : 'a_name'}, inplace=True)

In [58]:
df_principals_filtered = df_principals_filtered.pivot(index='tconst', columns='ordering', values='a_name')

In [59]:
df_principals_filtered = df_principals_filtered.reset_index()

In [None]:
df_principals_filtered.rename_axis(None)

In [61]:
df_principals_filtered.rename_axis('', axis=1, inplace=True)

In [62]:
df_principals_filtered.rename(columns={1 : 'acting',
                                       2 : 'acting2',
                                       3 : 'acting3',
                                       4 : 'acting4',
                                       5 : 'acting5'}, inplace=True)

In [None]:
df_principals_filtered

In [64]:
df_imdb_final = df_imdb_final.merge(df_principals_filtered, how='left', on='tconst')

In [65]:
df_akas.rename(columns={'titleId' : 'tconst'}, inplace=True)

### Ratings

In [66]:
df_imdb_final = df_imdb_final.merge(df_ratings, how='left', on='tconst')

In [None]:
df_imdb_final

### Filtering

In [68]:
# Filtering for years
df_imdb_final = df_imdb_final[(df_imdb_final['startYear'] >= 1997) & (df_imdb_final['startYear'] <= 2023)]

In [None]:
df_imdb_final[df_imdb_final['tconst'] == 'tt7846844']

In [70]:
df_imdb_final = df_imdb_final[df_imdb_final['titleType'] == 'movie']

In [71]:
df_imdb_final.drop(columns=['director', 'director2', 'director3', 'writer', 'writer2', 'writer3'], inplace=True)

In [72]:
df_imdb_final.rename(columns={'titleType' : 'title_type',
                              'primaryTitle' : 'primary_title',
                              'originalTitle' : 'original_title',
                              'isAdult' : 'is_adult',
                              'startYear' : 'year',
                              'endYear' : 'end_year',
                              'runtimeMinutes' : 'runtime',
                              'averageRating' : 'average_rating',
                              'numVotes' : 'num_votes'}, inplace=True)

In [73]:
df_imdb_final.drop(columns=['end_year'], inplace=True)

In [None]:
df_imdb_final[df_imdb_final['average_rating'].notnull()].count()

In [None]:
df_imdb_final.isnull().sum()

In [76]:
df_imdb_final.to_csv('./Data/df_imdb_final.csv')

In [None]:
df_imdb_final.shape

### Genre cleaning

In [None]:
df_imdb_final[df_imdb_final['genres'].str.count(',') == 2.0]

In [79]:
df_imdb_final['genres_count'] = df_imdb_final['genres'].str.count(',') + 1

In [None]:
df_imdb_final.groupby('genres_count').size() / (len(df_imdb_final)) * 100

In [81]:
df_imdb_final[['genre', 'genre2', 'genre3']] = df_imdb_final['genres'].str.split(',', n=2, expand=True)

In [82]:
df_imdb_final.drop(columns='genres', inplace=True)

In [None]:
df_imdb_final[df_imdb_final['genre'].isin(['Talk-Show', 'Reality-TV', 'Game-Show'])]

In [None]:
df_imdb_final.head()

### Dropping NaN Ratings

In [None]:
df_imdb_final[df_imdb_final['average_rating'].isnull()]

In [86]:
df_imdb_final = df_imdb_final[df_imdb_final['average_rating'].notnull()]

### Revisiting the genres

In [None]:
df_imdb_final['genre'].unique()

In [88]:
df_imdb_final = df_imdb_final[~df_imdb_final['genre'].isin(['Talk-Show', 'Reality-TV'])]
df_imdb_final = df_imdb_final[~df_imdb_final['genre2'].isin(['Talk-Show', 'Reality-TV'])]
df_imdb_final = df_imdb_final[~df_imdb_final['genre3'].isin(['Talk-Show', 'Reality-TV', 'Game-Show'])]

In [None]:
# Inspect the movies tagged 'News' in each of the three genre columns.
df_imdb_final[df_imdb_final['genre'] == 'News']

In [None]:
df_imdb_final[df_imdb_final['genre2'] == 'News']

In [None]:
df_imdb_final[df_imdb_final['genre3'] == 'News']

In [None]:
# Distinct values left in the second and third genre column.
df_imdb_final['genre2'].unique()

In [None]:
df_imdb_final['genre3'].unique()

In [94]:
# Switch pandas to copy-on-write mode for the cells that follow.
# NOTE(review): earlier cells ran without it; consider moving this option to a
# setup cell at the top so the whole notebook runs under one mode.
pd.options.mode.copy_on_write = True

In [95]:
# Replace None placeholders with NaN throughout the frame.
df_imdb_final.replace({None : np.nan}, inplace=True)

In [None]:
# Movies without any genre.
df_imdb_final[df_imdb_final['genre'].isnull()]

### Votes

In [None]:
df_imdb_final[df_imdb_final['num_votes'] <= 5]

In [None]:
df_imdb_final.columns

In [99]:
df_imdb_final.drop(columns='title_type', inplace=True)

In [100]:
df_imdb_final = df_imdb_final[['tconst', 'primary_title', 'original_title', 'year', 'runtime', 'num_votes', 'average_rating', 'genres_count', 'genre', 'genre2', 'genre3', 'acting', 'acting2', 'acting3', 'acting4', 'acting5', 
              'directors_count', 'director_name', 'director2_name', 'director3_name', 'writers_count', 'writer_name', 'writer2_name', 'writer3_name', 'is_adult']] 

In [None]:
df_imdb_final

In [None]:
display(df_admissions_EU_clean['title'].nunique())
display(df_admissions_NA_clean.shape)

In [None]:
df_admissions_NA_clean['distributor'].nunique()

### Handling title duplicates after merging
- left merged IMDB with EU & NA on 'title' and 'year'
- the new df had 4 additional rows, coming from cases in which the same title and year occur twice in the EU or NA data for different movies (e.g. Paparazzi (FR & IT))
- additionally, there were 168 duplicates among IMDB movies that had box office matches but share their title and year with another movie
- next step: checking duplicates by hand and dropping unwanted matches

In [None]:
# All IMDB movies that share their primary title and year with another movie,
# sorted so that the colliding rows sit next to each other.
df_imdb_final[
    df_imdb_final.duplicated(subset=['primary_title', 'year'], keep=False)
].sort_values(by='primary_title')

### Changing years to fit NA/EU data

In [105]:
df_imdb_final.loc[df_imdb_final['tconst'] == 'tt4776998', 'year'] = 2017

In [None]:
df_imdb_final.loc[df_imdb_final['tconst'] == 'tt4776998', 'year']

In [107]:
df_admissions_NA_clean['release_year'] = df_admissions_NA_clean['release_date'].dt.year
duplicates = df_imdb_final.merge(df_admissions_NA_clean, how='left', left_on=['primary_title', 'year'], right_on=['title', 'release_year']).merge(df_admissions_EU_clean, how='left', left_on=['primary_title', 'year'], right_on=['title', 'year'])

In [None]:
duplicates.shape

In [109]:
mask = ((~duplicates['tickets_sold_x'].isnull()) | (~duplicates['tickets_sold_y'].isnull())) & (duplicates[['primary_title', 'year']].duplicated(keep=False))

### Dropping duplicated movies

In [110]:
# IMDB ids (tconst) of the movies to remove. According to the notes above the
# list was compiled by checking the title/year duplicates by hand.
films_to_drop = ['tt7596520', 'tt6869948', 'tt3114786', 'tt10381532', 'tt15302566', 'tt8878922', 'tt2159988', 'tt3833746', 'tt7232438', 'tt4192918', 'tt6353886',
                 'tt0989719', 'tt0384344', 'tt5317732', 'tt27834173', 'tt11719504', 'tt0356736', 'tt0462326', 'tt7535780', 'tt4982356', 'tt4155318', 'tt4935746',
                 'tt2943946', 'tt10530176', 'tt1472211', 'tt0326506', 'tt1825978', 'tt3660078', 'tt5211596', 'tt5738152', 'tt7785302', 'tt2660118', 'tt3315656',
                 'tt6147768', 'tt3604256', 'tt9278312', 'tt0211634', 'tt15751968', 'tt22982202', 'tt21850902', 'tt8372368', 'tt7762424', 'tt1130090', 'tt2514592',
                 'tt13830296', 'tt29979669', 'tt1826727', 'tt6184774', 'tt1537401', 'tt15342012', 'tt21158466', 'tt0465552', 'tt2558484', 'tt5884960', 'tt3985956',
                 'tt0238251', 'tt2230954', 'tt21379234', 'tt1613084', 'tt20877624', 'tt22037488', 'tt9204088', 'tt19880966', 'tt5897002', 'tt1327709', 'tt7002100',
                 'tt21254598', 'tt5089556', 'tt0372361', 'tt1586713', 'tt11006500', 'tt4317898', 'tt1945037', 'tt14661820', 'tt8706138', 'tt14220888', 'tt26342791',
                 'tt0160395', 'tt10192566', 'tt10332854', 'tt2049386', 'tt26505281', 'tt19512922', 'tt2447982', 'tt15119398']

In [None]:
# Show the merged rows that are about to be removed.
duplicates[duplicates['tconst'].isin(films_to_drop)]

### Dropping list of duplicates from both Dataframes

In [112]:
duplicates.drop(duplicates[duplicates['tconst'].isin(films_to_drop)].index, inplace=True)

In [113]:
df_imdb_final.drop(df_imdb_final[df_imdb_final['tconst'].isin(films_to_drop)].index, inplace=True)

### Handling the 2 remaining duplicated movies (renamed instead of dropped)

In [None]:
# Two different movies share the title 'Paparazzi'.
df_imdb_final[df_imdb_final['primary_title'] == 'Paparazzi']

In [115]:
# Tell them apart by adding a country tag to the title.
df_imdb_final.loc[df_imdb_final['tconst'] == 'tt0133314', 'primary_title'] = 'Paparazzi (FR)'
df_imdb_final.loc[df_imdb_final['tconst'] == 'tt0174105', 'primary_title'] = 'Paparazzi (IT)'

In [None]:
df_imdb_final.loc[df_imdb_final['tconst'] == 'tt0133314', 'primary_title']

In [None]:
# Same situation for 'Beast'.
df_imdb_final[df_imdb_final['primary_title'] == 'Beast']

In [118]:
df_imdb_final.loc[df_imdb_final['tconst'] == 'tt11301946', 'primary_title'] = 'Beast (IN)'
df_imdb_final.loc[df_imdb_final['tconst'] == 'tt13223398', 'primary_title'] = 'Beast (US)'

In [None]:
df_admissions_EU_clean[df_admissions_EU_clean['title'] == 'Paparazzi']

In [120]:
# Apply the same tags in the EU table.
# NOTE(review): the list is assigned by position, so this relies on exactly two
# matching rows in the order shown by the cell above; after one run no row
# carries the plain title any more, so the cell is not safe to re-run.
df_admissions_EU_clean.loc[df_admissions_EU_clean['title'] == 'Paparazzi', 'title'] = ['Paparazzi (FR)', 'Paparazzi (IT)']

In [None]:
df_admissions_NA_clean[df_admissions_NA_clean['title'] == 'Paparazzi']

In [None]:
df_admissions_EU_clean[df_admissions_EU_clean['title'] == 'Beast']

In [123]:
# Every EU row titled 'Beast' gets the US tag.
df_admissions_EU_clean.loc[df_admissions_EU_clean['title'] == 'Beast', 'title'] = 'Beast (US)'

In [None]:
df_admissions_NA_clean[df_admissions_NA_clean['title'] == 'Beast']

In [125]:
# NOTE(review): positional assignment again — relies on exactly two 'Beast'
# rows in the order shown by the cell above, and is not safe to re-run.
df_admissions_NA_clean.loc[df_admissions_NA_clean['title'] == 'Beast', 'title'] = ['Beast (IN)', 'Beast (US)']

In [None]:
df_imdb_final[df_imdb_final['primary_title'] == 'The Visit']

In [None]:
duplicates[duplicates['primary_title'] == 'The Visit']

In [128]:
# Match every EU box-office row to IMDB on original title and year;
# rows without a match keep NaN in the IMDB columns.
df_no_data_movie = df_admissions_EU_clean.merge(df_imdb_final, how='left', left_on=['title', 'year'], right_on=['original_title', 'year'])

In [None]:
# EU rows that found no IMDB movie.
df_no_data_movie[df_no_data_movie['tconst'].isnull()]

In [None]:
df_imdb_final[df_imdb_final['tconst'] == 'tt1640202']

### Uploading to SQL

In [131]:
# Database connection and target names for the upload.
# NOTE(review): these imports sit in the middle of the notebook; consider moving
# them to the import cell at the top.
from sql_functions import get_engine, schema
import psycopg2
engine = get_engine()
# Rebinds the imported name `schema` from the function to its return value.
schema = schema()
# Target table names: EU admissions, NA admissions, IMDB data.
table_name = 'movie_data_EU'
table_name2 = 'movie_data_NA'
table_name3 = 'IMDB_data'

In [None]:
# Write records stored in a dataframe to SQL database
# The three cleaned frames are uploaded in order, one table each. The first
# failure prints the error and resets `engine` to None, which skips the
# remaining uploads.
for frame, target_table in (
    (df_admissions_EU_clean, table_name),
    (df_admissions_NA_clean, table_name2),
    (df_imdb_final, table_name3),
):
    if engine is None:
        break
    try:
        frame.to_sql(target_table, # Name of SQL table
                     con=engine, # Engine or connection
                     if_exists='replace', # Drop the table before inserting new values
                     schema=schema, # your class schema
                     index=False, # Do not write the DataFrame index as a column
                     chunksize=5000, # Specify the number of rows in each batch to be written at a time
                     method='multi') # Pass multiple values in a single INSERT clause
        print(f"The {target_table} table was imported successfully.")
    # Error handling: database errors derive from Exception, so one clause covers them
    except Exception as error:
        print(error)
        engine = None