# Combine Box_Office

Currently, we have one box-office value per country for each movie. However, the "Worldwide" column is not filled in for every movie.
We therefore need to calculate the worldwide total ourselves by summing all the country values we have.

But first, let's import the box_office_data with inflated values

Import table

In [157]:
# import all packages
import pandas as pd
import numpy as np
import psycopg2 as psycopg2
import sql_functions as sqlf

# Render floats without decimals and with "_" as the thousands separator.
pd.set_option("display.float_format", "{:_.0f}".format)

In [214]:
# Show up to 300 columns when a DataFrame is displayed.
# Use the registered option name: "display.max.columns" is not a real key and
# only resolved because pandas treats the argument as a regex pattern.
pd.set_option("display.max_columns", 300)

In [None]:
# Database schema the source table is read from; the same name is passed to
# `to_sql` when the result is written back at the end of the notebook.
schema = "capstone_24_4_group1"
display(schema)
# Select every column of the "box_office_data_inflated" table in that schema.
box_query = f'''   SELECT *
                    FROM {schema}."box_office_data_inflated"
                    '''
# NOTE(review): `sqlf.get_dataframe` is a project helper; presumably it runs
# the query and returns the result as a DataFrame -- confirm.
box_df = sqlf.get_dataframe(box_query)

In [None]:
# Preview the first rows of the freshly loaded table.
box_df.head()

Correct Column Names

In [183]:
# Remove parentheses, single quotes and commas from every column label and
# trim the surrounding whitespace.
# The cleaned labels go into a NEW list: the previous loop wrote into
# `box_df.columns.values`, i.e. the array backing the live Index, which
# changes the labels behind pandas' back before they are formally assigned.
heading_list = [
    heading.replace("(", "").replace("'", "").replace(",", "").replace(")", "").strip()
    for heading in box_df.columns
]

In [None]:
# Install the cleaned labels as the frame's column names and check the result.
box_df.columns = heading_list
box_df.head()

### Create Sum for inflated values

In [None]:
# Boolean column mask: keep the movie id ("tconst...") and every
# inflation-adjusted ("inflated...") column.
column_names = box_df.columns.str
inflated_filter = column_names.startswith("tconst") | column_names.startswith("inflated")

# Work on an independent copy so later edits never touch box_df itself.
calculation_df = box_df.loc[:, inflated_filter].copy()
calculation_df.head()

In [None]:
# The "Worldwide" and "International" columns are already aggregates; remove
# them so they are not counted again in the row total.
aggregate_columns = ["inflated_values Worldwide", "inflated_values International"]
calculation_df.drop(columns=aggregate_columns, inplace=True)
calculation_df

In [187]:
# Row-wise total of all remaining inflated columns.
# The id column is excluded by name; the previous `iloc[:, 1:]` only worked as
# long as `tconst` happened to be the first column of the frame.
# NOTE(review): `sum` skips missing values, so a row without any figure gets a
# total of 0 rather than NaN -- confirm that this is intended.
calculation_df["world_inflated_values"] = calculation_df.drop(columns="tconst").sum(axis=1)

In [None]:
# Display the frame including the new "world_inflated_values" column.
calculation_df

In [None]:
# Attach the calculated inflated total to the full table, matching on the
# movie id.
inflated_totals = calculation_df[["tconst", "world_inflated_values"]]
box_df_added = pd.merge(left=box_df, right=inflated_totals, on="tconst", how="inner")
box_df_added

In [None]:
# NOTE(review): this shows the shape of the original `box_df`, not of the
# merged `box_df_added`; presumably it is meant as the reference for checking
# that the merge kept the row count -- confirm.
box_df.shape

### Create Sum for non_inflated values

In [None]:
# Boolean column mask: keep the movie id ("tconst...") and every "value..."
# column. The variable keeps its earlier name although it now selects the
# non-inflated columns.
column_names = box_df.columns.str
inflated_filter = column_names.startswith("tconst") | column_names.startswith("value")

# Independent copy, minus the two columns that are already aggregates.
calculation_df = box_df.loc[:, inflated_filter].copy()
aggregate_columns = ["value Worldwide", "value International"]
calculation_df.drop(columns=aggregate_columns, inplace=True)
calculation_df

In [192]:
# Row-wise total of all remaining "value" columns.
# The id column is excluded by name; the previous `iloc[:, 1:]` only worked as
# long as `tconst` happened to be the first column of the frame.
# NOTE(review): `sum` skips missing values, so a row without any figure gets a
# total of 0 rather than NaN -- confirm that this is intended.
calculation_df["world_value"] = calculation_df.drop(columns="tconst").sum(axis=1)

In [None]:
# Display the frame including the new "world_value" column.
calculation_df

In [None]:
# Attach the calculated non-inflated total to the table that already carries
# the inflated total, matching on the movie id.
value_totals = calculation_df[["tconst", "world_value"]]
box_df_added = pd.merge(left=box_df_added, right=value_totals, on="tconst", how="inner")
box_df_added

In [None]:
# NOTE(review): this shows the shape of the original `box_df`, not of the
# merged `box_df_added`; presumably it is meant as the reference for checking
# that the merge kept the row count -- confirm.
box_df.shape

### Compare calculated worldwide with IMDB worldwide

In [None]:
# Side-by-side table of the IMDB worldwide figure and our own sum, limited to
# rows that have an IMDB worldwide figure and ordered by that figure.
has_imdb_total = box_df_added["value Worldwide"].notnull()
comparison_columns = ["tconst", "value Worldwide", "world_value"]
worldwide_comparison = box_df_added.loc[has_imdb_total, comparison_columns]
worldwide_comparison = worldwide_comparison.sort_values(by="value Worldwide")
worldwide_comparison.columns = ["tconst", "imdb_worldwide", "calculated_worldwide"]
worldwide_comparison

In [None]:
# Absolute gap: IMDB worldwide minus our calculated worldwide.
imdb_minus_calculated = (
    worldwide_comparison["imdb_worldwide"] - worldwide_comparison["calculated_worldwide"]
)
worldwide_comparison["difference"] = imdb_minus_calculated

# Smallest differences first (6 rows), then the 50 largest differences.
smallest_first = worldwide_comparison.sort_values(by="difference")
largest_first = worldwide_comparison.sort_values(by="difference", ascending=False)
display(smallest_first.head(6))
display(largest_first.head(50))

In [None]:
# Gap expressed as a percentage of the calculated worldwide value.
relative_gap = worldwide_comparison["difference"] / worldwide_comparison["calculated_worldwide"]
worldwide_comparison["difference_perc"] = relative_gap * 100

# The 30 rows with the largest percentage gap.
largest_relative_gaps = worldwide_comparison.sort_values(by="difference_perc", ascending=False)
display(largest_relative_gaps.head(30))

In [198]:
import seaborn as sns

In [199]:
import matplotlib.pyplot as plt

In [None]:
# Histogram (50 bins) of the IMDB-minus-calculated difference on a 12x8 figure.
plt.figure(figsize=(12, 8))
sns.histplot(worldwide_comparison, x="difference", bins=50)
plt.show()

In [None]:
# Rows whose percentage gap is at most 10; this also includes every row with a
# negative gap (calculated value above the IMDB value).
worldwide_comparison[worldwide_comparison["difference_perc"] <= 10]

### Plan:

- Whenever Calculated > IMDB (5 cases) = Keep calculated as is
- Whenever Calculated < IMDB (many cases) = Overwrite Calculated with IMDB
<br><br>
- After values are overwritten, drop IMDB and keep calculated only.

In [None]:
# Preview the merged table before the worldwide columns are reconciled.
box_df_added.head()

In [None]:
# Gap between the IMDB inflated worldwide figure and our calculated inflated
# total (IMDB minus calculated).
imdb_inflated = box_df_added["inflated_values Worldwide"]
calculated_inflated = box_df_added["world_inflated_values"]
box_df_added["worldwide_difference"] = imdb_inflated - calculated_inflated
box_df_added.head()

In [218]:
# Wherever IMDB reports more than we calculated (positive gap), take the IMDB
# figure as the worldwide total; all other rows keep the calculated value.
positive_difference_mask = box_df_added["worldwide_difference"] > 0
imdb_inflated_totals = box_df_added.loc[positive_difference_mask, "inflated_values Worldwide"]
box_df_added.loc[positive_difference_mask, "world_inflated_values"] = imdb_inflated_totals

In [None]:
# Recompute the gap after the overwrite. It is expected to be zero everywhere
# except in the rows where the calculated value was above IMDB.
imdb_inflated = box_df_added["inflated_values Worldwide"]
reconciled_inflated = box_df_added["world_inflated_values"]
box_df_added["worldwide_difference_after"] = imdb_inflated - reconciled_inflated
box_df_added.head()

In [None]:
# Sanity check: over the overwritten rows the remaining gap should sum to 0.
overwritten_gap = box_df_added.loc[positive_difference_mask, "worldwide_difference_after"]
overwritten_gap.sum()

In [224]:
# Remove the two helper columns and the IMDB aggregate columns, which are no
# longer needed now that "world_inflated_values" holds the reconciled total.
obsolete_columns = [
    "worldwide_difference",
    "worldwide_difference_after",
    "inflated_values International",
    "inflated_values Worldwide",
]
box_df_added.drop(columns=obsolete_columns, inplace=True)

### Repeat for non_inflated values

In [None]:
# Gap between the IMDB worldwide figure and our calculated total for the
# non-inflated values (IMDB minus calculated).
imdb_value = box_df_added["value Worldwide"]
calculated_value = box_df_added["world_value"]
box_df_added["worldwide_difference"] = imdb_value - calculated_value
box_df_added.head()

In [226]:
# Wherever IMDB reports more than we calculated (positive gap), take the IMDB
# figure as the worldwide total; all other rows keep the calculated value.
positive_difference_mask = box_df_added["worldwide_difference"] > 0
imdb_value_totals = box_df_added.loc[positive_difference_mask, "value Worldwide"]
box_df_added.loc[positive_difference_mask, "world_value"] = imdb_value_totals

In [None]:
# Recompute the gap after the overwrite. It is expected to be zero everywhere
# except in the rows where the calculated value was above IMDB.
imdb_value = box_df_added["value Worldwide"]
reconciled_value = box_df_added["world_value"]
box_df_added["worldwide_difference_after"] = imdb_value - reconciled_value
box_df_added.head()

In [None]:
# Sanity check: over the overwritten rows the remaining gap should sum to 0.
overwritten_gap = box_df_added.loc[positive_difference_mask, "worldwide_difference_after"]
overwritten_gap.sum()

In [229]:
# Remove the two helper columns and the IMDB aggregate columns, which are no
# longer needed now that "world_value" holds the reconciled total.
obsolete_columns = [
    "worldwide_difference",
    "worldwide_difference_after",
    "value International",
    "value Worldwide",
]
box_df_added.drop(columns=obsolete_columns, inplace=True)

In [None]:
# Display the final table with the reconciled "world_inflated_values" and
# "world_value" columns.
box_df_added

### DONE: Worldwide values for inflated and non-inflated columns are calculated and aligned with IMDB online Data



In [236]:
# Name of the table the result is written to.
# NOTE(review): the name contains upper-case letters; on PostgreSQL such a
# table presumably has to be double-quoted in later queries -- confirm.
table_name = 'FINAL_box_office_data_inflated'

In [None]:
# NOTE(review): `sqlf.get_engine` is a project helper; presumably it returns a
# SQLAlchemy engine, or None when no connection could be set up -- confirm.
engine = sqlf.get_engine()
engine

In [None]:
# Write the final DataFrame to the database, replacing an existing table of
# the same name. Failures are reported but do not abort the notebook.
if engine is not None:
    try:
        box_df_added.to_sql(
            table_name,           # name of the target SQL table
            con=engine,           # engine or connection to write through
            if_exists="replace",  # drop an existing table before inserting
            schema=schema,        # target schema
            index=False,          # do NOT write the DataFrame index as a column
            chunksize=5000,       # number of rows written per batch
            method="multi",       # pass multiple rows in a single INSERT clause
        )
    except Exception as error:
        # psycopg2.DatabaseError is itself an Exception subclass, so one
        # handler covers both of the types that were listed before.
        print(error)
        engine = None
    else:
        print(f"The {table_name} table was imported successfully.")
else:
    print(f"No database engine available; the {table_name} table was not written.")