# Goal
One Notebook that takes the raw BoxOffice data and transforms it until it could be uploaded and worked with.
-> Combining all the singular steps with followed in different notebooks. 

In [20]:
# import all packages
import pandas as pd
import numpy as np
import psycopg2 as psycopg2
import sql_functions as sqlf

pd.options.display.float_format= "{:_.0f}".format

In [21]:
schema = "capstone_24_4_group1"
display(schema)
box_query = f'''   SELECT *
                    FROM {schema}."raw_scraped_box_office"
                    '''
box_office_df = sqlf.get_dataframe(box_query)

'capstone_24_4_group1'

In [22]:
box_office_df

Unnamed: 0,tconst,region,value,release_group
0,tt0035423,Domestic,"$47,121,859",2
1,tt0035423,Germany,"$4,482,954",2
2,tt0035423,Italy,"$2,050,485",2
3,tt0035423,Spain,"$1,194,549",2
4,tt0035423,Czech Republic,"$732,897",2
...,...,...,...,...
155702,tt9908390,Ukraine,"$28,076",2
155703,tt9908390,Croatia,"$5,330",2
155704,tt9908390,Portugal,"$3,690",2
155705,tt9908390,Russia/CIS,"$32,389",2


In [23]:
# Transformation of colum [value]
box_office_df.loc[ : , "value"] = box_office_df["value"].str.replace("$", "")
box_office_df.loc[ : , "value"] = box_office_df["value"].str.replace(",", "")
box_office_df["value"] = box_office_df["value"].astype("Int64")

In [24]:
# Transformation of colum [release_group]
box_office_df["release_group"] = box_office_df["release_group"].astype("Int64")

In [25]:
box_office_df.dropna(inplace=True)

### Release Group Dropping

Drop all Domestic values for Release Group 0 and 1 because they have the sum of those values in releasegroup 2

In [26]:
mask_domestic_non_lifetime = ((box_office_df["release_group"] == 0) | (box_office_df["release_group"] == 1)) & (box_office_df["region"]=="Domestic")
box_office_df[mask_domestic_non_lifetime]

Unnamed: 0,tconst,region,value,release_group
23,tt0118589,Domestic,4274407,0
41,tt0118694,Domestic,2738980,0
64,tt0118715,Domestic,18718818,0
67,tt0118715,Domestic,582585,1
88,tt0118789,Domestic,2375097,0
...,...,...,...,...
153759,tt8991268,Domestic,815082,0
154149,tt9116358,Domestic,2529324,0
154653,tt9362722,Domestic,381311319,0
154656,tt9362722,Domestic,282435,1


In [27]:
box_office_df.drop(box_office_df[((box_office_df["release_group"] == 0) | (box_office_df["release_group"] == 1)) & (box_office_df["region"] == "Domestic")].index, inplace = True)

In [28]:
box_office_df.shape

(152256, 4)

Now sum all international/worldwide Release to have them as lifetime values

In [29]:
bx_df_new = box_office_df[(box_office_df["release_group"] == 0) | (box_office_df["release_group"] == 1)].groupby(["tconst", "region"]).sum()
bx_df_new["release_group"] = 2
bx_df_new.reset_index(inplace=True)

In [30]:
box_office_df = pd.concat([box_office_df,bx_df_new])
box_office_df.drop(box_office_df[(box_office_df["release_group"] == 0) | (box_office_df["release_group"] == 1)].index, inplace = True)
box_office_df.drop(columns="release_group", inplace = True)

In [31]:
box_office_df

Unnamed: 0,tconst,region,value
0,tt0035423,Domestic,47121859
1,tt0035423,Germany,4482954
2,tt0035423,Italy,2050485
3,tt0035423,Spain,1194549
4,tt0035423,Czech Republic,732897
...,...,...,...
2294,tt9116358,Worldwide,9869306
2295,tt9362722,International,309230984
2296,tt9362722,Worldwide,690824738
2297,tt9608818,International,131781


### Inflation Correction

Import IMDB TIckets data and merge year to box_office

In [32]:
imdb_tickets_query = f'''   SELECT *
                    FROM {schema}."IMDB_tickets_data"
                    '''
imdb_df = sqlf.get_dataframe(imdb_tickets_query)
imdb_df.head()

Unnamed: 0,tconst,primary_title,original_title,year,runtime,num_votes,average_rating,genres_count,genre,genre2,...,director3_name,writers_count,writer_name,writer2_name,writer3_name,EU_since_1996,EU_tickets_sold,distributor,NA_gross_sales,NA_tickets_sold
0,tt0035423,Kate & Leopold,Kate & Leopold,2001,118,89944,6,3,Comedy,Fantasy,...,,2,Steven Rogers,James Mangold,,2_497_656,2_481_644,Miramax,47_095_453,8_245_453
1,tt0117786,Mr. Nice Guy,Yat goh ho yan,1997,88,29154,6,3,Action,Adventure,...,,2,Fibe Ma,Edward Tang,,,,New Line,12_716_953,2_711_503
2,tt0118301,Dead Man on Campus,Dead Man on Campus,1998,96,16474,6,1,Comedy,,...,,4,Anthony Abrams,Adam Larson Broder,,,,Paramount Pictures,15_064_948,3_212_142
3,tt0118564,Affliction,Affliction,1997,114,19572,7,3,Drama,Mystery,...,,2,Russell Banks,Paul Schrader,,,,Lionsgate,6_238_175,1_227_987
4,tt0118589,Glitter,Glitter,2001,104,24170,2,3,Drama,Music,...,,2,Cheryl L. West,Kate Lanier,,,,20th Century Fox,4_273_372,755_012


In [33]:
# Merge sales AND year data to Boxoffice-table
box_office_df = box_office_df.merge(imdb_df[["tconst", "year", "NA_gross_sales"]], how = "inner", left_on = "tconst", right_on = "tconst")

# Resort columns
box_office_df = box_office_df.reindex(["tconst","region", "year", "value", "NA_gross_sales"], axis=1)

In [34]:
box_office_df

Unnamed: 0,tconst,region,year,value,NA_gross_sales
0,tt0035423,Domestic,2001,47121859,47_095_453
1,tt0035423,Germany,2001,4482954,47_095_453
2,tt0035423,Italy,2001,2050485,47_095_453
3,tt0035423,Spain,2001,1194549,47_095_453
4,tt0035423,Czech Republic,2001,732897,47_095_453
...,...,...,...,...,...
149162,tt9908390,Ukraine,2020,28076,
149163,tt9908390,Croatia,2020,5330,
149164,tt9908390,Portugal,2020,3690,
149165,tt9908390,Russia/CIS,2020,32389,


In [36]:
import cpi
#cpi.update()

  df = pd.read_csv(io.StringIO(response.text), sep="\t")
  df = pd.read_csv(io.StringIO(response.text), sep="\t")


In [37]:
# drop year 1997 because of missing values in inflation correction table
box_office_df.drop(box_office_df[box_office_df["year"] == 1997].index, inplace = True)

In [38]:
# inflation values for values
unique_years = box_office_df['year'].unique()
inflation_factors = {year: cpi.inflate(1, year, to=2023, items="Admission to movies, theaters, and concerts") for year in unique_years}

box_office_df['inflation_factor'] = box_office_df['year'].map(inflation_factors)

box_office_df['inflated_values'] = box_office_df['value'] * box_office_df['inflation_factor']

display(box_office_df)

Unnamed: 0,tconst,region,year,value,NA_gross_sales,inflation_factor,inflated_values
0,tt0035423,Domestic,2001,47121859,47_095_453,2,86_702_232
1,tt0035423,Germany,2001,4482954,47_095_453,2,8_248_446
2,tt0035423,Italy,2001,2050485,47_095_453,2,3_772_806
3,tt0035423,Spain,2001,1194549,47_095_453,2,2_197_920
4,tt0035423,Czech Republic,2001,732897,47_095_453,2,1_348_500
...,...,...,...,...,...,...,...
149162,tt9908390,Ukraine,2020,28076,,1,32_388
149163,tt9908390,Croatia,2020,5330,,1,6_149
149164,tt9908390,Portugal,2020,3690,,1,4_257
149165,tt9908390,Russia/CIS,2020,32389,,1,37_363


In [40]:
# inflation values for NA_Gross
unique_years = box_office_df['year'].unique()
inflation_factors = {year: cpi.inflate(1, year, to=2023, items="Admission to movies, theaters, and concerts") for year in unique_years}

box_office_df['NA_inflation_factor'] = box_office_df['year'].map(inflation_factors)

box_office_df['NA_inflated_values'] = box_office_df['NA_gross_sales'] * box_office_df['NA_inflation_factor']

display(box_office_df)

Unnamed: 0,tconst,region,year,value,NA_gross_sales,inflation_factor,inflated_values,NA_inflation_factor,NA_inflated_values
0,tt0035423,Domestic,2001,47121859,47_095_453,2,86_702_232,2,86_653_646
1,tt0035423,Germany,2001,4482954,47_095_453,2,8_248_446,2,86_653_646
2,tt0035423,Italy,2001,2050485,47_095_453,2,3_772_806,2,86_653_646
3,tt0035423,Spain,2001,1194549,47_095_453,2,2_197_920,2,86_653_646
4,tt0035423,Czech Republic,2001,732897,47_095_453,2,1_348_500,2,86_653_646
...,...,...,...,...,...,...,...,...,...
149162,tt9908390,Ukraine,2020,28076,,1,32_388,1,
149163,tt9908390,Croatia,2020,5330,,1,6_149,1,
149164,tt9908390,Portugal,2020,3690,,1,4_257,1,
149165,tt9908390,Russia/CIS,2020,32389,,1,37_363,1,


In [42]:
box_office_df.drop(columns=['inflation_factor', 'NA_inflation_factor'], inplace=True)
box_office_df

Unnamed: 0,tconst,region,year,value,NA_gross_sales,inflated_values,NA_inflated_values
0,tt0035423,Domestic,2001,47121859,47_095_453,86_702_232,86_653_646
1,tt0035423,Germany,2001,4482954,47_095_453,8_248_446,86_653_646
2,tt0035423,Italy,2001,2050485,47_095_453,3_772_806,86_653_646
3,tt0035423,Spain,2001,1194549,47_095_453,2_197_920,86_653_646
4,tt0035423,Czech Republic,2001,732897,47_095_453,1_348_500,86_653_646
...,...,...,...,...,...,...,...
149162,tt9908390,Ukraine,2020,28076,,32_388,
149163,tt9908390,Croatia,2020,5330,,6_149,
149164,tt9908390,Portugal,2020,3690,,4_257,
149165,tt9908390,Russia/CIS,2020,32389,,37_363,


### Ok now we pivot, correct worldwide values and melt again -.-

In [43]:
box_office_pivot_df = box_office_df.pivot(index='tconst', columns='region', values=['inflated_values', 'value'])

In [44]:
box_office_pivot_df

Unnamed: 0_level_0,inflated_values,inflated_values,inflated_values,inflated_values,inflated_values,inflated_values,inflated_values,inflated_values,inflated_values,inflated_values,...,value,value,value,value,value,value,value,value,value,value
region,Albania,Argentina,Aruba,Australia,Austria,Bahrain,Baltic States,Bangladesh,Belgium,Bolivia,...,Türkiye,Ukraine,United Arab Emirates,United Kingdom,Uruguay,Venezuela,Vietnam,West Indies,Worldwide,Yugoslavia
tconst,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
tt0035423,,277_749,,5_944_883,840_925,,,,,,...,,,,292444,,,,,,
tt0118301,,,,,,,,,,,...,,,,,,,,,,
tt0118589,,,,63_434,,,,,,,...,,,,26865,,,,,5272594,
tt0118635,,,,,,,,,,,...,,,,,,,,,,
tt0118636,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
tt9883996,,,,2_242_455,,,,,,,...,,,,997705,,,,,,
tt9896876,,,,22_711,,,,,,,...,,,,8256,,,,,,
tt9907782,,,,,,,,,,,...,,,,,,,,,,
tt9908390,,,,,,,,,,,...,,28076,,,,,,,,


In [45]:
box_office_pivot_df.reset_index(inplace=True)

Unnamed: 0_level_0,tconst,inflated_values,inflated_values,inflated_values,inflated_values,inflated_values,inflated_values,inflated_values,inflated_values,inflated_values,...,value,value,value,value,value,value,value,value,value,value
region,Unnamed: 1_level_1,Albania,Argentina,Aruba,Australia,Austria,Bahrain,Baltic States,Bangladesh,Belgium,...,Türkiye,Ukraine,United Arab Emirates,United Kingdom,Uruguay,Venezuela,Vietnam,West Indies,Worldwide,Yugoslavia
0,tt0035423,,277_749,,5_944_883,840_925,,,,,...,,,,292444,,,,,,
1,tt0118301,,,,,,,,,,...,,,,,,,,,,
2,tt0118589,,,,63_434,,,,,,...,,,,26865,,,,,5272594,
3,tt0118635,,,,,,,,,,...,,,,,,,,,,
4,tt0118636,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6153,tt9883996,,,,2_242_455,,,,,,...,,,,997705,,,,,,
6154,tt9896876,,,,22_711,,,,,,...,,,,8256,,,,,,
6155,tt9907782,,,,,,,,,,...,,,,,,,,,,
6156,tt9908390,,,,,,,,,,...,,28076,,,,,,,,


at this point we upload ...

And now the Notebook combine_box_office_worldwide (Jonas Branch)

### Once that is done, We recreated the Long format melting on Gian_Luca's branch Notebook: Box_office_to_long

# And now the final step: Correcting country names Happens on Torbens Branch in "box_office_country_correction_final". THat Noteboook alos contains everythign else that is in this one here.

In [46]:
box_inflated_query = f'''   SELECT *
                    FROM {schema}."box_office_data_inflated"
                    '''
box_inflated_df = sqlf.get_dataframe(box_inflated_query)
box_inflated_df.head()

Unnamed: 0,tconst,country,values,inflated_values
0,tt7399138,Albania,483,566
1,tt10223460,Albania,30_528,32_280
2,tt1051906,Albania,3_359,3_875
3,tt10665342,Albania,9_906,10_475
4,tt10954984,Albania,4_186,4_426
