In [38]:
import pandas as pd

In [39]:
df = pd.read_csv('datasets/imdb_original/IMDB_Scraped.csv', dtype={'num_of_rating': str}, index_col=0)

# Converting non-USD to USD in `budget` column

In [40]:
budget = df['budget']

In [41]:
from forex_python.converter import CurrencyRates
c = CurrencyRates()

#  getting list of currencies
curr = budget.apply(lambda x: re.sub(r'\d+', '', x[:-12]).replace(',', '') if not pd.isna(x) else None).value_counts()

exchange = pd.DataFrame(columns=['currency', 'rate'])
cname = [
    'USD', 'EUR', 'INR', 'GBP', 'CAD', 'AUD', 'SEK ', 'NOK ', 'DEM ', 'RUR ', 'ZAR',
    'DKK', 'FRF ', 'FIM ', 'EGP ', 'HUF ', 'TRL', 'PLN', 'IRR ', 'CNY',
    'JPY', 'KRW', 'HKD', 'BDT ', 'ITL', 'CZK ', 'IDR', 'ESP ', 'MYR ', 'MXN',
    'NZD', 'NLG ', 'SGD ', 'CHF ', 'THB ', 'ARS ', 'PTE ', 'UAH ', 'ROL ',
    'ZAR ', 'NTD', 'ISK ', 'DOP ', 'PKR ', 'PHP', 'ILS', 'BGL ', 'HRK ', 'EEK',
    'AZM ', 'RON ', 'LVL ', 'BEF ', 'LKR ', 'LTL ', 'COP ', 'NPR ', 'AED ',
    'IEP ', 'CLP ', 'VND', 'YUM ', 'SIT ', 'GRD ', 'NGN ', 'VEB ', 'MVR ',
    'ATS ', 'MTL ', 'SAR ', 'PYG ', 'XAU ', 'MNT ', 'ALL ', 'IQD ', 'KWD ',
    'BND ', 'GEL ', 'JMD ', 'TMM '
    ]

In [42]:
#  getting currency rate for every curr in the curr list
rate = {}
for cu in list(curr.index):
    try:
        rate[cu] = c.convert(cname[list(curr.index).index(cu)].strip(), 'USD', 1)
    except:
        rate[cu] = None
rate

{'$': 1.0,
 '€': 1.0858,
 '₹': 0.012251829368057006,
 '£': 1.2287253304363572,
 'CA$': 0.7479506785148448,
 'A$': 0.7018745959922431,
 'SEK\xa0': 0.09782422631650074,
 'NOK\xa0': 0.10129392777513457,
 'DEM\xa0': None,
 'RUR\xa0': None,
 'R$': 0.0578516359683942,
 'DKK\xa0': 0.14594282181212115,
 'FRF\xa0': None,
 'FIM\xa0': None,
 'EGP\xa0': None,
 'HUF\xa0': 0.0027327410464852895,
 'TRL\xa0': 7.418037250844135e-07,
 'PLN\xa0': 0.23038404413324848,
 'IRR\xa0': None,
 'CN¥': 0.1474430352244643,
 '¥': 0.007675125468297166,
 '₩': 0.0008093321407274897,
 'HK$': 0.12772014021220035,
 'BDT\xa0': None,
 'ITL\xa0': None,
 'CZK\xa0': 0.045480438971265816,
 'IDR\xa0': 6.678126950836611e-05,
 'ESP\xa0': None,
 'MYR\xa0': 0.2337667929727868,
 'MX$': 0.053068625582225096,
 'NZ$': 0.6485098250014932,
 'NLG\xa0': None,
 'SGD\xa0': 0.7573411452884147,
 'CHF\xa0': 1.080075599323585,
 'THB\xa0': 0.03049229127467775,
 'ARS\xa0': None,
 'PTE\xa0': None,
 'UAH\xa0': None,
 'ROL\xa0': 3.3560921454343604e-05

In [43]:
#  converting 'budget' column
def change(x):
    if not pd.isna(x):
        try:
            amount = int(re.sub(r'[^0-9]', '', x))
            return amount*rate[re.sub(r'\d+', '', x[:-12]).replace(',', '')]
        except:
            return None
    else:
        return None
df['budget'] = budget.apply(change)

In [44]:
df['budget'].info()

<class 'pandas.core.series.Series'>
Int64Index: 113643 entries, 0 to 113642
Series name: budget
Non-Null Count  Dtype  
--------------  -----  
20581 non-null  float64
dtypes: float64(1)
memory usage: 1.7 MB


# Convert `num_of_rating` to int

In [45]:
def convert_num__of_rating(x):
    conv = {'M': 10**6, 'K': 10**3}
    if not pd.isna(x):
        try:
            return float(x[:-1])*conv[x[-1]]
        except KeyError:
            return float(x)
    else:
        return None
df['num_of_rating'] = df['num_of_rating'].apply(convert_num__of_rating)
df['num_of_rating']

0         2700000.0
1         2700000.0
2         2400000.0
3         2100000.0
4         2100000.0
            ...    
113638        181.0
113639        181.0
113640        181.0
113641        181.0
113642        181.0
Name: num_of_rating, Length: 113643, dtype: float64

# Adjust `budget` for inflation

In [47]:
#  pip install cpi
import cpi
cpi.update()

In [78]:
import re
def extract_year_from_title(title):
    t = title.split(' ')
    if re.search(r'\(\d+\)', t[-1]):
        year = t[-1].strip('()')
        year = int(year)
        return year
    return None

In [82]:
#  adding year column to dataframe
df['year'] = df['title'].apply(extract_year_from_title)

In [155]:
df['inflated_budget'] = df.apply(lambda x: x.budget if x.year == 2023 else cpi.inflate(x.budget, int(x.year)) if x.year > 1912 else None, axis=1)

In [170]:
df[df.year < 1950].sort_values(by=['budget'], ascending=False).head(10)

Unnamed: 0,title,genre,runtime,certificate,rating,num_of_rating,budget,worldwide_gross,origin,cast,director,writer,producer,composer,cinematographer,editor,year,inflated_budget
7271,Duel in the Sun (1946),"Drama, Romance, Western",129 min,Passed,6.8,8900.0,8000000.0,"$20,428,771",United States,"Jennifer Jones,Joseph Cotten,Gregory Peck,Lion...","King Vidor,Otto Brower,William Dieterle,Sidney...","David O. Selznick,Niven Busch,Oliver H.P. Garr...",David O. Selznick,Dimitri Tiomkin,"Lee Garmes,Ray Rennahan,Harold Rosson",,1946.0,120063600.0
31728,Forever Amber (1947),"Adventure, Drama, Romance",138 min,Approved,6.5,1500.0,6000000.0,,United States,"Linda Darnell,Cornel Wilde,Richard Greene,Geor...","Otto Preminger,John M. Stahl","Philip Dunne,Ring Lardner Jr.,Jerome Cady,Kath...","William Perlberg,Darryl F. Zanuck",David Raksin,Leon Shamroy,Louis R. Loeffler,1947.0,78741260.0
20850,Unconquered (1947),"Adventure, Drama, History",146 min,Passed,6.8,2600.0,5000000.0,,United States,"Gary Cooper,Paulette Goddard,Howard Da Silva,B...",Cecil B. DeMille,"Charles Bennett,Fredric M. Frank,Jesse Lasky J...",Cecil B. DeMille,Victor Young,Ray Rennahan,Anne Bauchens,1947.0,65617710.0
10640,Life with Father (1947),"Comedy, Family",118 min,Passed,7.1,5800.0,4700000.0,,United States,"William Powell,Irene Dunne,Elizabeth Taylor,Ed...",Michael Curtiz,"Clarence Day,Donald Ogden Stewart,Howard Linds...","Robert Buckner,Jack L. Warner",Max Steiner,"J. Peverell Marley,William V. Skall",George Amy,1947.0,61680650.0
16446,Joan of Arc (1948),"Biography, Drama, War",145 min,Approved,6.4,3500.0,4600000.0,,United States,"Ingrid Bergman,Selena Royle,Robert Barrat,Jimm...",Victor Fleming,"Maxwell Anderson,Maxwell Anderson,Andrew Solt",Walter Wanger,Hugo Friedhofer,"Winton C. Hoch,William V. Skall,Joseph A. Vale...",Frank Sullivan,1948.0,55859460.0
24442,Captain from Castile (1947),"Adventure, Drama",140 min,Passed,6.8,2100.0,4500000.0,,United States,"Tyrone Power,Jean Peters,Cesar Romero,Lee J. C...",Henry King,"Lamar Trotti,Samuel Shellabarger,John Tucker B...","Lamar Trotti,Darryl F. Zanuck",Alfred Newman,"Arthur E. Arling,Charles G. Clarke,Joseph LaSh...",Barbara McLean,1947.0,59055940.0
27205,Prince of Foxes (1949),"Adventure, Drama, Romance",107 min,Approved,6.9,1800.0,4500000.0,,United States,"Tyrone Power,Orson Welles,Wanda Hendrix,Marina...",Henry King,"Milton Krims,Samuel Shellabarger",Sol C. Siegel,Alfred Newman,Leon Shamroy,Barbara McLean,1949.0,55333930.0
9738,The Three Musketeers (1948),"Action, Adventure, Drama",125 min,Not Rated,7.1,6500.0,4474000.0,,United States,"Lana Turner,Gene Kelly,June Allyson,Van Heflin...",George Sidney,"Alexandre Dumas,Robert Ardrey",Pandro S. Berman,Herbert Stothart,Robert H. Planck,"George Boemler,Robert Kern",1948.0,54329400.0
46430,Desire Me (1947),"Drama, Romance, War",91 min,Approved,6.0,837.0,4149000.0,,United States,"Greer Garson,Robert Mitchum,Richard Hart,Morri...","Jack Conway,George Cukor,Mervyn LeRoy,Victor S...","Marguerite Roberts,Zoe Akins,Casey Robinson,Le...",Arthur Hornblow Jr.,Herbert Stothart,Joseph Ruttenberg,Joseph Dervin,1947.0,54449580.0
27662,The Emperor Waltz (1948),"Comedy, Musical, Romance",106 min,Approved,6.0,1800.0,4070248.0,,United States,"Bing Crosby,Joan Fontaine,Roland Culver,Lucile...",Billy Wilder,"Charles Brackett,Billy Wilder",Charles Brackett,Victor Young,George Barnes,,1948.0,49426490.0


In [175]:
df['worldwide_gross'].apply(lambda x: x[0:1] if not pd.isna(x) else None).value_counts()

$    27468
₹       46
£        3
P        1
F        1
H        1
N        1
Name: worldwide_gross, dtype: int64