### Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Scraping IMDb
import re
import imdb
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup

In [None]:
# Dataset
df = pd.read_csv('./data/movies.csv')

### Functions

#### 1.1 --- check_non_numeric_values

In [None]:
def check_non_numeric_values(df, column):
    """Print the values of ``df[column]`` that cannot be parsed as numbers.

    Nothing is returned and ``df`` is not modified. When at least one
    non-numeric value exists, a two-column table (value, number of
    occurrences) is printed; otherwise a short notice is printed instead.
    Missing entries (NaN) are not listed, because ``value_counts`` skips them.

    Parameters:
    - df: pandas dataframe
    - column: str, name of the column to inspect
    """
    raw = df[column]

    # Entries that fail numeric parsing become NaN under errors='coerce'
    unparseable = pd.to_numeric(raw, errors='coerce').isna()

    # Occurrences of each offending value, most frequent first
    tally = raw[unparseable].value_counts()

    if tally.empty:
        print("No non numeric values in that column.")
        return

    report = pd.DataFrame({'Non-Numeric Value': tally.index,
                           'Count': tally.values})
    print(report)

#### 1.2 --- get_mean_median_for_column

In [None]:
def get_mean_median_for_column(df, col_name):
    '''
    Compute the mean and median of the non-zero numeric entries of a column.

    Side effect: the column is overwritten in ``df`` with its numeric version,
    in which every entry that could not be parsed as a number (and every
    missing entry) has been replaced by 0.

    Parameters:
    - df: pandas dataframe (modified in place, see above)
    - col_name: str, name of column to be processed

    Returns:
    - tuple (mean, median) of the entries of the cleaned column that differ from 0
    '''
    # Unparseable entries -> NaN -> 0, then write the cleaned column back
    cleaned = pd.to_numeric(df[col_name], errors='coerce').fillna(0)
    df[col_name] = cleaned

    # Zeros stand for "missing", so they are left out of the statistics
    kept = cleaned.loc[cleaned.ne(0)]

    return kept.mean(), kept.median()


#### 1.3 --- replace_missing_values

In [None]:
def replace_missing_values(df, col_name, stat='mean'):
    '''
    Fill the "missing" entries of a column with a summary statistic.

    An entry counts as missing when it is 0 or NaN. The statistic (mean or
    median) is computed from all remaining entries of the same column and
    written into every missing position. No rows are removed; the dataframe
    is modified in place and also returned.

    Parameters:
    - df: pandas dataframe
    - col_name: str, name of column to be processed
    - stat: str, either 'mean' or 'median', determines which statistic to use

    Returns:
    - df: the same pandas dataframe with the column filled

    Raises:
    - ValueError: if stat is neither 'mean' nor 'median'
    '''
    # Pick the reducer first so an invalid `stat` fails before touching the data
    if stat == 'mean':
        reducer = np.nanmean
    elif stat == 'median':
        reducer = np.nanmedian
    else:
        raise ValueError("stat must be either 'mean' or 'median'")

    column = df[col_name]
    is_missing = column.isnull() | (column == 0)

    # Statistic over the usable (non-zero, non-NaN) entries only
    stat_val = reducer(df.loc[~is_missing, col_name])

    df.loc[is_missing, col_name] = stat_val

    return df

#### 2.1 --- convert_to_usd

In [None]:
def convert_to_usd(amount):
    """Convert a money string such as ``'€1,500,000'`` into a float amount in USD.

    The string must start with one of the recognised currency markers
    ($, €, ¥, ₹, SEK, DKK, £). Spaces and non-breaking spaces anywhere in the
    string and thousands-separator commas are ignored. The exchange rates are
    fixed constants hard-coded below.

    Parameters:
    - amount: str, currency marker followed by the number

    Returns:
    - float: the amount in USD, or None when the string does not start with
      a recognised currency marker (e.g. 'Unknown')

    Raises:
    - ValueError: if the text after the currency marker is not a number
    """
    # str.replace returns a new string, so the result has to be re-bound;
    # otherwise inputs like ' $1,000' or '€1\xa0000' are never cleaned.
    amount = amount.replace(' ', '').replace('\xa0', '')

    # (currency marker, USD per one unit of that currency)
    usd_rates = (
        ('$', 1.0),      # USD
        ('€', 1.06),     # EUR
        ('¥', 0.0075),   # JPY
        ('₹', 0.012),    # INR
        ('SEK', 0.094),  # SEK
        ('DKK', 0.14),   # DKK
        ('£', 1.21),     # GBP
    )

    for marker, rate in usd_rates:
        if amount.startswith(marker):
            # Slice off exactly the leading marker (str.strip would remove a
            # *set of characters* from both ends), then drop the commas.
            number = amount[len(marker):].replace(',', '')
            return float(number) * rate

    return None

#### 2.2 --- adjust_for_inflation

In [None]:
def adjust_for_inflation(df, column_name, year_column, new_column, drop_original=True):
    """Add a column with the values of ``column_name`` adjusted for inflation.

    Each value is multiplied by the compounded annual rates of every year from
    the row's own year up to and including 2021, then rounded to two decimals.
    Rows whose value is NaN are left at 0.0 in the new column. Rows whose year
    is 2022 or later are copied unchanged (no rates are applied).

    Parameters:
    - df: pandas dataframe; the new column is added to it in place
    - column_name: str, column holding the amounts to adjust
    - year_column: str, column holding the year of each amount
    - new_column: str, name of the column that receives the adjusted amounts
    - drop_original: bool, drop ``column_name`` from the returned dataframe

    Returns:
    - pandas dataframe with the adjusted column (a new object when
      ``drop_original`` is True, otherwise ``df`` itself)

    Raises:
    - KeyError: if a row's year is earlier than 1990 (no rate available)
    """
    # Annual inflation rate in percent, keyed by year.
    # NOTE(review): source of these figures is not shown here - presumably
    # US CPI; confirm before reuse.
    data = {
        "1990": 5.398,
        "1991": 4.235,
        "1992": 3.0288,
        "1993": 2.9517,
        "1994": 2.6074,
        "1995": 2.8054,
        "1996": 2.9312,
        "1997": 2.3377,
        "1998": 1.5523,
        "1999": 2.188,
        "2000": 3.3769,
        "2001": 2.8262,
        "2002": 1.586,
        "2003": 2.2701,
        "2004": 2.6772,
        "2005": 3.3927,
        "2006": 3.2259,
        "2007": 2.8527,
        "2008": 3.8391,
        "2009": -0.3555,
        "2010": 1.64,
        "2011": 3.1568,
        "2012": 2.0693,
        "2013": 1.4648,
        "2014": 1.6222,
        "2015": 0.1186,
        "2016": 1.2616,
        "2017": 2.1301,
        "2018": 2.4426,
        "2019": 1.8122,
        "2020": 1.2336,
        "2021": 4.6979,
    }

    def _factor_from(start_year):
        """Compound the rates of start_year..2021 into one multiplier."""
        total = 1
        for yr in range(start_year, 2022):
            total *= 1 + (data[str(yr)] / 100)
        return total

    values = df[column_name]
    years = df[year_column]

    # Float column from the start: the adjusted values are floats, and writing
    # them into an integer column relies on implicit upcasting.
    df[new_column] = 0.0

    # Many rows share a year, so each multiplier is computed only once.
    factor_by_year = {}

    # Iterating the two columns directly (instead of iterrows) keeps each
    # column's own dtype; iterrows upcasts all-numeric rows to float, which
    # made range(year, ...) fail.
    for index, value, year in zip(df.index, values, years):
        if pd.isna(value):
            continue
        year = int(year)  # tolerate years stored as floats
        if year not in factor_by_year:
            factor_by_year[year] = _factor_from(year)
        df.at[index, new_column] = round(value * factor_by_year[year], 2)

    if drop_original:
        df = df.drop(columns=[column_name])

    return df


#### 3.1 --- one_hot_encoding_column

In [None]:
def one_hot_encoding_column(dataset, column, separator=", ", prefix=""):
    """
    Performs one-hot encoding on a column whose cells hold separator-joined values.

    Every distinct value found in the column becomes a new column holding how
    many times that value occurs in the row's cell (0 or 1 for cells without
    repeated values). Rows whose cell is missing get NaN in all new columns.
    The input dataset is not modified.

    dataset: The dataset to be processed.
    column: The name of the column to be one-hot encoded (string values).
    separator: The separator used in the values of the specified column. Defaults to ", ".
    prefix: Optional string to be added in front of each new column name. Defaults to "".
    returns: a new dataset with the encoded columns appended and `column` removed.
    """
    # 1. One row per individual value; the original index label is repeated
    #    for multi-valued cells. Missing cells are dropped explicitly so the
    #    result does not depend on how the pandas version treats NaN.
    single_values = dataset[column].str.split(separator).explode().dropna()

    # 2. Indicator columns per value, then collapse back to one row per
    #    original index label.
    value_subtable = pd.get_dummies(single_values)
    value_subtable = value_subtable.groupby(level=0).sum()

    # 3. Adding the prefix to the column names
    if prefix:
        value_subtable.columns = [prefix + str(col) for col in value_subtable.columns]

    # 4. Merging the subtable with the main dataset (left join keeps rows
    #    that had no values at all)
    dataset_processed = pd.merge(dataset, value_subtable, left_index=True, right_index=True, how='left')
    dataset_processed.drop(columns=[column], inplace=True)

    # 5. Returning the new dataset
    return dataset_processed

#### 3.2 --- one_hot_coding_binary

In [None]:
def one_hot_coding_binary(dataset, original_column, prefix, file_column, file_location, separator=", ", num_categories=1, drop_original=False):
    """
    Flag whether the first values of a separator-joined column appear in a reference list.

    The cell of `original_column` is split on `separator`; for each of the
    first `num_categories` positions a 0/1 column is added that tells whether
    the value at that position is listed in column `file_column` of the CSV at
    `file_location`. Positions that a row (or every row) does not have count
    as 0. The new columns are named `prefix` when `num_categories` is 1, and
    `prefix_1` ... `prefix_N` otherwise.

    dataset: The dataset to be processed; it is modified in place.
    original_column: Name of the column holding the separator-joined strings.
    prefix: Base name of the new indicator column(s).
    file_column: Column of the CSV file that holds the reference values.
    file_location: Path of the CSV file with the reference values.
    separator: Separator used inside `original_column`. Defaults to ", ".
    num_categories: How many leading positions to encode, 1 to 4. Defaults to 1.
    drop_original: Remove `original_column` afterwards. Defaults to False.
    returns: the same dataset object with the indicator column(s) added.
    raises: ValueError if num_categories is not between 1 and 4.
    """
    if num_categories not in range(1, 5):
        raise ValueError("num_categories must be between 1 and 4")

    reference_values = pd.read_csv(file_location)[file_column]

    # Split once; column k of `parts` holds the (k+1)-th value of each cell
    parts = dataset[original_column].str.split(separator, expand=True)

    indicators = {}
    for i in range(1, num_categories + 1):
        name = prefix if num_categories == 1 else f"{prefix}_{i}"
        if i - 1 in parts.columns:
            indicators[name] = parts[i - 1].isin(reference_values).astype(int)
        else:
            # No row has that many values, so nothing can match
            indicators[name] = 0

    # Drop before assigning so that prefix == original_column still works
    if drop_original:
        dataset.drop(columns=[original_column], inplace=True)

    for name, flags in indicators.items():
        dataset[name] = flags

    return dataset


#### 4.1 --- convert_imdb_str_to_int

In [None]:
def get_value(val):
    '''
    Converts a count string such as "85", "12K" or "1.5M" to an integer.

    A trailing K (thousand) or M (million), in either case, scales the number
    in front of it; the scaled result is rounded to the nearest integer so
    that the return type is an int for every input. Surrounding whitespace
    and thousands-separator commas are ignored.

    Parameters:
    - val: str, the text to convert

    Returns:
    - int: the numeric value

    Raises:
    - ValueError: if the text (without its suffix) is not a valid number
    '''
    multipliers = {'k': 1000, 'm': 1000000}

    text = val.strip().replace(',', '')

    # Slice (not index) so an empty string reaches int() and raises ValueError
    suffix = text[-1:].lower()

    if suffix in multipliers:
        # The part before the suffix may be fractional, e.g. "1.5M"
        return round(float(text[:-1]) * multipliers[suffix])

    return int(text)

#### 4.2 --- scraping_imdb_for_movie_info

In [None]:
def searching_IMDb_movie(title_column, dataset):
    """
    Updating the dataset by adding two new columns for each movie:
    '<title_column>_metascore' and '<title_column>_num_votes'.

    For every row the title is searched on IMDb, the first search hit is
    taken, and its title page and critic-reviews page are downloaded and
    parsed. Whenever a lookup, download or parsing step fails for a row, the
    affected value is stored as None and the loop moves on to the next row.
    Progress is printed per row.

    Parameters:
    title_column : the column containing the movie title
    dataset : dataset (modified in place)

    Returns:
    the same dataset with the two columns filled
    """
    def _fetch_soup(url):
        """Download `url` and return it parsed; the response is always closed."""
        request = Request(url=url, headers={'User-Agent': 'Mozilla/5.0'})
        with urlopen(request) as response:
            return BeautifulSoup(response.read(), 'html.parser')

    # 1. Instantiating imdb
    ia = imdb.IMDb()

    # 2. For each row of the df, look up the title in the given column
    for index, row in dataset.iterrows():
        movie_title = row[title_column]
        metascore = None
        num_votes = None

        try:
            # Search for a movie on IMDb by title; an empty result list
            # means there is nothing to scrape for this row
            search_hits = ia.search_movie(movie_title)
            if search_hits:
                movie_id = search_hits[0].getID()

                # Main title page (number of votes) and critic reviews page
                # (metascore) live at different URLs
                movie_soup = _fetch_soup(f'https://www.imdb.com/title/tt{movie_id}/')
                meta_soup = _fetch_soup(
                    f'https://www.imdb.com/title/tt{movie_id}/criticreviews/?ref_=tt_ov_rt'
                )

                # NOTE(review): the CSS class names below look auto-generated
                # and presumably change when IMDb redeploys - verify they
                # still match before a long run.

                # If metascore exists try to extract info
                try:
                    metascore_line = meta_soup.find('div', {'class': 'sc-79ae5a4-0 kUTYKi'}).text.split()
                    metascore = get_value(metascore_line[0].split('M')[0])
                except (AttributeError, IndexError, ValueError):
                    metascore = None

                # If number of votes exists try to extract info
                try:
                    num_votes_line = movie_soup.find('div', {'class': 'sc-e457ee34-3 frEfSL'}).text.split()
                    num_votes = get_value(num_votes_line[0])
                except (AttributeError, IndexError, ValueError):
                    num_votes = None

        except (imdb._exceptions.IMDbError, OSError):
            # OSError covers TimeoutError as well as urllib's URLError and
            # HTTPError; one unreachable page must not abort the whole scrape
            metascore = None
            num_votes = None

        # Make a new column and append the new data
        dataset.loc[index, f'{title_column}_metascore'] = metascore
        dataset.loc[index, f'{title_column}_num_votes'] = num_votes

        print(f'{index}. {movie_title}')

    return dataset

#### 4.3 --- scraping_imdb_for_person_info

In [None]:
def searching_IMDb_person(name_column, dataset, person_type):
    """
    Updating the dataset by adding four new columns for a director/actor:
    '<name_column>_num_wins', '<name_column>_num_nominations',
    '<name_column>_num_movies' and '<name_column>_avg_rating' (the mean IMDb
    rating of the counted movies, NaN when none were found).

    For every row the name is searched on IMDb and the first hit is used.
    Whenever a lookup, download or parsing step fails, the affected values
    fall back to None / no movies and the loop moves on to the next row.
    Progress is printed per row.

    Parameters:
    name_column : the column containing the person's name
    dataset : dataset (modified in place)
    person_type : specify if person is actor or director due to different
        search filters applied (inserted as `job_type` in the filmography URL)

    Returns:
    the same dataset with the four columns filled
    """
    def _fetch_soup(url):
        """Download `url` and return it parsed; the response is always closed."""
        request = Request(url=url, headers={'User-Agent': 'Mozilla/5.0'})
        with urlopen(request) as response:
            return BeautifulSoup(response.read(), 'html.parser')

    # 1. Instantiating imdb
    ia = imdb.IMDb()

    # 2. For each row of the df, extract the name for specific column
    for index, row in dataset.iterrows():
        name = row[name_column]
        person_id = None
        wins = None
        nominations = None
        ratings_list = []

        # Best-effort lookup. `Exception` (rather than a bare except) keeps
        # the loop tolerant of any lookup failure while still letting
        # Ctrl-C / SystemExit stop a long scrape.
        try:
            name_search = ia.search_person(name)
            if name_search:
                person_id = name_search[0].getID()
        except Exception:
            person_id = None

        if person_id is not None:
            # If awards and nominations exist then get data
            try:
                awards_soup = _fetch_soup(
                    f'https://www.imdb.com/name/nm{person_id}/awards/?ref_=nm_awd'
                )
                awards_line = awards_soup.find('div', {'class': 'desc'}).text.split()
                wins = get_value(awards_line[2])
                nominations = get_value(awards_line[5])
            except (AttributeError, IndexError, ValueError, OSError):
                wins = None
                nominations = None

            # Movie history
            try:
                movie_soup = _fetch_soup(
                    f'https://www.imdb.com/filmosearch/?explore=title_type&role=nm{person_id}&ref_=filmo_ref_job_typ&sort=release_date,asc&mode=detail&page=1&job_type={person_type}'
                )
                movies_list = movie_soup.find_all('div', {'class': 'lister-item mode-detail'})
            except OSError:
                movies_list = []

            for movie in movies_list:
                # A list item without a year or ratings block is skipped on
                # its own, so it cannot wipe the ratings collected so far
                try:
                    year_raw = movie.find('span', {'class': 'lister-item-year'}).text.strip('()')
                    year_match = re.match(r'^\d{4}$', year_raw)
                    # Only keep movie rating if movie pre-2023
                    if not (year_match and int(year_raw) < 2023):
                        continue
                    rating = movie.find('div', {'class': 'ratings-bar'}).find('div', {'class': 'inline-block ratings-imdb-rating'})
                    if rating is not None:
                        ratings_list.append(float(rating['data-value']))
                except (AttributeError, KeyError, ValueError):
                    continue

        # Create new columns and append data to appropriate column
        dataset.loc[index, f'{name_column}_num_wins'] = wins
        dataset.loc[index, f'{name_column}_num_nominations'] = nominations
        dataset.loc[index, f'{name_column}_num_movies'] = len(ratings_list)
        # np.mean([]) also yields NaN but emits a RuntimeWarning per row
        dataset.loc[index, f'{name_column}_avg_rating'] = np.mean(ratings_list) if ratings_list else np.nan

        print(f'{index}. {name}')

    return dataset

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Title              2000 non-null   object 
 1   Rating             1999 non-null   float64
 2   Year               2000 non-null   int64  
 3   Month              2000 non-null   object 
 4   Certificate        1966 non-null   object 
 5   Runtime            2000 non-null   object 
 6   Directors          2000 non-null   object 
 7   Stars              2000 non-null   object 
 8   Genre              2000 non-null   object 
 9   Filming_location   2000 non-null   object 
 10  Budget             2000 non-null   object 
 11  Income             2000 non-null   object 
 12  Country_of_origin  2000 non-null   object 
dtypes: float64(1), int64(1), object(11)
memory usage: 203.2+ KB


In [None]:
df.head()

Unnamed: 0,Title,Rating,Year,Month,Certificate,Runtime,Directors,Stars,Genre,Filming_location,Budget,Income,Country_of_origin
0,Avatar: The Way of Water,7.8,2022,December,PG-13,192,James Cameron,"Sam Worthington, Zoe Saldana, Sigourney Weaver...","Action, Adventure, Fantasy",New Zealand,"$350,000,000","$2,267,946,983",United States
1,Guillermo del Toro's Pinocchio,7.6,2022,December,PG,117,"Guillermo del Toro, Mark Gustafson","Ewan McGregor, David Bradley, Gregory Mann, Bu...","Animation, Drama, Family",USA,"$35,000,000","$108,967","United States, Mexico, France"
2,Bullet Train,7.3,2022,August,R,127,David Leitch,"Brad Pitt, Joey King, Aaron Taylor Johnson, Br...","Action, Comedy, Thriller",Japan,"$85,900,000","$239,268,602","Japan, United States"
3,The Banshees of Inisherin,7.8,2022,November,R,114,Martin McDonagh,"Colin Farrell, Brendan Gleeson, Kerry Condon, ...","Comedy, Drama",Ireland,Unknown,"$19,720,823","Ireland, United Kingdom, United States"
4,M3gan,6.4,2022,December,PG-13,102,Gerard Johnstone,"Jenna Davis, Amie Donald, Allison Williams, Vi...","Horror, Sci-Fi, Thriller",New Zealand,"$12,000,000","$171,253,910",United States


### 1.Title

In [None]:
rows_pre_title = len(df)
col_pre_title = df.shape[1]

#### 1.1 Unique / Duplicate values

In [None]:
num_unique_values = df['Title'].nunique()

In [None]:
print(num_unique_values)

1989


In [None]:
duplicates = df[df.duplicated(['Title'], keep=False)].sort_values(by=['Title'])

duplicates

Unnamed: 0,Title,Rating,Year,Month,Certificate,Runtime,Directors,Stars,Genre,Filming_location,Budget,Income,Country_of_origin
1610,Black Christmas,4.6,2006,December,R,95,Glen Morgan,"Michelle Trachtenberg, Mary Elizabeth Winstead...",Horror,Canada,"$9,000,000","$21,510,851","Canada, United States"
348,Black Christmas,3.5,2019,December,PG-13,92,Sophia Takal,"Imogen Poots, Aleyse Shannon, Lily Donoghue, B...","Horror, Mystery, Thriller",New Zealand,"$5,000,000","$18,529,730","United States, New Zealand, Canada"
1746,Fantastic Four,5.7,2005,July,PG-13,106,Tim Story,"Ioan Gruffudd, Michael Chiklis, Chris Evans, J...","Action, Adventure, Fantasy",Canada,"$100,000,000","$333,535,934","Germany, United States"
767,Fantastic Four,4.3,2015,August,PG-13,100,Josh Trank,"Miles Teller, Kate Mara, Michael B Jordan, Jam...","Action, Adventure, Sci-Fi",USA,"$120,000,000","$167,882,881","United States, Germany, United Kingdom"
1263,Frozen,6.2,2010,March,R,93,Adam Green,"Shawn Ashmore, Emma Bell, Kevin Zegers, Ed Ack...","Adventure, Drama, Mystery",USA,Unknown,"$3,843,774",United States
903,Frozen,7.4,2013,November,PG,102,"Chris Buck, Jennifer Lee","Kristen Bell, Idina Menzel, Jonathan Groff, Jo...","Animation, Adventure, Comedy",Norge,"$150,000,000","$1,304,550,716",United States
357,Hellboy,5.2,2019,April,R,120,Neil Marshall,"David Harbour, Milla Jovovich, Ian McShane, Sa...","Action, Adventure, Fantasy",Bulgaria,"$50,000,000","$55,065,289","United States, United Kingdom, Bulgaria, Canad..."
1843,Hellboy,6.8,2004,April,PG-13,122,Guillermo del Toro,"Ron Perlman, Doug Jones, Selma Blair, John Hurt","Action, Adventure, Fantasy",Czech Republic,"$66,000,000","$99,378,985",United States
1905,Oldboy,8.4,2003,November,R,120,Park Chan wook,"Choi Min sik, Yoo Ji tae, Kang Hye jeong, Kim ...","Action, Drama, Mystery",South Korea,"$3,000,000","$15,421,226",South Korea
920,Oldboy,5.8,2013,November,R,104,Spike Lee,"Josh Brolin, Elizabeth Olsen, Samuel L Jackson...","Action, Drama, Mystery",USA,"$30,000,000","$5,186,767",United States


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Title              2000 non-null   object 
 1   Rating             1999 non-null   float64
 2   Year               2000 non-null   int64  
 3   Month              2000 non-null   object 
 4   Certificate        1966 non-null   object 
 5   Runtime            2000 non-null   object 
 6   Directors          2000 non-null   object 
 7   Stars              2000 non-null   object 
 8   Genre              2000 non-null   object 
 9   Filming_location   2000 non-null   object 
 10  Budget             2000 non-null   object 
 11  Income             2000 non-null   object 
 12  Country_of_origin  2000 non-null   object 
dtypes: float64(1), int64(1), object(11)
memory usage: 203.2+ KB


In [None]:
rows_post_title = len(df)
col_post_title = df.shape[1]

### 2.Rating

In [None]:
rows_pre_rating = len(df)
col_pre_rating = df.shape[1]

#### 2.1 Missing values

In [None]:
#dropping all rows that have missing values in the column Rating
df = df.dropna(subset=['Rating'])

In [None]:
#A quick check to see that the row was removed from the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1999 entries, 0 to 1999
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Title              1999 non-null   object 
 1   Rating             1999 non-null   float64
 2   Year               1999 non-null   int64  
 3   Month              1999 non-null   object 
 4   Certificate        1965 non-null   object 
 5   Runtime            1999 non-null   object 
 6   Directors          1999 non-null   object 
 7   Stars              1999 non-null   object 
 8   Genre              1999 non-null   object 
 9   Filming_location   1999 non-null   object 
 10  Budget             1999 non-null   object 
 11  Income             1999 non-null   object 
 12  Country_of_origin  1999 non-null   object 
dtypes: float64(1), int64(1), object(11)
memory usage: 218.6+ KB


#### 2.2 Outliers

In [None]:
# Count how many values are between 0 and 10 (inclusive)
column_values = df['Rating']
count = column_values.between(0, 10).sum()

In [None]:
print(count)

1999


In [None]:
rows_post_rating = len(df)
col_post_rating = df.shape[1]

### 3.Year

In [None]:
rows_pre_year = len(df)
col_pre_year = df.shape[1]

#### 3.1 Outliers

In [None]:
# Count how many Year values lie between 2002 and 2023 (inclusive)
column_values = df['Year']
count = column_values.between(2002, 2023).sum()

In [None]:
print(count)

1999


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1999 entries, 0 to 1999
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Title              1999 non-null   object 
 1   Rating             1999 non-null   float64
 2   Year               1999 non-null   int64  
 3   Month              1999 non-null   object 
 4   Certificate        1965 non-null   object 
 5   Runtime            1999 non-null   object 
 6   Directors          1999 non-null   object 
 7   Stars              1999 non-null   object 
 8   Genre              1999 non-null   object 
 9   Filming_location   1999 non-null   object 
 10  Budget             1999 non-null   object 
 11  Income             1999 non-null   object 
 12  Country_of_origin  1999 non-null   object 
dtypes: float64(1), int64(1), object(11)
memory usage: 218.6+ KB


In [None]:
rows_post_year = len(df)
col_post_year = df.shape[1]

### 4.Month

In [None]:
rows_pre_month = len(df)
col_pre_month = df.shape[1]

#### 4.1 Unique values

In [None]:
# Check all unique values in the column Month
column_values = df['Month'].unique()
print(column_values)

['December' 'August' 'November' 'October' 'March' 'September' 'May'
 'April' 'January' 'July' 'June' 'February' '2014' '2008']


In [None]:
# Tally the entries of column_values that are not a calendar month name
count = len([value for value in column_values
             if value not in ['January', 'February', 'March', 'April', 'May', 'June',
                              'July', 'August', 'September', 'October', 'November', 'December']])
print(count)

2


In [None]:
# Remove every row whose Month is not a real month name
# The twelve accepted month names
valid_months = ['January', 'February', 'March', 'April', 'May', 'June', 'July',
                'August', 'September', 'October', 'November', 'December']
# Index labels of the offending rows are dropped from df in place
df.drop(index=df.index[~df['Month'].isin(valid_months)], inplace=True)

In [None]:
#Check that it has been dropped
column_values = df['Month'].unique()
print(column_values)

['December' 'August' 'November' 'October' 'March' 'September' 'May'
 'April' 'January' 'July' 'June' 'February']


In [None]:
#checking the rows have been dropped
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1997 entries, 0 to 1999
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Title              1997 non-null   object 
 1   Rating             1997 non-null   float64
 2   Year               1997 non-null   int64  
 3   Month              1997 non-null   object 
 4   Certificate        1964 non-null   object 
 5   Runtime            1997 non-null   object 
 6   Directors          1997 non-null   object 
 7   Stars              1997 non-null   object 
 8   Genre              1997 non-null   object 
 9   Filming_location   1997 non-null   object 
 10  Budget             1997 non-null   object 
 11  Income             1997 non-null   object 
 12  Country_of_origin  1997 non-null   object 
dtypes: float64(1), int64(1), object(11)
memory usage: 218.4+ KB


#### 4.5 Converting column to numeric

In [None]:
# Lookup table from month name to its calendar number (January = 1 ... December = 12)
month_to_int = {
    name: number
    for number, name in enumerate(
        ['January', 'February', 'March', 'April', 'May', 'June',
         'July', 'August', 'September', 'October', 'November', 'December'],
        start=1,
    )
}

# Translate the names and store the column as integers in one step
df['Month'] = df['Month'].map(month_to_int).astype(int)

In [None]:
# Checking that the month values have been replaced by 1-12 and the column converted to int
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1997 entries, 0 to 1999
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Title              1997 non-null   object 
 1   Rating             1997 non-null   float64
 2   Year               1997 non-null   int64  
 3   Month              1997 non-null   int64  
 4   Certificate        1964 non-null   object 
 5   Runtime            1997 non-null   object 
 6   Directors          1997 non-null   object 
 7   Stars              1997 non-null   object 
 8   Genre              1997 non-null   object 
 9   Filming_location   1997 non-null   object 
 10  Budget             1997 non-null   object 
 11  Income             1997 non-null   object 
 12  Country_of_origin  1997 non-null   object 
dtypes: float64(1), int64(2), object(10)
memory usage: 218.4+ KB


In [None]:
column_values = df['Month'].unique()
column_values.sort()
print(column_values)

[ 1  2  3  4  5  6  7  8  9 10 11 12]


In [None]:
rows_post_month = len(df)
col_post_month = df.shape[1]

### 5.Runtime

In [None]:
rows_pre_runtime = len(df)
col_pre_runtime = df.shape[1]

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1997 entries, 0 to 1999
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Title              1997 non-null   object 
 1   Rating             1997 non-null   float64
 2   Year               1997 non-null   int64  
 3   Month              1997 non-null   int64  
 4   Certificate        1964 non-null   object 
 5   Runtime            1997 non-null   object 
 6   Directors          1997 non-null   object 
 7   Stars              1997 non-null   object 
 8   Genre              1997 non-null   object 
 9   Filming_location   1997 non-null   object 
 10  Budget             1997 non-null   object 
 11  Income             1997 non-null   object 
 12  Country_of_origin  1997 non-null   object 
dtypes: float64(1), int64(2), object(10)
memory usage: 218.4+ KB


##### 5.1 Check non numeric

In [None]:
check_non_numeric_values(df, "Runtime")

  Non-Numeric Value  Count
0           Unknown      1


##### 5.2 Drop non numeric values

In [None]:
# Convert the 'Runtime' column to numeric values, turning non-numeric entries into NaN,
# then drop the rows where the conversion produced NaN
df['Runtime'] = pd.to_numeric(df['Runtime'], errors='coerce')
df = df.dropna(subset=['Runtime'])

In [None]:
#checking the type has been changed
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1996 entries, 0 to 1999
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Title              1996 non-null   object 
 1   Rating             1996 non-null   float64
 2   Year               1996 non-null   int64  
 3   Month              1996 non-null   int64  
 4   Certificate        1964 non-null   object 
 5   Runtime            1996 non-null   float64
 6   Directors          1996 non-null   object 
 7   Stars              1996 non-null   object 
 8   Genre              1996 non-null   object 
 9   Filming_location   1996 non-null   object 
 10  Budget             1996 non-null   object 
 11  Income             1996 non-null   object 
 12  Country_of_origin  1996 non-null   object 
dtypes: float64(2), int64(2), object(9)
memory usage: 218.3+ KB


#### 5.3 Outliers

In [None]:
# Count how many runtimes lie between 30 and 300 (both bounds inclusive).
# If the count equals the number of rows, no runtime falls outside that range.
column_values = df['Runtime']
count = column_values.between(30, 300).sum()

In [None]:
print(count)

1996


In [None]:
# Record the dataset size (rows, columns) after the Runtime step.
rows_post_runtime, col_post_runtime = df.shape

### 7.Directors

In [None]:
# Record the dataset size (rows, columns) before the Directors step.
rows_pre_directors, col_pre_directors = df.shape

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1996 entries, 0 to 1999
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Title              1996 non-null   object 
 1   Rating             1996 non-null   float64
 2   Year               1996 non-null   int64  
 3   Month              1996 non-null   int64  
 4   Certificate        1964 non-null   object 
 5   Runtime            1996 non-null   float64
 6   Directors          1996 non-null   object 
 7   Stars              1996 non-null   object 
 8   Genre              1996 non-null   object 
 9   Filming_location   1996 non-null   object 
 10  Budget             1996 non-null   object 
 11  Income             1996 non-null   object 
 12  Country_of_origin  1996 non-null   object 
dtypes: float64(2), int64(2), object(9)
memory usage: 218.3+ KB


#### 7.1 one_hot_encoding

In [None]:
# Encode the "Directors" column against the names listed in top_50_directors.csv.
# The call adds the columns <prefix_col>_1 .. <prefix_col>_<num_directors>
# (see the df.info() output below); the original column is kept.
num_directors = 2
prefix_col = "top_50_director"
df = one_hot_coding_binary(
    df,
    "Directors",
    prefix_col,
    "Name",
    "./data/top_50_directors.csv",
    num_categories=num_directors,
    drop_original=False,
)


In [None]:
df.head()

Unnamed: 0,Title,Rating,Year,Month,Certificate,Runtime,Directors,Stars,Genre,Filming_location,Budget,Income,Country_of_origin,top_50_director_1,top_50_director_2
0,Avatar: The Way of Water,7.8,2022,12,PG-13,192.0,James Cameron,"Sam Worthington, Zoe Saldana, Sigourney Weaver...","Action, Adventure, Fantasy",New Zealand,"$350,000,000","$2,267,946,983",United States,1,0
1,Guillermo del Toro's Pinocchio,7.6,2022,12,PG,117.0,"Guillermo del Toro, Mark Gustafson","Ewan McGregor, David Bradley, Gregory Mann, Bu...","Animation, Drama, Family",USA,"$35,000,000","$108,967","United States, Mexico, France",1,0
2,Bullet Train,7.3,2022,8,R,127.0,David Leitch,"Brad Pitt, Joey King, Aaron Taylor Johnson, Br...","Action, Comedy, Thriller",Japan,"$85,900,000","$239,268,602","Japan, United States",0,0
3,The Banshees of Inisherin,7.8,2022,11,R,114.0,Martin McDonagh,"Colin Farrell, Brendan Gleeson, Kerry Condon, ...","Comedy, Drama",Ireland,Unknown,"$19,720,823","Ireland, United Kingdom, United States",0,0
4,M3gan,6.4,2022,12,PG-13,102.0,Gerard Johnstone,"Jenna Davis, Amie Donald, Allison Williams, Vi...","Horror, Sci-Fi, Thriller",New Zealand,"$12,000,000","$171,253,910",United States,0,0


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1996 entries, 0 to 1999
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Title              1996 non-null   object 
 1   Rating             1996 non-null   float64
 2   Year               1996 non-null   int64  
 3   Month              1996 non-null   int64  
 4   Certificate        1964 non-null   object 
 5   Runtime            1996 non-null   float64
 6   Directors          1996 non-null   object 
 7   Stars              1996 non-null   object 
 8   Genre              1996 non-null   object 
 9   Filming_location   1996 non-null   object 
 10  Budget             1996 non-null   object 
 11  Income             1996 non-null   object 
 12  Country_of_origin  1996 non-null   object 
 13  top_50_director_1  1996 non-null   int64  
 14  top_50_director_2  1996 non-null   int64  
dtypes: float64(2), int64(4), object(9)
memory usage: 249.5+ KB


In [None]:
# Calculate and print the sum of each new director indicator column.
# Columns that do not exist in the dataframe are skipped.
director_cols = [prefix_col + "_" + str(i) for i in range(1, num_directors + 1)]
sums = [(col_name, df[col_name].sum()) for col_name in director_cols if col_name in df.columns]

# The sums were previously computed but never shown; print them as the Stars section does.
print(sums)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1996 entries, 0 to 1999
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Title              1996 non-null   object 
 1   Rating             1996 non-null   float64
 2   Year               1996 non-null   int64  
 3   Month              1996 non-null   int64  
 4   Certificate        1964 non-null   object 
 5   Runtime            1996 non-null   float64
 6   Directors          1996 non-null   object 
 7   Stars              1996 non-null   object 
 8   Genre              1996 non-null   object 
 9   Filming_location   1996 non-null   object 
 10  Budget             1996 non-null   object 
 11  Income             1996 non-null   object 
 12  Country_of_origin  1996 non-null   object 
 13  top_50_director_1  1996 non-null   int64  
 14  top_50_director_2  1996 non-null   int64  
dtypes: float64(2), int64(4), object(9)
memory usage: 249.5+ KB


In [None]:
# Record the dataset size (rows, columns) after the Directors step.
rows_post_directors, col_post_directors = df.shape

### 8.Stars

In [None]:
# Record the dataset size (rows, columns) before the Stars step.
rows_pre_stars, col_pre_stars = df.shape

In [None]:
df.head(2)

Unnamed: 0,Title,Rating,Year,Month,Certificate,Runtime,Directors,Stars,Genre,Filming_location,Budget,Income,Country_of_origin,top_50_director_1,top_50_director_2
0,Avatar: The Way of Water,7.8,2022,12,PG-13,192.0,James Cameron,"Sam Worthington, Zoe Saldana, Sigourney Weaver...","Action, Adventure, Fantasy",New Zealand,"$350,000,000","$2,267,946,983",United States,1,0
1,Guillermo del Toro's Pinocchio,7.6,2022,12,PG,117.0,"Guillermo del Toro, Mark Gustafson","Ewan McGregor, David Bradley, Gregory Mann, Bu...","Animation, Drama, Family",USA,"$35,000,000","$108,967","United States, Mexico, France",1,0


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1996 entries, 0 to 1999
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Title              1996 non-null   object 
 1   Rating             1996 non-null   float64
 2   Year               1996 non-null   int64  
 3   Month              1996 non-null   int64  
 4   Certificate        1964 non-null   object 
 5   Runtime            1996 non-null   float64
 6   Directors          1996 non-null   object 
 7   Stars              1996 non-null   object 
 8   Genre              1996 non-null   object 
 9   Filming_location   1996 non-null   object 
 10  Budget             1996 non-null   object 
 11  Income             1996 non-null   object 
 12  Country_of_origin  1996 non-null   object 
 13  top_50_director_1  1996 non-null   int64  
 14  top_50_director_2  1996 non-null   int64  
dtypes: float64(2), int64(4), object(9)
memory usage: 249.5+ KB


#### 8.1 one_hot_encoding

In [None]:
# Encode the "Stars" column against the names listed in top_1000_actors.csv.
# The call adds the columns <prefix_col>_1 .. <prefix_col>_<num_stars>
# (see the df.head() output below); the original column is kept.
num_stars = 4
prefix_col = "top_1000_Stars"
df = one_hot_coding_binary(
    df,
    "Stars",
    prefix_col,
    "Name",
    "./data/top_1000_actors.csv",
    num_categories=num_stars,
    drop_original=False,
)


In [None]:
df.head(2)

Unnamed: 0,Title,Rating,Year,Month,Certificate,Runtime,Directors,Stars,Genre,Filming_location,Budget,Income,Country_of_origin,top_50_director_1,top_50_director_2,top_1000_Stars_1,top_1000_Stars_2,top_1000_Stars_3,top_1000_Stars_4
0,Avatar: The Way of Water,7.8,2022,12,PG-13,192.0,James Cameron,"Sam Worthington, Zoe Saldana, Sigourney Weaver...","Action, Adventure, Fantasy",New Zealand,"$350,000,000","$2,267,946,983",United States,1,0,1,1,1,0
1,Guillermo del Toro's Pinocchio,7.6,2022,12,PG,117.0,"Guillermo del Toro, Mark Gustafson","Ewan McGregor, David Bradley, Gregory Mann, Bu...","Animation, Drama, Family",USA,"$35,000,000","$108,967","United States, Mexico, France",1,0,1,0,0,0


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1996 entries, 0 to 1999
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Title              1996 non-null   object 
 1   Rating             1996 non-null   float64
 2   Year               1996 non-null   int64  
 3   Month              1996 non-null   int64  
 4   Certificate        1964 non-null   object 
 5   Runtime            1996 non-null   float64
 6   Directors          1996 non-null   object 
 7   Stars              1996 non-null   object 
 8   Genre              1996 non-null   object 
 9   Filming_location   1996 non-null   object 
 10  Budget             1996 non-null   object 
 11  Income             1996 non-null   object 
 12  Country_of_origin  1996 non-null   object 
 13  top_50_director_1  1996 non-null   int64  
 14  top_50_director_2  1996 non-null   int64  
 15  top_1000_Stars_1   1996 non-null   int64  
 16  top_1000_Stars_2   1996 

In [None]:
# Calculate and print the sum of each new star indicator column.
# Columns that do not exist in the dataframe are skipped.
star_cols = [prefix_col + "_" + str(i) for i in range(1, num_stars + 1)]
sums = [(name, df[name].sum()) for name in star_cols if name in df.columns]

print(sums)

[('top_1000_Stars_1', 1296), ('top_1000_Stars_2', 1019), ('top_1000_Stars_3', 825), ('top_1000_Stars_4', 578)]


In [None]:
# Record the dataset size (rows, columns) after the Stars step.
rows_post_stars, col_post_stars = df.shape

### 9.Genre

In [None]:
# Record the dataset size (rows, columns) before the Genre step.
rows_pre_genre, col_pre_genre = df.shape

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1996 entries, 0 to 1999
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Title              1996 non-null   object 
 1   Rating             1996 non-null   float64
 2   Year               1996 non-null   int64  
 3   Month              1996 non-null   int64  
 4   Certificate        1964 non-null   object 
 5   Runtime            1996 non-null   float64
 6   Directors          1996 non-null   object 
 7   Stars              1996 non-null   object 
 8   Genre              1996 non-null   object 
 9   Filming_location   1996 non-null   object 
 10  Budget             1996 non-null   object 
 11  Income             1996 non-null   object 
 12  Country_of_origin  1996 non-null   object 
 13  top_50_director_1  1996 non-null   int64  
 14  top_50_director_2  1996 non-null   int64  
 15  top_1000_Stars_1   1996 non-null   int64  
 16  top_1000_Stars_2   1996 

#### 9.1 - one_hot_encoding

In [None]:
# One-hot encode "Genre": the output below shows genre_* indicator columns
# replacing the original Genre column.
df = one_hot_encoding_column(df, "Genre", prefix="genre_")

In [None]:
#checking new dataset
df.info()
df.head(2)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1996 entries, 0 to 1999
Data columns (total 38 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Title              1996 non-null   object 
 1   Rating             1996 non-null   float64
 2   Year               1996 non-null   int64  
 3   Month              1996 non-null   int64  
 4   Certificate        1964 non-null   object 
 5   Runtime            1996 non-null   float64
 6   Directors          1996 non-null   object 
 7   Stars              1996 non-null   object 
 8   Filming_location   1996 non-null   object 
 9   Budget             1996 non-null   object 
 10  Income             1996 non-null   object 
 11  Country_of_origin  1996 non-null   object 
 12  top_50_director_1  1996 non-null   int64  
 13  top_50_director_2  1996 non-null   int64  
 14  top_1000_Stars_1   1996 non-null   int64  
 15  top_1000_Stars_2   1996 non-null   int64  
 16  top_1000_Stars_3   1996 

Unnamed: 0,Title,Rating,Year,Month,Certificate,Runtime,Directors,Stars,Filming_location,Budget,...,genre_Horror,genre_Music,genre_Musical,genre_Mystery,genre_Romance,genre_Sci-Fi,genre_Sport,genre_Thriller,genre_War,genre_Western
0,Avatar: The Way of Water,7.8,2022,12,PG-13,192.0,James Cameron,"Sam Worthington, Zoe Saldana, Sigourney Weaver...",New Zealand,"$350,000,000",...,0,0,0,0,0,0,0,0,0,0
1,Guillermo del Toro's Pinocchio,7.6,2022,12,PG,117.0,"Guillermo del Toro, Mark Gustafson","Ewan McGregor, David Bradley, Gregory Mann, Bu...",USA,"$35,000,000",...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Record the dataset size (rows, columns) after the Genre step.
rows_post_genre, col_post_genre = df.shape

### 10.Filming_location

In [None]:
# Record the dataset size (rows, columns) before the Filming_location step.
rows_pre_filming_location, col_pre_filming_location = df.shape

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1996 entries, 0 to 1999
Data columns (total 38 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Title              1996 non-null   object 
 1   Rating             1996 non-null   float64
 2   Year               1996 non-null   int64  
 3   Month              1996 non-null   int64  
 4   Certificate        1964 non-null   object 
 5   Runtime            1996 non-null   float64
 6   Directors          1996 non-null   object 
 7   Stars              1996 non-null   object 
 8   Filming_location   1996 non-null   object 
 9   Budget             1996 non-null   object 
 10  Income             1996 non-null   object 
 11  Country_of_origin  1996 non-null   object 
 12  top_50_director_1  1996 non-null   int64  
 13  top_50_director_2  1996 non-null   int64  
 14  top_1000_Stars_1   1996 non-null   int64  
 15  top_1000_Stars_2   1996 non-null   int64  
 16  top_1000_Stars_3   1996 

In [None]:
df.head(2)

Unnamed: 0,Title,Rating,Year,Month,Certificate,Runtime,Directors,Stars,Filming_location,Budget,...,genre_Horror,genre_Music,genre_Musical,genre_Mystery,genre_Romance,genre_Sci-Fi,genre_Sport,genre_Thriller,genre_War,genre_Western
0,Avatar: The Way of Water,7.8,2022,12,PG-13,192.0,James Cameron,"Sam Worthington, Zoe Saldana, Sigourney Weaver...",New Zealand,"$350,000,000",...,0,0,0,0,0,0,0,0,0,0
1,Guillermo del Toro's Pinocchio,7.6,2022,12,PG,117.0,"Guillermo del Toro, Mark Gustafson","Ewan McGregor, David Bradley, Gregory Mann, Bu...",USA,"$35,000,000",...,0,0,0,0,0,0,0,0,0,0


In [None]:
# 10.1 Unique values: how many distinct filming locations exist and how often
# each one occurs, most frequent first.
location_counts = df['Filming_location'].value_counts()
column_values = location_counts.sort_values(ascending=False)
print(len(column_values))
print(column_values)

97
USA             904
Canada          208
UK              177
Unknown          75
Australia        44
               ... 
Ontario           1
Uganda            1
Malaysia          1
Saudi Arabia      1
Ecuador           1
Name: Filming_location, Length: 97, dtype: int64


In [None]:
# There are 75 movies with an Unknown filming location and 97 unique filming locations. How could the unknown values be replaced?
# We discussed whether most movies are being shot with a green screen anyway.
# Maybe remove the whole column?

#### 10.2 - Drop the column

In [None]:
# Remove the Filming_location column from the dataset.
df = df.drop(columns=["Filming_location"])

In [None]:
#check that it has been removed
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1996 entries, 0 to 1999
Data columns (total 37 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Title              1996 non-null   object 
 1   Rating             1996 non-null   float64
 2   Year               1996 non-null   int64  
 3   Month              1996 non-null   int64  
 4   Certificate        1964 non-null   object 
 5   Runtime            1996 non-null   float64
 6   Directors          1996 non-null   object 
 7   Stars              1996 non-null   object 
 8   Budget             1996 non-null   object 
 9   Income             1996 non-null   object 
 10  Country_of_origin  1996 non-null   object 
 11  top_50_director_1  1996 non-null   int64  
 12  top_50_director_2  1996 non-null   int64  
 13  top_1000_Stars_1   1996 non-null   int64  
 14  top_1000_Stars_2   1996 non-null   int64  
 15  top_1000_Stars_3   1996 non-null   int64  
 16  top_1000_Stars_4   1996 

In [None]:
# Record the dataset size (rows, columns) after the Filming_location step.
rows_post_filming_location, col_post_filming_location = df.shape

### 11.Budget / 12.Income

In [None]:
# Budget and Income are processed together, so both share the same starting size.
rows_pre_budget, col_pre_budget = df.shape
rows_pre_income, col_pre_income = df.shape

In [None]:
df.head(4)

Unnamed: 0,Title,Rating,Year,Month,Certificate,Runtime,Directors,Stars,Budget,Income,...,genre_Horror,genre_Music,genre_Musical,genre_Mystery,genre_Romance,genre_Sci-Fi,genre_Sport,genre_Thriller,genre_War,genre_Western
0,Avatar: The Way of Water,7.8,2022,12,PG-13,192.0,James Cameron,"Sam Worthington, Zoe Saldana, Sigourney Weaver...","$350,000,000","$2,267,946,983",...,0,0,0,0,0,0,0,0,0,0
1,Guillermo del Toro's Pinocchio,7.6,2022,12,PG,117.0,"Guillermo del Toro, Mark Gustafson","Ewan McGregor, David Bradley, Gregory Mann, Bu...","$35,000,000","$108,967",...,0,0,0,0,0,0,0,0,0,0
2,Bullet Train,7.3,2022,8,R,127.0,David Leitch,"Brad Pitt, Joey King, Aaron Taylor Johnson, Br...","$85,900,000","$239,268,602",...,0,0,0,0,0,0,0,1,0,0
3,The Banshees of Inisherin,7.8,2022,11,R,114.0,Martin McDonagh,"Colin Farrell, Brendan Gleeson, Kerry Condon, ...",Unknown,"$19,720,823",...,0,0,0,0,0,0,0,0,0,0


#### 1.1 convert to USD and strip of non numeric characters

In [None]:
# Run convert_to_usd over every value of both money columns.
for money_col in ('Budget', 'Income'):
    df[money_col] = df[money_col].apply(convert_to_usd)

In [None]:
#check the change
df.head(4)

Unnamed: 0,Title,Rating,Year,Month,Certificate,Runtime,Directors,Stars,Budget,Income,...,genre_Horror,genre_Music,genre_Musical,genre_Mystery,genre_Romance,genre_Sci-Fi,genre_Sport,genre_Thriller,genre_War,genre_Western
0,Avatar: The Way of Water,7.8,2022,12,PG-13,192.0,James Cameron,"Sam Worthington, Zoe Saldana, Sigourney Weaver...",350000000.0,2267947000.0,...,0,0,0,0,0,0,0,0,0,0
1,Guillermo del Toro's Pinocchio,7.6,2022,12,PG,117.0,"Guillermo del Toro, Mark Gustafson","Ewan McGregor, David Bradley, Gregory Mann, Bu...",35000000.0,108967.0,...,0,0,0,0,0,0,0,0,0,0
2,Bullet Train,7.3,2022,8,R,127.0,David Leitch,"Brad Pitt, Joey King, Aaron Taylor Johnson, Br...",85900000.0,239268600.0,...,0,0,0,0,0,0,0,1,0,0
3,The Banshees of Inisherin,7.8,2022,11,R,114.0,Martin McDonagh,"Colin Farrell, Brendan Gleeson, Kerry Condon, ...",,19720820.0,...,0,0,0,0,0,0,0,0,0,0


#### 1.2 --- calc with inflation

In [None]:
# Create the inflation-adjusted columns Budget_inf / Income_inf (based on "Year")
# and drop the unadjusted originals.
for source_col, adjusted_col in (("Budget", "Budget_inf"), ("Income", "Income_inf")):
    df = adjust_for_inflation(df, source_col, "Year", adjusted_col, drop_original=True)

In [None]:
df.head(4)

Unnamed: 0,Title,Rating,Year,Month,Certificate,Runtime,Directors,Stars,Country_of_origin,top_50_director_1,...,genre_Musical,genre_Mystery,genre_Romance,genre_Sci-Fi,genre_Sport,genre_Thriller,genre_War,genre_Western,Budget_inf,Income_inf
0,Avatar: The Way of Water,7.8,2022,12,PG-13,192.0,James Cameron,"Sam Worthington, Zoe Saldana, Sigourney Weaver...",United States,1,...,0,0,0,0,0,0,0,0,350000000.0,2267947000.0
1,Guillermo del Toro's Pinocchio,7.6,2022,12,PG,117.0,"Guillermo del Toro, Mark Gustafson","Ewan McGregor, David Bradley, Gregory Mann, Bu...","United States, Mexico, France",1,...,0,0,0,0,0,0,0,0,35000000.0,108967.0
2,Bullet Train,7.3,2022,8,R,127.0,David Leitch,"Brad Pitt, Joey King, Aaron Taylor Johnson, Br...","Japan, United States",0,...,0,0,0,0,0,1,0,0,85900000.0,239268600.0
3,The Banshees of Inisherin,7.8,2022,11,R,114.0,Martin McDonagh,"Colin Farrell, Brendan Gleeson, Kerry Condon, ...","Ireland, United Kingdom, United States",0,...,0,0,0,0,0,0,0,0,0.0,19720820.0


#### 1.3 --- Outliers

In [None]:
# Find rows whose budget is present (> 0) but below 50,000.
low_budget_mask = (df['Budget_inf'] > 0) & (df['Budget_inf'] < 50000)
count = low_budget_mask.sum()
print(count)

# Keep the matching rows so they can be inspected here and dropped in the next cell.
filtered_df = df[low_budget_mask]
filtered_df.head()

6


Unnamed: 0,Title,Rating,Year,Month,Certificate,Runtime,Directors,Stars,Country_of_origin,top_50_director_1,...,genre_Musical,genre_Mystery,genre_Romance,genre_Sci-Fi,genre_Sport,genre_Thriller,genre_War,genre_Western,Budget_inf,Income_inf
477,The VelociPastor,5.0,2018,8,Not Rated,75.0,Brendan Steere,"Greg Cohan, George Schewnzer, Janice Young, Da...","United States, China",0,...,0,0,0,0,0,0,0,0,12160.06,0.0
487,Trautmann,7.2,2018,10,,120.0,Marcus H Rosenm ller,"David Kross, Freya Mavor, John Henshaw, Harry ...","United Kingdom, Germany",0,...,0,0,1,0,0,0,0,0,12.89,2118630.0
605,Terrifier,5.6,2016,3,Unrated,85.0,Damien Leone,"Jenna Kanell, Samantha Scaffidi, David Howard ...",United States,0,...,0,0,0,0,0,1,0,0,40013.79,87316.94
995,Banshee,8.5,2013,9,,50.0,Gemma Mc Carthy,"Jonathan O Dwyer, Sean Flood, Frank Hurley, Fi...",Ireland,0,...,0,0,0,0,0,0,0,0,375.31,0.0
1557,Paranormal Activity,6.3,2007,10,R,86.0,Oren Peli,"Katie Featherston, Micah Sloat, Mark Fredrichs...",United States,0,...,0,1,0,0,0,0,0,0,20162.26,259899300.0


In [None]:
# Drop rows
# Remove, in place, the low-budget rows selected into filtered_df in the previous cell.
df.drop(filtered_df.index, inplace=True)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1990 entries, 0 to 1999
Data columns (total 37 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Title              1990 non-null   object 
 1   Rating             1990 non-null   float64
 2   Year               1990 non-null   int64  
 3   Month              1990 non-null   int64  
 4   Certificate        1960 non-null   object 
 5   Runtime            1990 non-null   float64
 6   Directors          1990 non-null   object 
 7   Stars              1990 non-null   object 
 8   Country_of_origin  1990 non-null   object 
 9   top_50_director_1  1990 non-null   int64  
 10  top_50_director_2  1990 non-null   int64  
 11  top_1000_Stars_1   1990 non-null   int64  
 12  top_1000_Stars_2   1990 non-null   int64  
 13  top_1000_Stars_3   1990 non-null   int64  
 14  top_1000_Stars_4   1990 non-null   int64  
 15  genre_Action       1990 non-null   uint8  
 16  genre_Adventure    1990 

In [None]:
# Find rows whose income is present (> 0) but below 50,000.
low_income_mask = (df['Income_inf'] > 0) & (df['Income_inf'] < 50000)
count = low_income_mask.sum()
print(count)

# Keep the matching rows so they can be inspected here and dropped in the next cell.
filtered_df = df[low_income_mask]
filtered_df.head()

21


Unnamed: 0,Title,Rating,Year,Month,Certificate,Runtime,Directors,Stars,Country_of_origin,top_50_director_1,...,genre_Musical,genre_Mystery,genre_Romance,genre_Sci-Fi,genre_Sport,genre_Thriller,genre_War,genre_Western,Budget_inf,Income_inf
26,Pinocchio,5.1,2022,9,PG,105.0,Robert Zemeckis,"Joseph Gordon Levitt, Tom Hanks, Benjamin Evan...",United States,1,...,0,0,0,0,0,0,0,0,150000000.0,37353.0
80,Bardo: False Chronicle of a Handful of Truths,7.0,2022,12,R,159.0,Alejandro G I rritu,"Daniel Gim nez Cacho, Griselda Siciliani, Xime...",Mexico,0,...,0,0,0,0,0,0,0,0,0.0,38190.0
83,The Good Nurse,6.8,2022,10,R,121.0,Tobias Lindholm,"Eddie Redmayne, Jessica Chastain, Denise Pillo...",United States,0,...,0,0,0,0,0,0,0,0,0.0,14943.0
101,Farha,8.4,2021,12,TV-14,92.0,Darin J Sallam,"Karam Taher, Ashraf Barhom, Ali Suliman, Tala ...","Jordan, Saudi Arabia, Sweden",0,...,0,0,0,0,0,0,0,0,0.0,736.03
151,Father Christmas Is Back,4.5,2021,11,PG-13,105.0,"Mick Davis, Philippe Martinez","Elizabeth Hurley, John Cleese, Kelsey Grammer,...",United Kingdom,0,...,0,0,0,0,0,0,0,0,0.0,6021.18


In [None]:
# drop rows
# Remove, in place, the low-income rows selected into filtered_df in the previous cell.
df.drop(filtered_df.index, inplace=True)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1969 entries, 0 to 1999
Data columns (total 37 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Title              1969 non-null   object 
 1   Rating             1969 non-null   float64
 2   Year               1969 non-null   int64  
 3   Month              1969 non-null   int64  
 4   Certificate        1939 non-null   object 
 5   Runtime            1969 non-null   float64
 6   Directors          1969 non-null   object 
 7   Stars              1969 non-null   object 
 8   Country_of_origin  1969 non-null   object 
 9   top_50_director_1  1969 non-null   int64  
 10  top_50_director_2  1969 non-null   int64  
 11  top_1000_Stars_1   1969 non-null   int64  
 12  top_1000_Stars_2   1969 non-null   int64  
 13  top_1000_Stars_3   1969 non-null   int64  
 14  top_1000_Stars_4   1969 non-null   int64  
 15  genre_Action       1969 non-null   uint8  
 16  genre_Adventure    1969 

In [None]:
#checking the change
df.head(4)

Unnamed: 0,Title,Rating,Year,Month,Certificate,Runtime,Directors,Stars,Country_of_origin,top_50_director_1,...,genre_Musical,genre_Mystery,genre_Romance,genre_Sci-Fi,genre_Sport,genre_Thriller,genre_War,genre_Western,Budget_inf,Income_inf
0,Avatar: The Way of Water,7.8,2022,12,PG-13,192.0,James Cameron,"Sam Worthington, Zoe Saldana, Sigourney Weaver...",United States,1,...,0,0,0,0,0,0,0,0,350000000.0,2267947000.0
1,Guillermo del Toro's Pinocchio,7.6,2022,12,PG,117.0,"Guillermo del Toro, Mark Gustafson","Ewan McGregor, David Bradley, Gregory Mann, Bu...","United States, Mexico, France",1,...,0,0,0,0,0,0,0,0,35000000.0,108967.0
2,Bullet Train,7.3,2022,8,R,127.0,David Leitch,"Brad Pitt, Joey King, Aaron Taylor Johnson, Br...","Japan, United States",0,...,0,0,0,0,0,1,0,0,85900000.0,239268600.0
3,The Banshees of Inisherin,7.8,2022,11,R,114.0,Martin McDonagh,"Colin Farrell, Brendan Gleeson, Kerry Condon, ...","Ireland, United Kingdom, United States",0,...,0,0,0,0,0,0,0,0,0.0,19720820.0


### Notice -- The function to calculate inflation has turned all NaN into 0. (Had to be done to be able to calculate)

#### 1.4 --- Missing Values

In [None]:
# Missing values are stored as 0 rather than NaN at this point, so every 0 is
# counted as a missing value.
budget_is_zero = df['Budget_inf'] == 0
income_is_zero = df['Income_inf'] == 0

# Zeros in Budget_inf, in Income_inf, and in both columns at once.
count_col1 = budget_is_zero.sum()
count_col2 = income_is_zero.sum()
count_both = (budget_is_zero & income_is_zero).sum()

# Report the three counts.
print('Number of zeros in col1:', count_col1)
print('Number of zeros in col2:', count_col2)
print('Number of zeros in both col1 and col2:', count_both)

Number of zeros in col1: 299
Number of zeros in col2: 140
Number of zeros in both col1 and col2: 100


#### 1.5 --- Dropping rows

In [None]:
# Drop rows where both Budget_inf and Income_inf are 0, i.e. keep every row that
# has at least one of the two amounts.
# .copy() makes the result an independent DataFrame, so the new columns assigned
# to it in the following cells never write into a slice of the old frame
# (the situation pandas reports as SettingWithCopyWarning).
df = df[(df['Budget_inf'] != 0) | (df['Income_inf'] != 0)].copy()

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1869 entries, 0 to 1999
Data columns (total 37 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Title              1869 non-null   object 
 1   Rating             1869 non-null   float64
 2   Year               1869 non-null   int64  
 3   Month              1869 non-null   int64  
 4   Certificate        1849 non-null   object 
 5   Runtime            1869 non-null   float64
 6   Directors          1869 non-null   object 
 7   Stars              1869 non-null   object 
 8   Country_of_origin  1869 non-null   object 
 9   top_50_director_1  1869 non-null   int64  
 10  top_50_director_2  1869 non-null   int64  
 11  top_1000_Stars_1   1869 non-null   int64  
 12  top_1000_Stars_2   1869 non-null   int64  
 13  top_1000_Stars_3   1869 non-null   int64  
 14  top_1000_Stars_4   1869 non-null   int64  
 15  genre_Action       1869 non-null   uint8  
 16  genre_Adventure    1869 

In [None]:
df.head()

Unnamed: 0,Title,Rating,Year,Month,Certificate,Runtime,Directors,Stars,Country_of_origin,top_50_director_1,...,genre_Musical,genre_Mystery,genre_Romance,genre_Sci-Fi,genre_Sport,genre_Thriller,genre_War,genre_Western,Budget_inf,Income_inf
0,Avatar: The Way of Water,7.8,2022,12,PG-13,192.0,James Cameron,"Sam Worthington, Zoe Saldana, Sigourney Weaver...",United States,1,...,0,0,0,0,0,0,0,0,350000000.0,2267947000.0
1,Guillermo del Toro's Pinocchio,7.6,2022,12,PG,117.0,"Guillermo del Toro, Mark Gustafson","Ewan McGregor, David Bradley, Gregory Mann, Bu...","United States, Mexico, France",1,...,0,0,0,0,0,0,0,0,35000000.0,108967.0
2,Bullet Train,7.3,2022,8,R,127.0,David Leitch,"Brad Pitt, Joey King, Aaron Taylor Johnson, Br...","Japan, United States",0,...,0,0,0,0,0,1,0,0,85900000.0,239268600.0
3,The Banshees of Inisherin,7.8,2022,11,R,114.0,Martin McDonagh,"Colin Farrell, Brendan Gleeson, Kerry Condon, ...","Ireland, United Kingdom, United States",0,...,0,0,0,0,0,0,0,0,0.0,19720820.0
4,M3gan,6.4,2022,12,PG-13,102.0,Gerard Johnstone,"Jenna Davis, Amie Donald, Allison Williams, Vi...",United States,0,...,0,0,0,1,0,1,0,0,12000000.0,171253900.0


#### 1.6 --- Create Profit column

In [None]:
# Profit is income minus budget. Where either amount is 0 (the placeholder for a
# missing value) no profit can be computed, so 0 is stored instead.
mask = (df['Income_inf'] == 0) | (df['Budget_inf'] == 0)
df['Profit_inf'] = (df['Income_inf'] - df['Budget_inf']).where(~mask, 0)

In [None]:
df.head()

Unnamed: 0,Title,Rating,Year,Month,Certificate,Runtime,Directors,Stars,Country_of_origin,top_50_director_1,...,genre_Mystery,genre_Romance,genre_Sci-Fi,genre_Sport,genre_Thriller,genre_War,genre_Western,Budget_inf,Income_inf,Profit_inf
0,Avatar: The Way of Water,7.8,2022,12,PG-13,192.0,James Cameron,"Sam Worthington, Zoe Saldana, Sigourney Weaver...",United States,1,...,0,0,0,0,0,0,0,350000000.0,2267947000.0,1917947000.0
1,Guillermo del Toro's Pinocchio,7.6,2022,12,PG,117.0,"Guillermo del Toro, Mark Gustafson","Ewan McGregor, David Bradley, Gregory Mann, Bu...","United States, Mexico, France",1,...,0,0,0,0,0,0,0,35000000.0,108967.0,-34891030.0
2,Bullet Train,7.3,2022,8,R,127.0,David Leitch,"Brad Pitt, Joey King, Aaron Taylor Johnson, Br...","Japan, United States",0,...,0,0,0,0,1,0,0,85900000.0,239268600.0,153368600.0
3,The Banshees of Inisherin,7.8,2022,11,R,114.0,Martin McDonagh,"Colin Farrell, Brendan Gleeson, Kerry Condon, ...","Ireland, United Kingdom, United States",0,...,0,0,0,0,0,0,0,0.0,19720820.0,0.0
4,M3gan,6.4,2022,12,PG-13,102.0,Gerard Johnstone,"Jenna Davis, Amie Donald, Allison Williams, Vi...",United States,0,...,0,0,1,0,1,0,0,12000000.0,171253900.0,159253900.0


#### 1.7 --- calculate mean profit

In [None]:
# Average profit over the rows with a strictly positive profit only.
mask = df['Profit_inf'].gt(0)
df_filtered = df.loc[mask]
mean_profit = df_filtered['Profit_inf'].mean()

In [None]:
print(mean_profit)

218832012.97231796


## Problem.... movies w/ income > $218M will get a negative budget..

#### 1.8 --- Create ROI column and calculate mean_roi

In [None]:
# Return on investment: net profit relative to the amount invested (the budget).
# The previous version divided by income, which yields the profit margin instead
# and produces extreme negative values for low-grossing movies.
df['ROI_inf'] = (df['Income_inf'] - df['Budget_inf']) / df['Budget_inf']

# Rows where income or budget is 0 (placeholder for a missing value) have no
# meaningful ROI; store 0 so later steps can recognise and replace them.
mask = (df['Income_inf'] == 0) | (df['Budget_inf'] == 0)
df.loc[mask, 'ROI_inf'] = 0

In [None]:
# Mean and median ROI over the rows that have a value in both income and budget
# (rows with a missing value were given ROI_inf == 0 and are excluded here).
mask = df['ROI_inf'].ne(0)
df_filtered = df.loc[mask]
known_roi = df_filtered['ROI_inf']
mean_roi = known_roi.mean()
median_roi = known_roi.median()


In [None]:
print(mean_roi)

-2.0455907232776336


In [None]:
print(median_roi)

0.647774754833504


In [None]:
# Write the current state of the dataset to data.csv without the row index.
# NOTE(review): this export runs before step 1.9 replaces the 0 placeholders in
# ROI_inf with the median — confirm the file is meant to be written at this point.
df.to_csv('data.csv', index=False)


In [None]:
# Why is the mean ROI negative 200%? Well, because some movies failed big and have a big negative ROI.

In [None]:
df.head()

Unnamed: 0,Title,Rating,Year,Month,Certificate,Runtime,Directors,Stars,Country_of_origin,top_50_director_1,...,genre_Romance,genre_Sci-Fi,genre_Sport,genre_Thriller,genre_War,genre_Western,Budget_inf,Income_inf,Profit_inf,ROI_inf
0,Avatar: The Way of Water,7.8,2022,12,PG-13,192.0,James Cameron,"Sam Worthington, Zoe Saldana, Sigourney Weaver...",United States,1,...,0,0,0,0,0,0,350000000.0,2267947000.0,1917947000.0,0.845675
1,Guillermo del Toro's Pinocchio,7.6,2022,12,PG,117.0,"Guillermo del Toro, Mark Gustafson","Ewan McGregor, David Bradley, Gregory Mann, Bu...","United States, Mexico, France",1,...,0,0,0,0,0,0,35000000.0,108967.0,-34891030.0,-320.198161
2,Bullet Train,7.3,2022,8,R,127.0,David Leitch,"Brad Pitt, Joey King, Aaron Taylor Johnson, Br...","Japan, United States",0,...,0,0,0,1,0,0,85900000.0,239268600.0,153368600.0,0.640989
3,The Banshees of Inisherin,7.8,2022,11,R,114.0,Martin McDonagh,"Colin Farrell, Brendan Gleeson, Kerry Condon, ...","Ireland, United Kingdom, United States",0,...,0,0,0,0,0,0,0.0,19720820.0,0.0,0.0
4,M3gan,6.4,2022,12,PG-13,102.0,Gerard Johnstone,"Jenna Davis, Amie Donald, Allison Williams, Vi...",United States,0,...,0,1,0,1,0,0,12000000.0,171253900.0,159253900.0,0.929929


#### 1.9 -- replace 0 values in ROI_inf to median_ROI_inf

In [None]:
# Rows flagged with the placeholder ROI of 0 receive the median ROI instead
df['ROI_inf'] = df['ROI_inf'].mask(df['ROI_inf'] == 0, median_roi)

In [None]:
df.head()

Unnamed: 0,Title,Rating,Year,Month,Certificate,Runtime,Directors,Stars,Country_of_origin,top_50_director_1,...,genre_Romance,genre_Sci-Fi,genre_Sport,genre_Thriller,genre_War,genre_Western,Budget_inf,Income_inf,Profit_inf,ROI_inf
0,Avatar: The Way of Water,7.8,2022,12,PG-13,192.0,James Cameron,"Sam Worthington, Zoe Saldana, Sigourney Weaver...",United States,1,...,0,0,0,0,0,0,350000000.0,2267947000.0,1917947000.0,0.845675
1,Guillermo del Toro's Pinocchio,7.6,2022,12,PG,117.0,"Guillermo del Toro, Mark Gustafson","Ewan McGregor, David Bradley, Gregory Mann, Bu...","United States, Mexico, France",1,...,0,0,0,0,0,0,35000000.0,108967.0,-34891030.0,-320.198161
2,Bullet Train,7.3,2022,8,R,127.0,David Leitch,"Brad Pitt, Joey King, Aaron Taylor Johnson, Br...","Japan, United States",0,...,0,0,0,1,0,0,85900000.0,239268600.0,153368600.0,0.640989
3,The Banshees of Inisherin,7.8,2022,11,R,114.0,Martin McDonagh,"Colin Farrell, Brendan Gleeson, Kerry Condon, ...","Ireland, United Kingdom, United States",0,...,0,0,0,0,0,0,0.0,19720820.0,0.0,0.647775
4,M3gan,6.4,2022,12,PG-13,102.0,Gerard Johnstone,"Jenna Davis, Amie Donald, Allison Williams, Vi...",United States,0,...,0,1,0,1,0,0,12000000.0,171253900.0,159253900.0,0.929929


In [None]:
# Fill in a missing side of the budget/income pair from the other side, using median_roi.
# "Missing" means NaN or the placeholder 0.
#   - missing income  -> Budget_inf * (1 + median_roi)
#   - missing budget  -> Income_inf / (1 + median_roi)
# The right-hand sides are full-length Series; pandas aligns them on the index, so only
# the rows selected by the .loc mask are overwritten.
# Rows where both values are 0 remain 0, because 0 scaled by any factor is still 0.
# NOTE(review): both formulas presume median_roi means profit / budget — confirm this
# matches the way ROI_inf is computed.
df.loc[df['Income_inf'].isna() | (df['Income_inf'] == 0), 'Income_inf'] = df['Budget_inf'] * (1 + median_roi)
df.loc[df['Budget_inf'].isna() | (df['Budget_inf'] == 0), 'Budget_inf'] = df['Income_inf'] / (1 + median_roi)


In [None]:
df.head()

Unnamed: 0,Title,Rating,Year,Month,Certificate,Runtime,Directors,Stars,Country_of_origin,top_50_director_1,...,genre_Romance,genre_Sci-Fi,genre_Sport,genre_Thriller,genre_War,genre_Western,Budget_inf,Income_inf,Profit_inf,ROI_inf
0,Avatar: The Way of Water,7.8,2022,12,PG-13,192.0,James Cameron,"Sam Worthington, Zoe Saldana, Sigourney Weaver...",United States,1,...,0,0,0,0,0,0,350000000.0,2267947000.0,1917947000.0,0.845675
1,Guillermo del Toro's Pinocchio,7.6,2022,12,PG,117.0,"Guillermo del Toro, Mark Gustafson","Ewan McGregor, David Bradley, Gregory Mann, Bu...","United States, Mexico, France",1,...,0,0,0,0,0,0,35000000.0,108967.0,-34891030.0,-320.198161
2,Bullet Train,7.3,2022,8,R,127.0,David Leitch,"Brad Pitt, Joey King, Aaron Taylor Johnson, Br...","Japan, United States",0,...,0,0,0,1,0,0,85900000.0,239268600.0,153368600.0,0.640989
3,The Banshees of Inisherin,7.8,2022,11,R,114.0,Martin McDonagh,"Colin Farrell, Brendan Gleeson, Kerry Condon, ...","Ireland, United Kingdom, United States",0,...,0,0,0,0,0,0,11968150.0,19720820.0,0.0,0.647775
4,M3gan,6.4,2022,12,PG-13,102.0,Gerard Johnstone,"Jenna Davis, Amie Donald, Allison Williams, Vi...",United States,0,...,0,1,0,1,0,0,12000000.0,171253900.0,159253900.0,0.929929


#### 1.10 --- update profit_inf

In [None]:
# Recompute profit from the (now partly imputed) income and budget columns
df['Profit_inf'] = df['Income_inf'].sub(df['Budget_inf'])

In [None]:
df.head()

Unnamed: 0,Title,Rating,Year,Month,Certificate,Runtime,Directors,Stars,Country_of_origin,top_50_director_1,...,genre_Romance,genre_Sci-Fi,genre_Sport,genre_Thriller,genre_War,genre_Western,Budget_inf,Income_inf,Profit_inf,ROI_inf
0,Avatar: The Way of Water,7.8,2022,12,PG-13,192.0,James Cameron,"Sam Worthington, Zoe Saldana, Sigourney Weaver...",United States,1,...,0,0,0,0,0,0,350000000.0,2267947000.0,1917947000.0,0.845675
1,Guillermo del Toro's Pinocchio,7.6,2022,12,PG,117.0,"Guillermo del Toro, Mark Gustafson","Ewan McGregor, David Bradley, Gregory Mann, Bu...","United States, Mexico, France",1,...,0,0,0,0,0,0,35000000.0,108967.0,-34891030.0,-320.198161
2,Bullet Train,7.3,2022,8,R,127.0,David Leitch,"Brad Pitt, Joey King, Aaron Taylor Johnson, Br...","Japan, United States",0,...,0,0,0,1,0,0,85900000.0,239268600.0,153368600.0,0.640989
3,The Banshees of Inisherin,7.8,2022,11,R,114.0,Martin McDonagh,"Colin Farrell, Brendan Gleeson, Kerry Condon, ...","Ireland, United Kingdom, United States",0,...,0,0,0,0,0,0,11968150.0,19720820.0,7752668.0,0.647775
4,M3gan,6.4,2022,12,PG-13,102.0,Gerard Johnstone,"Jenna Davis, Amie Donald, Allison Williams, Vi...",United States,0,...,0,1,0,1,0,0,12000000.0,171253900.0,159253900.0,0.929929


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1869 entries, 0 to 1999
Data columns (total 39 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Title              1869 non-null   object 
 1   Rating             1869 non-null   float64
 2   Year               1869 non-null   int64  
 3   Month              1869 non-null   int64  
 4   Certificate        1849 non-null   object 
 5   Runtime            1869 non-null   float64
 6   Directors          1869 non-null   object 
 7   Stars              1869 non-null   object 
 8   Country_of_origin  1869 non-null   object 
 9   top_50_director_1  1869 non-null   int64  
 10  top_50_director_2  1869 non-null   int64  
 11  top_1000_Stars_1   1869 non-null   int64  
 12  top_1000_Stars_2   1869 non-null   int64  
 13  top_1000_Stars_3   1869 non-null   int64  
 14  top_1000_Stars_4   1869 non-null   int64  
 15  genre_Action       1869 non-null   uint8  
 16  genre_Adventure    1869 

In [None]:
# Snapshot of the frame's dimensions (rows, columns) after the budget and income steps
rows_post_budget, col_post_budget = df.shape
rows_post_income, col_post_income = df.shape

### 13.Country_of_origin

In [None]:
# Frame dimensions (rows, columns) before the Country_of_origin step
rows_pre_country_of_origin, col_pre_country_of_origin = df.shape

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1869 entries, 0 to 1999
Data columns (total 39 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Title              1869 non-null   object 
 1   Rating             1869 non-null   float64
 2   Year               1869 non-null   int64  
 3   Month              1869 non-null   int64  
 4   Certificate        1849 non-null   object 
 5   Runtime            1869 non-null   float64
 6   Directors          1869 non-null   object 
 7   Stars              1869 non-null   object 
 8   Country_of_origin  1869 non-null   object 
 9   top_50_director_1  1869 non-null   int64  
 10  top_50_director_2  1869 non-null   int64  
 11  top_1000_Stars_1   1869 non-null   int64  
 12  top_1000_Stars_2   1869 non-null   int64  
 13  top_1000_Stars_3   1869 non-null   int64  
 14  top_1000_Stars_4   1869 non-null   int64  
 15  genre_Action       1869 non-null   uint8  
 16  genre_Adventure    1869 

In [None]:
df.head(2)

Unnamed: 0,Title,Rating,Year,Month,Certificate,Runtime,Directors,Stars,Country_of_origin,top_50_director_1,...,genre_Romance,genre_Sci-Fi,genre_Sport,genre_Thriller,genre_War,genre_Western,Budget_inf,Income_inf,Profit_inf,ROI_inf
0,Avatar: The Way of Water,7.8,2022,12,PG-13,192.0,James Cameron,"Sam Worthington, Zoe Saldana, Sigourney Weaver...",United States,1,...,0,0,0,0,0,0,350000000.0,2267947000.0,1917947000.0,0.845675
1,Guillermo del Toro's Pinocchio,7.6,2022,12,PG,117.0,"Guillermo del Toro, Mark Gustafson","Ewan McGregor, David Bradley, Gregory Mann, Bu...","United States, Mexico, France",1,...,0,0,0,0,0,0,35000000.0,108967.0,-34891030.0,-320.198161


#### 13.1 --- Unique Values

In [None]:
# One row per (movie, country) pair, then count how often each country appears
unique_values_count = (
    df['Country_of_origin']
    .str.split(', ')
    .explode()
    .value_counts()
)
print(unique_values_count.size)
print(unique_values_count)

61
United States     1675
United Kingdom     436
Canada             204
France             180
Germany            171
                  ... 
Jordan               1
Tunisia              1
Lebanon              1
Cyprus               1
Kenya                1
Name: Country_of_origin, Length: 61, dtype: int64


#### 13.2 Drop column

In [None]:
# Remove the Country_of_origin column from the frame
df = df.drop(columns='Country_of_origin')

In [None]:
#check it has been dropped
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1869 entries, 0 to 1999
Data columns (total 38 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Title              1869 non-null   object 
 1   Rating             1869 non-null   float64
 2   Year               1869 non-null   int64  
 3   Month              1869 non-null   int64  
 4   Certificate        1849 non-null   object 
 5   Runtime            1869 non-null   float64
 6   Directors          1869 non-null   object 
 7   Stars              1869 non-null   object 
 8   top_50_director_1  1869 non-null   int64  
 9   top_50_director_2  1869 non-null   int64  
 10  top_1000_Stars_1   1869 non-null   int64  
 11  top_1000_Stars_2   1869 non-null   int64  
 12  top_1000_Stars_3   1869 non-null   int64  
 13  top_1000_Stars_4   1869 non-null   int64  
 14  genre_Action       1869 non-null   uint8  
 15  genre_Adventure    1869 non-null   uint8  
 16  genre_Animation    1869 

In [None]:
# Frame dimensions (rows, columns) after the Country_of_origin step
rows_post_country_of_origin, col_post_country_of_origin = df.shape

## Certificate

In [None]:
# Frame dimensions (rows, columns) before the Certificate step
rows_pre_certificate, col_pre_certificate = df.shape

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1869 entries, 0 to 1999
Data columns (total 38 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Title              1869 non-null   object 
 1   Rating             1869 non-null   float64
 2   Year               1869 non-null   int64  
 3   Month              1869 non-null   int64  
 4   Certificate        1849 non-null   object 
 5   Runtime            1869 non-null   float64
 6   Directors          1869 non-null   object 
 7   Stars              1869 non-null   object 
 8   top_50_director_1  1869 non-null   int64  
 9   top_50_director_2  1869 non-null   int64  
 10  top_1000_Stars_1   1869 non-null   int64  
 11  top_1000_Stars_2   1869 non-null   int64  
 12  top_1000_Stars_3   1869 non-null   int64  
 13  top_1000_Stars_4   1869 non-null   int64  
 14  genre_Action       1869 non-null   uint8  
 15  genre_Adventure    1869 non-null   uint8  
 16  genre_Animation    1869 

#### 1 --- Unique Values

In [None]:
# Distinct Certificate labels; unique() also reports missing values (NaN) as an entry
unique_values = df['Certificate'].unique()

In [None]:
# Number of movies per Certificate label; value_counts() leaves NaN out by default
unique_values_count = df['Certificate'].value_counts()

In [None]:
print(unique_values)

['PG-13' 'PG' 'R' 'TV-MA' 'Not Rated' nan 'TV-14' 'NC-17' 'Unrated' 'TV-G'
 'G']


In [None]:
print(unique_values_count)

R            831
PG-13        695
PG           214
Not Rated     55
G             21
TV-MA         20
NC-17          5
TV-14          4
Unrated        3
TV-G           1
Name: Certificate, dtype: int64


#### 1.2 --- Drop rows

In [None]:
# Discard movies without a usable certificate: missing, 'Not Rated' or 'Unrated'
df = df.drop(
    index=df.index[
        df['Certificate'].isna()
        | df['Certificate'].isin(['Not Rated', 'Unrated'])
    ]
)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1791 entries, 0 to 1999
Data columns (total 38 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Title              1791 non-null   object 
 1   Rating             1791 non-null   float64
 2   Year               1791 non-null   int64  
 3   Month              1791 non-null   int64  
 4   Certificate        1791 non-null   object 
 5   Runtime            1791 non-null   float64
 6   Directors          1791 non-null   object 
 7   Stars              1791 non-null   object 
 8   top_50_director_1  1791 non-null   int64  
 9   top_50_director_2  1791 non-null   int64  
 10  top_1000_Stars_1   1791 non-null   int64  
 11  top_1000_Stars_2   1791 non-null   int64  
 12  top_1000_Stars_3   1791 non-null   int64  
 13  top_1000_Stars_4   1791 non-null   int64  
 14  genre_Action       1791 non-null   uint8  
 15  genre_Adventure    1791 non-null   uint8  
 16  genre_Animation    1791 

In [None]:
# Re-count the Certificate labels to check which ones remain after the drop
unique_values_count_after_drop = df['Certificate'].value_counts()
print(unique_values_count_after_drop)

R        831
PG-13    695
PG       214
G         21
TV-MA     20
NC-17      5
TV-14      4
TV-G       1
Name: Certificate, dtype: int64


#### 1.3 --- one hot encoding

In [None]:
# Encode Certificate with the notebook's one_hot_encoding_column helper (defined outside
# this section) and keep the returned frame.
# NOTE(review): the recorded output shows rated_<label> columns and no Certificate
# column afterwards — presumably the helper drops the source column; confirm in its definition.
df = one_hot_encoding_column(df, "Certificate", prefix = "rated_")

In [None]:
df.head(2)

Unnamed: 0,Title,Rating,Year,Month,Runtime,Directors,Stars,top_50_director_1,top_50_director_2,top_1000_Stars_1,...,Profit_inf,ROI_inf,rated_G,rated_NC-17,rated_PG,rated_PG-13,rated_R,rated_TV-14,rated_TV-G,rated_TV-MA
0,Avatar: The Way of Water,7.8,2022,12,192.0,James Cameron,"Sam Worthington, Zoe Saldana, Sigourney Weaver...",1,0,1,...,1917947000.0,0.845675,0,0,0,1,0,0,0,0
1,Guillermo del Toro's Pinocchio,7.6,2022,12,117.0,"Guillermo del Toro, Mark Gustafson","Ewan McGregor, David Bradley, Gregory Mann, Bu...",1,0,1,...,-34891030.0,-320.198161,0,0,1,0,0,0,0,0


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1791 entries, 0 to 1999
Data columns (total 45 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Title              1791 non-null   object 
 1   Rating             1791 non-null   float64
 2   Year               1791 non-null   int64  
 3   Month              1791 non-null   int64  
 4   Runtime            1791 non-null   float64
 5   Directors          1791 non-null   object 
 6   Stars              1791 non-null   object 
 7   top_50_director_1  1791 non-null   int64  
 8   top_50_director_2  1791 non-null   int64  
 9   top_1000_Stars_1   1791 non-null   int64  
 10  top_1000_Stars_2   1791 non-null   int64  
 11  top_1000_Stars_3   1791 non-null   int64  
 12  top_1000_Stars_4   1791 non-null   int64  
 13  genre_Action       1791 non-null   uint8  
 14  genre_Adventure    1791 non-null   uint8  
 15  genre_Animation    1791 non-null   uint8  
 16  genre_Biography    1791 

In [None]:
# Frame dimensions (rows, columns) after the Certificate step
rows_post_certificate, col_post_certificate = df.shape

In [None]:
# for display purposes at the end
#headers = ['title', 'rating', 'year', 'month', 'runtime', 'directors', 'stars', 'genre', ]
#rows_dropped = [0] * len(headers)

#for i in range(len(headers)):
#    pre_count = globals().get(f"rows_pre_{headers[i]}", 0)
#    post_count = globals().get(f"rows_post_{headers[i]}", 0)
#    rows_dropped[i] = pre_count - post_count
#    print(f"Rows dropped for {headers[i]}: {rows_dropped[i]}")

In [None]:
#df.to_csv('movies_processed.csv', index=False)

## Scraping more data from IMDb

### 1. Getting number of votes and metascore for each movie

In [None]:
# Run the IMDb movie lookup (helper defined outside this section) over the 'Title' column.
# NOTE(review): the recorded run logs several HTTP 500 errors from IMDb and stops with
# "timeout: The read operation timed out" after row 341 — confirm how the helper handles
# network failures and whether the remaining rows were ever fetched.
searching_IMDb_movie('Title', df)

0. Avatar: The Way of Water
1. Guillermo del Toro's Pinocchio
2. Bullet Train
3. The Banshees of Inisherin
4. M3gan
5. Emancipation
6. Amsterdam
7. Violent Night
8. The Whale
9. The Fabelmans
10. The Menu
11. Babylon
12. X
13. Bones and All
14. Black Adam
15. Spirited
19. Black Panther: Wakanda Forever
20. Glass Onion: A Knives Out Mystery
21. Triangle of Sadness
22. Everything Everywhere All at Once
23. Emily the Criminal
24. God's Crooked Lines
25. Don't Worry Darling
27. Top Gun: Maverick
28. Smile
29. The Batman
31. Barbarian
32. She Said
33. Ticket to Paradise
35. Prey for the Devil
36. The Northman
37. The Pale Blue Eye
38. Puss in Boots: The Last Wish
40. Tár
46. Nope
47. Where the Crawdads Sing
48. Scream
49. Strange World
55. Pearl
56. The Woman King
59. White Noise
61. Thor: Love and Thunder
63. Empire of Light
65. Slumberland
66. Aftersun
67. Roald Dahl's Matilda the Musical
68. Devotion
70. See How They Run
71. The Stranger
72. Elvis
73. Weird: The Al Yankovic Story
74. The

2023-04-01 10:51:50,427 CRITICAL [imdbpy] /Users/vedabojar/opt/anaconda3/lib/python3.9/site-packages/imdb/_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/find/?q=Monster+Hunter&s=tt', 'proxy': '', 'exception type': 'IOError', 'original exception': <HTTPError 500: 'Internal Server Error'>},); kwds: {}
Traceback (most recent call last):
  File "/Users/vedabojar/opt/anaconda3/lib/python3.9/site-packages/imdb/parser/http/__init__.py", line 233, in retrieve_unicode
    response = uopener.open(url)
  File "/Users/vedabojar/opt/anaconda3/lib/python3.9/urllib/request.py", line 523, in open
    response = meth(req, response)
  File "/Users/vedabojar/opt/anaconda3/lib/python3.9/urllib/request.py", line 632, in http_response
    response = self.parent.error(
  File "/Users/vedabojar/opt/anaconda3/lib/python3.9/urllib/request.py", line 561, in error
    return self._call_chain(*args)
  File "/Users/vedabojar/opt/anaco

263. Monster Hunter
264. Run Sweetheart Run
265. Fantasy Island
266. The Call of the Wild
267. Unhinged


2023-04-01 10:52:40,147 CRITICAL [imdbpy] /Users/vedabojar/opt/anaconda3/lib/python3.9/site-packages/imdb/_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/find/?q=The+Princess+Switch%3A+Switched+Again&s=tt', 'proxy': '', 'exception type': 'IOError', 'original exception': <HTTPError 500: 'Internal Server Error'>},); kwds: {}
Traceback (most recent call last):
  File "/Users/vedabojar/opt/anaconda3/lib/python3.9/site-packages/imdb/parser/http/__init__.py", line 233, in retrieve_unicode
    response = uopener.open(url)
  File "/Users/vedabojar/opt/anaconda3/lib/python3.9/urllib/request.py", line 523, in open
    response = meth(req, response)
  File "/Users/vedabojar/opt/anaconda3/lib/python3.9/urllib/request.py", line 632, in http_response
    response = self.parent.error(
  File "/Users/vedabojar/opt/anaconda3/lib/python3.9/urllib/request.py", line 561, in error
    return self._call_chain(*args)
  File "/Us

268. The Princess Switch: Switched Again
269. The Trial of the Chicago 7
270. The Rental
271. Honest Thief
272. Bill & Ted Face the Music


2023-04-01 10:53:35,832 CRITICAL [imdbpy] /Users/vedabojar/opt/anaconda3/lib/python3.9/site-packages/imdb/_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/find/?q=Fatale&s=tt', 'proxy': '', 'exception type': 'IOError', 'original exception': <HTTPError 500: 'Internal Server Error'>},); kwds: {}
Traceback (most recent call last):
  File "/Users/vedabojar/opt/anaconda3/lib/python3.9/site-packages/imdb/parser/http/__init__.py", line 233, in retrieve_unicode
    response = uopener.open(url)
  File "/Users/vedabojar/opt/anaconda3/lib/python3.9/urllib/request.py", line 523, in open
    response = meth(req, response)
  File "/Users/vedabojar/opt/anaconda3/lib/python3.9/urllib/request.py", line 632, in http_response
    response = self.parent.error(
  File "/Users/vedabojar/opt/anaconda3/lib/python3.9/urllib/request.py", line 561, in error
    return self._call_chain(*args)
  File "/Users/vedabojar/opt/anaconda3/lib

273. Fatale
274. The Midnight Sky
276. Bad Boys for Life
278. The Wolf of Snow Hollow
279. I Care a Lot
281. Ammonite
282. Friendsgiving
285. Horizon Line
286. Demon Slayer the Movie: Mugen Train
287. The Dry
291. The Nest
292. The Empty Man
293. Zola
294. Onward
295. Shadow in the Cloud
298. Minari
300. Knives Out
301. Klaus
302. Last Christmas
303. Once Upon a Time in Hollywood
304. Midsommar
305. Avengers: Endgame
306. Joker
307. The Gentlemen
308. Parasite
309. The Addams Family
310. Little Women
313. Alita: Battle Angel
314. 1917
315. The Irishman
316. Star Wars: The Rise Of Skywalker
317. After
318. Terminator: Dark Fate
319. Ford v Ferrari
320. Shazam!
321. Five Feet Apart
322. Uncut Gems
323. Pinocchio
325. Jojo Rabbit
327. The Lighthouse
328. The King
329. Us
330. John Wick: Chapter 3 - Parabellum
331. Cats
332. Doctor Sleep
333. The Platform
334. Hustlers
335. Frozen II
336. Ad Astra
337. Aladdin
338. Ready or Not
339. Captain Marvel
340. Glass
341. Spider-Man: Far from Home


timeout: The read operation timed out

### 2. Fetching awards info, avg. movie rating and number of movies for the first director

In [None]:
# 1. Add a 'director' column holding the first name listed in 'Directors'
#    (an empty string when 'Directors' is missing)
df['director'] = df['Directors'].fillna('').str.split(',').str[0]

In [None]:
# Run the IMDb person lookup (helper defined outside this section) over the 'director' column.
# NOTE(review): the third argument looks like the role to search for — confirm against the helper.
searching_IMDb_person('director', df, 'director')

### 3. Fetching awards info, avg. movie rating and number of movies for the first two actors

In [None]:
# 1. Add a 'lead' column holding the first name listed in 'Stars'
#    (an empty string when 'Stars' is missing)
df['lead'] = df['Stars'].fillna('').str.split(',').str[0]

In [None]:
# Run the IMDb person lookup (helper defined outside this section) over the 'lead' column.
# NOTE(review): the third argument looks like the role to search for — confirm against the helper.
searching_IMDb_person('lead', df, 'actor')

In [None]:
# 2. Creating a new column 'supporting' with the second value of 'Stars'
# 'Stars' is a ', '-separated list, so the second piece obtained by splitting on ','
# begins with a space; strip it so the stored name is clean before it is searched.
# Movies that list fewer than two stars end up with a missing value here.
df['supporting'] = df['Stars'].fillna('').str.split(',', expand=True)[1].str.strip()

In [None]:
# Run the IMDb person lookup (helper defined outside this section) over the 'supporting' column.
# NOTE(review): the third argument looks like the role to search for — confirm against the helper.
searching_IMDb_person('supporting', df, 'actor')