### Importing Dependencies

In [1]:
import pandas as pd

### Loading Dataset

In [2]:
df = pd.read_csv("./data/movies.csv")
df_continents = pd.read_csv("./data/continents.csv")

### Functions

#### 1.1 --- check_non_numeric_values

In [3]:
def check_non_numeric_values(df, column):
    """Function takes in dataset and column. No kreturn, Printing out found non numeric values in the column."""

    # convert column to numeric data type
    numeric_col = pd.to_numeric(df[column], errors='coerce')

    # get the non-numeric values and their counts
    non_numeric_values = df[column][numeric_col.isna()].value_counts()

    # check if there are any non-numeric values
    if non_numeric_values.empty:
        print("No non numeric values in that column.")
    else:
        # create a table with non-numeric values and their counts
        non_numeric_table = pd.DataFrame({'Non-Numeric Value': non_numeric_values.index,
                                          'Count': non_numeric_values.values})

        # display the table
        print(non_numeric_table)

#### 2.1 --- convert_to_usd

In [4]:
def convert_to_usd(amount):
    amount.replace(' ', '')
    amount.replace('\xa0', '')
    if amount.startswith('$'):
        amount = amount.strip('$').replace(',', '')   # must remove commas
        return float(amount)   # convert str into float
    elif amount.startswith('€'):
        # Exchange rate for EUR to USD
        amount = amount.strip('€').replace(',', '')
        return float(amount) * 1.06 
    elif amount.startswith('¥'):
        # Exchange rate for YEN to USD
        amount = amount.strip('¥').replace(',', '')
        return float(amount) * 0.0075
    elif amount.startswith('₹'):
        # Exchange rate for RPL to USD
        amount = amount.strip('₹').replace(',', '')
        return float(amount) * 0.012 
    elif amount.startswith('SEK'):
        # Exchange rate for SEK to USD
        amount = amount.strip('SEK').replace(',', '')
        return float(amount) * 0.094
    elif amount.startswith('DKK'):
        # Exchange rate for RPL to USD
        amount = amount.strip('DKK').replace(',', '')
        return float(amount) * 0.14
    elif amount.startswith('£'):
        # Exchange rate for RPL to USD
        amount = amount.strip('£').replace(',', '')
        return float(amount) * 1.21  
    else:
        return None

#### 2.2 --- adjust_for_inflation

In [5]:
def adjust_for_inflation(df, column_name, year_column, new_column, drop_original=True):
    data = {
        "1990": 5.398,
        "1991": 4.235,
        "1992": 3.0288,
        "1993": 2.9517,
        "1994": 2.6074,
        "1995": 2.8054,
        "1996": 2.9312,
        "1997": 2.3377,
        "1998": 1.5523,
        "1999": 2.188,
        "2000": 3.3769,
        "2001": 2.8262,
        "2002": 1.586,
        "2003": 2.2701,
        "2004": 2.6772,
        "2005": 3.3927,
        "2006": 3.2259,
        "2007": 2.8527,
        "2008": 3.8391,
        "2009": -0.3555,
        "2010": 1.64,
        "2011": 3.1568,
        "2012": 2.0693,
        "2013": 1.4648,
        "2014": 1.6222,
        "2015": 0.1186,
        "2016": 1.2616,
        "2017": 2.1301,
        "2018": 2.4426,
        "2019": 1.8122,
        "2020": 1.2336,
        "2021": 4.6979
    }
    
    # Create a new column in the DataFrame to store the adjusted values
    df[new_column] = 0

    # Loop over the rows in the DataFrame
    for index, row in df.iterrows():
        # Get the year from the row
        year = row[year_column]
        # Skip the row if the value in the specified column is NaN
        if pd.isna(row[column_name]):
            continue
        # Get the inflation rate for each year from the dictionary
        inflation_rates = [data[str(yr)] for yr in range(year, 2022)]
        # Calculate the total inflation factor by multiplying the inflation rates together
        total_inflation_factor = 1
        for rate in inflation_rates:
            total_inflation_factor *= 1 + (rate / 100)
        # Get the value from the specified column
        value = row[column_name]
        # Adjust the value for inflation using the total inflation factor
        adjusted_value = value * total_inflation_factor
        # Round the result to two decimal places and store it in the new column
        df.at[index, new_column] = round(adjusted_value, 2)

    # Drop the original column if specified
    if drop_original:
        df = df.drop(columns=[column_name])

    # Return the DataFrame with the adjusted values
    return df


#### 3.1 --- one_hot_encoding_column

In [6]:
def one_hot_encoding_column(dataset, column, separator=", ", prefix=""):
    """
    Performs one-hot encoding on the specified column of the given dataset.
    dataset: The dataset to be processed.
    column: The name of the column to be one-hot encoded.
    separator: The separator used in the values of the specified column. Defaults to ",".
    prefix: Optional string to be added in front of each new column name. Defaults to "".
    returns: the new dataset with the specified column one-hot encoded.
    """

    # 1. Creating a list with all the values mentioned in the dataset
    value_list = [values.split(separator) for values in dataset[column]]

    # 2. Creating a set with value categories
    unique_v = {value for values in value_list for value in values}

    # 3. Performing one-hot encoding using get_dummies method
    value_subtable = pd.get_dummies(dataset[column].str.split(separator, expand=True).stack()).reset_index(level=1, drop=True)
    value_subtable = value_subtable.groupby(value_subtable.index).sum()

    # 4. Adding the prefix to the column names
    if prefix:
        value_subtable.columns = [prefix + str(col) for col in value_subtable.columns]

    # 5. Merging the subtable with the main dataset
    dataset_processed = pd.merge(dataset, value_subtable, left_index=True, right_index=True, how='left')
    dataset_processed.drop(columns=[column], inplace=True)

    # 6. Returning the new dataset
    return dataset_processed

#### 3.1 --- one_hot_coding_binary

In [7]:
def one_hot_coding_binary(dataset, original_column, prefix, file_column, file_location, separator=", ", num_categories=1, drop_original=True):
    if num_categories not in range(1,5):
        raise ValueError("num_categories must be between 1 and 4")

    for i in range(1, num_categories+1):
        dataset[f"{prefix}_no_{i}"] = dataset[original_column].str.split(separator, expand=True)[i-1]

    dataset_categories = pd.read_csv(file_location)

    for i in range(1, num_categories+1):
        replace = dataset[f"{prefix}_no_{i}"].isin(dataset_categories[file_column])
        dataset[f"{prefix}_no_{i}_binary"] = replace.astype(int)

    if drop_original:
        dataset.drop(columns=[original_column], inplace=True)

    if num_categories == 1:
        dataset.drop(columns=[f"{prefix}_no_1"], inplace=True)
        dataset.rename(columns={f"{prefix}_no_1_binary": f"{prefix}"}, inplace=True)
    else:
        for i in range(1, num_categories+1):
            dataset.drop(columns=[f"{prefix}_no_{i}"], inplace=True)
            dataset.rename(columns={f"{prefix}_no_{i}_binary": f"{prefix}_{i}"}, inplace=True)

        if num_categories == 5:
            dataset.rename(columns={f"{prefix}_all_binary": f"{prefix}_all"}, inplace=True)

    return dataset


### Dataset

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Title              2000 non-null   object 
 1   Rating             1999 non-null   float64
 2   Year               2000 non-null   int64  
 3   Month              2000 non-null   object 
 4   Certificate        1966 non-null   object 
 5   Runtime            2000 non-null   object 
 6   Directors          2000 non-null   object 
 7   Stars              2000 non-null   object 
 8   Genre              2000 non-null   object 
 9   Filming_location   2000 non-null   object 
 10  Budget             2000 non-null   object 
 11  Income             2000 non-null   object 
 12  Country_of_origin  2000 non-null   object 
dtypes: float64(1), int64(1), object(11)
memory usage: 203.2+ KB


In [9]:
df.head(2)

Unnamed: 0,Title,Rating,Year,Month,Certificate,Runtime,Directors,Stars,Genre,Filming_location,Budget,Income,Country_of_origin
0,Avatar: The Way of Water,7.8,2022,December,PG-13,192,James Cameron,"Sam Worthington, Zoe Saldana, Sigourney Weaver...","Action, Adventure, Fantasy",New Zealand,"$350,000,000","$2,267,946,983",United States
1,Guillermo del Toro's Pinocchio,7.6,2022,December,PG,117,"Guillermo del Toro, Mark Gustafson","Ewan McGregor, David Bradley, Gregory Mann, Bu...","Animation, Drama, Family",USA,"$35,000,000","$108,967","United States, Mexico, France"


### Title

#### Duplicates

In [10]:
duplicates = df[df.duplicated(['Title'], keep=False)].sort_values(by=['Title'])

In [11]:
print("Duplicates: " + str(len(duplicates)))

Duplicates: 22


In [12]:
duplicates

Unnamed: 0,Title,Rating,Year,Month,Certificate,Runtime,Directors,Stars,Genre,Filming_location,Budget,Income,Country_of_origin
1610,Black Christmas,4.6,2006,December,R,95,Glen Morgan,"Michelle Trachtenberg, Mary Elizabeth Winstead...",Horror,Canada,"$9,000,000","$21,510,851","Canada, United States"
348,Black Christmas,3.5,2019,December,PG-13,92,Sophia Takal,"Imogen Poots, Aleyse Shannon, Lily Donoghue, B...","Horror, Mystery, Thriller",New Zealand,"$5,000,000","$18,529,730","United States, New Zealand, Canada"
1746,Fantastic Four,5.7,2005,July,PG-13,106,Tim Story,"Ioan Gruffudd, Michael Chiklis, Chris Evans, J...","Action, Adventure, Fantasy",Canada,"$100,000,000","$333,535,934","Germany, United States"
767,Fantastic Four,4.3,2015,August,PG-13,100,Josh Trank,"Miles Teller, Kate Mara, Michael B Jordan, Jam...","Action, Adventure, Sci-Fi",USA,"$120,000,000","$167,882,881","United States, Germany, United Kingdom"
1263,Frozen,6.2,2010,March,R,93,Adam Green,"Shawn Ashmore, Emma Bell, Kevin Zegers, Ed Ack...","Adventure, Drama, Mystery",USA,Unknown,"$3,843,774",United States
903,Frozen,7.4,2013,November,PG,102,"Chris Buck, Jennifer Lee","Kristen Bell, Idina Menzel, Jonathan Groff, Jo...","Animation, Adventure, Comedy",Norge,"$150,000,000","$1,304,550,716",United States
357,Hellboy,5.2,2019,April,R,120,Neil Marshall,"David Harbour, Milla Jovovich, Ian McShane, Sa...","Action, Adventure, Fantasy",Bulgaria,"$50,000,000","$55,065,289","United States, United Kingdom, Bulgaria, Canad..."
1843,Hellboy,6.8,2004,April,PG-13,122,Guillermo del Toro,"Ron Perlman, Doug Jones, Selma Blair, John Hurt","Action, Adventure, Fantasy",Czech Republic,"$66,000,000","$99,378,985",United States
1905,Oldboy,8.4,2003,November,R,120,Park Chan wook,"Choi Min sik, Yoo Ji tae, Kang Hye jeong, Kim ...","Action, Drama, Mystery",South Korea,"$3,000,000","$15,421,226",South Korea
920,Oldboy,5.8,2013,November,R,104,Spike Lee,"Josh Brolin, Elizabeth Olsen, Samuel L Jackson...","Action, Drama, Mystery",USA,"$30,000,000","$5,186,767",United States


### Rating

#### Missing values

In [13]:
#dropping all rows that have missing values in the column Rating
df = df.dropna(subset=['Rating'])

In [14]:
#A quick check to see that the row was removed from the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1999 entries, 0 to 1999
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Title              1999 non-null   object 
 1   Rating             1999 non-null   float64
 2   Year               1999 non-null   int64  
 3   Month              1999 non-null   object 
 4   Certificate        1965 non-null   object 
 5   Runtime            1999 non-null   object 
 6   Directors          1999 non-null   object 
 7   Stars              1999 non-null   object 
 8   Genre              1999 non-null   object 
 9   Filming_location   1999 non-null   object 
 10  Budget             1999 non-null   object 
 11  Income             1999 non-null   object 
 12  Country_of_origin  1999 non-null   object 
dtypes: float64(1), int64(1), object(11)
memory usage: 218.6+ KB


### Year

#### Outliers

In [15]:
# Count how many values are between 0 and 10 (inclusive)
column_values = df['Year']
count = column_values.between(2002, 2023).sum()

### Month

#### Unqiue values

In [16]:
# Check all unique values in the column Month
column_values = df['Month'].unique()
print(column_values)

['December' 'August' 'November' 'October' 'March' 'September' 'May'
 'April' 'January' 'July' 'June' 'February' '2014' '2008']


In [17]:
# Count the number of times a specific string value occurs in a column
count_2014 = df['Month'].value_counts()['2014']
print("2014: " + str(count_2014))
# Count the number of times a specific string value occurs in a column
count_2008 = df['Month'].value_counts()['2008']
print("2008: " + str(count_2008))

2014: 1
2008: 1


In [18]:
#Drop non Month-values
df = df.drop(index=df.loc[df['Month'] == '2014'].index)
df = df.drop(index=df.loc[df['Month'] == '2008'].index)

In [19]:
#checking the rows have been dropped
df.info()
column_values = df['Month'].unique()
print(column_values)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1997 entries, 0 to 1999
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Title              1997 non-null   object 
 1   Rating             1997 non-null   float64
 2   Year               1997 non-null   int64  
 3   Month              1997 non-null   object 
 4   Certificate        1964 non-null   object 
 5   Runtime            1997 non-null   object 
 6   Directors          1997 non-null   object 
 7   Stars              1997 non-null   object 
 8   Genre              1997 non-null   object 
 9   Filming_location   1997 non-null   object 
 10  Budget             1997 non-null   object 
 11  Income             1997 non-null   object 
 12  Country_of_origin  1997 non-null   object 
dtypes: float64(1), int64(1), object(11)
memory usage: 218.4+ KB
['December' 'August' 'November' 'October' 'March' 'September' 'May'
 'April' 'January' 'July' 'June' 'Februa

#### Converting non-numeric to numeric values

In [20]:
# Define a dictionary to map months to integers
month_to_int = {'January': 1, 'February': 2, 'March': 3, 'April': 4, 'May': 5, 'June': 6,
                'July': 7, 'August': 8, 'September': 9, 'October': 10, 'November': 11, 'December': 12}

# Apply the map() method to convert the values
df['Month'] = df['Month'].map(month_to_int)

# Convert the type of the column to int
df['Month'] = df['Month'].astype(int)

In [21]:
# Check the new column
column_values = df['Month'].unique()
column_values.sort()
print("Unique values: " + str(column_values))
df.info()

Unique values: [ 1  2  3  4  5  6  7  8  9 10 11 12]
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1997 entries, 0 to 1999
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Title              1997 non-null   object 
 1   Rating             1997 non-null   float64
 2   Year               1997 non-null   int64  
 3   Month              1997 non-null   int32  
 4   Certificate        1964 non-null   object 
 5   Runtime            1997 non-null   object 
 6   Directors          1997 non-null   object 
 7   Stars              1997 non-null   object 
 8   Genre              1997 non-null   object 
 9   Filming_location   1997 non-null   object 
 10  Budget             1997 non-null   object 
 11  Income             1997 non-null   object 
 12  Country_of_origin  1997 non-null   object 
dtypes: float64(1), int32(1), int64(1), object(10)
memory usage: 210.6+ KB


### Certificate

#### Unqiue values

In [22]:
unique_values = df['Certificate'].unique()
print(unique_values)
unique_values_count = df['Certificate'].value_counts()
print(unique_values_count)

['PG-13' 'PG' 'R' 'TV-14' 'TV-MA' 'TV-PG' 'TV-Y7' 'Not Rated' nan 'NC-17'
 'TV-G' 'Unrated' 'G']
R            867
PG-13        712
PG           225
Not Rated     61
TV-MA         40
G             22
TV-14         12
TV-PG          9
NC-17          6
Unrated        6
TV-G           3
TV-Y7          1
Name: Certificate, dtype: int64


In [23]:
#Drop 'Not Rated', 'Untated' and missing values
df = df.dropna(subset=['Certificate'])
df = df[~df['Certificate'].isin(['Not Rated', 'Unrated'])]

In [24]:
# Check that the rows have been dropped
unique_values = df['Certificate'].unique()
print(unique_values)
unique_values_count = df['Certificate'].value_counts()
print(unique_values_count)
df.info()

['PG-13' 'PG' 'R' 'TV-14' 'TV-MA' 'TV-PG' 'TV-Y7' 'NC-17' 'TV-G' 'G']
R        867
PG-13    712
PG       225
TV-MA     40
G         22
TV-14     12
TV-PG      9
NC-17      6
TV-G       3
TV-Y7      1
Name: Certificate, dtype: int64
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1897 entries, 0 to 1999
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Title              1897 non-null   object 
 1   Rating             1897 non-null   float64
 2   Year               1897 non-null   int64  
 3   Month              1897 non-null   int32  
 4   Certificate        1897 non-null   object 
 5   Runtime            1897 non-null   object 
 6   Directors          1897 non-null   object 
 7   Stars              1897 non-null   object 
 8   Genre              1897 non-null   object 
 9   Filming_location   1897 non-null   object 
 10  Budget             1897 non-null   object 
 11  Income             1897 non-null

#### Convert Certificate to numeric with One hot encoding

In [25]:
df = one_hot_encoding_column(df, "Certificate", prefix="cert_")

In [26]:
#Check the result
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1897 entries, 0 to 1999
Data columns (total 22 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Title              1897 non-null   object 
 1   Rating             1897 non-null   float64
 2   Year               1897 non-null   int64  
 3   Month              1897 non-null   int32  
 4   Runtime            1897 non-null   object 
 5   Directors          1897 non-null   object 
 6   Stars              1897 non-null   object 
 7   Genre              1897 non-null   object 
 8   Filming_location   1897 non-null   object 
 9   Budget             1897 non-null   object 
 10  Income             1897 non-null   object 
 11  Country_of_origin  1897 non-null   object 
 12  cert_G             1897 non-null   uint8  
 13  cert_NC-17         1897 non-null   uint8  
 14  cert_PG            1897 non-null   uint8  
 15  cert_PG-13         1897 non-null   uint8  
 16  cert_R             1897 

In [27]:
df.head(2)

Unnamed: 0,Title,Rating,Year,Month,Runtime,Directors,Stars,Genre,Filming_location,Budget,...,cert_G,cert_NC-17,cert_PG,cert_PG-13,cert_R,cert_TV-14,cert_TV-G,cert_TV-MA,cert_TV-PG,cert_TV-Y7
0,Avatar: The Way of Water,7.8,2022,12,192,James Cameron,"Sam Worthington, Zoe Saldana, Sigourney Weaver...","Action, Adventure, Fantasy",New Zealand,"$350,000,000",...,0,0,0,1,0,0,0,0,0,0
1,Guillermo del Toro's Pinocchio,7.6,2022,12,117,"Guillermo del Toro, Mark Gustafson","Ewan McGregor, David Bradley, Gregory Mann, Bu...","Animation, Drama, Family",USA,"$35,000,000",...,0,0,1,0,0,0,0,0,0,0


### Runtime

In [28]:
#Checking non numeric values
check_non_numeric_values(df, "Runtime")

No non numeric values in that column.


In [29]:
# Convert to integers
df['Runtime'] = df['Runtime'].astype(int)

In [30]:
#Check the result
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1897 entries, 0 to 1999
Data columns (total 22 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Title              1897 non-null   object 
 1   Rating             1897 non-null   float64
 2   Year               1897 non-null   int64  
 3   Month              1897 non-null   int32  
 4   Runtime            1897 non-null   int32  
 5   Directors          1897 non-null   object 
 6   Stars              1897 non-null   object 
 7   Genre              1897 non-null   object 
 8   Filming_location   1897 non-null   object 
 9   Budget             1897 non-null   object 
 10  Income             1897 non-null   object 
 11  Country_of_origin  1897 non-null   object 
 12  cert_G             1897 non-null   uint8  
 13  cert_NC-17         1897 non-null   uint8  
 14  cert_PG            1897 non-null   uint8  
 15  cert_PG-13         1897 non-null   uint8  
 16  cert_R             1897 

### Directors

In [31]:
df.head(2)

Unnamed: 0,Title,Rating,Year,Month,Runtime,Directors,Stars,Genre,Filming_location,Budget,...,cert_G,cert_NC-17,cert_PG,cert_PG-13,cert_R,cert_TV-14,cert_TV-G,cert_TV-MA,cert_TV-PG,cert_TV-Y7
0,Avatar: The Way of Water,7.8,2022,12,192,James Cameron,"Sam Worthington, Zoe Saldana, Sigourney Weaver...","Action, Adventure, Fantasy",New Zealand,"$350,000,000",...,0,0,0,1,0,0,0,0,0,0
1,Guillermo del Toro's Pinocchio,7.6,2022,12,117,"Guillermo del Toro, Mark Gustafson","Ewan McGregor, David Bradley, Gregory Mann, Bu...","Animation, Drama, Family",USA,"$35,000,000",...,0,0,1,0,0,0,0,0,0,0


#### Convert Directors to numeric with One hot encoding

In [32]:
num_directors = 2
prefix_col = "top_50_director"
df = one_hot_coding_binary(df, "Directors", prefix_col, "Name", "./data/top_50_directors.csv", num_categories=num_directors, drop_original=True)

In [33]:
#Check the result
df.head(2)

Unnamed: 0,Title,Rating,Year,Month,Runtime,Stars,Genre,Filming_location,Budget,Income,...,cert_PG,cert_PG-13,cert_R,cert_TV-14,cert_TV-G,cert_TV-MA,cert_TV-PG,cert_TV-Y7,top_50_director_1,top_50_director_2
0,Avatar: The Way of Water,7.8,2022,12,192,"Sam Worthington, Zoe Saldana, Sigourney Weaver...","Action, Adventure, Fantasy",New Zealand,"$350,000,000","$2,267,946,983",...,0,1,0,0,0,0,0,0,1,0
1,Guillermo del Toro's Pinocchio,7.6,2022,12,117,"Ewan McGregor, David Bradley, Gregory Mann, Bu...","Animation, Drama, Family",USA,"$35,000,000","$108,967",...,1,0,0,0,0,0,0,0,1,0


#### Stars

In [34]:
df.head(2)

Unnamed: 0,Title,Rating,Year,Month,Runtime,Stars,Genre,Filming_location,Budget,Income,...,cert_PG,cert_PG-13,cert_R,cert_TV-14,cert_TV-G,cert_TV-MA,cert_TV-PG,cert_TV-Y7,top_50_director_1,top_50_director_2
0,Avatar: The Way of Water,7.8,2022,12,192,"Sam Worthington, Zoe Saldana, Sigourney Weaver...","Action, Adventure, Fantasy",New Zealand,"$350,000,000","$2,267,946,983",...,0,1,0,0,0,0,0,0,1,0
1,Guillermo del Toro's Pinocchio,7.6,2022,12,117,"Ewan McGregor, David Bradley, Gregory Mann, Bu...","Animation, Drama, Family",USA,"$35,000,000","$108,967",...,1,0,0,0,0,0,0,0,1,0


#### Convert Stars to numeric with One hot encoding

In [35]:
num_stars = 4
prefix_col = "top_1000_Stars"
df = one_hot_coding_binary(df, "Stars", prefix_col, "Name", "./data/top_1000_actors.csv", num_categories=num_stars, drop_original=True)

In [36]:
#Check the result
df.head(2)

Unnamed: 0,Title,Rating,Year,Month,Runtime,Genre,Filming_location,Budget,Income,Country_of_origin,...,cert_TV-G,cert_TV-MA,cert_TV-PG,cert_TV-Y7,top_50_director_1,top_50_director_2,top_1000_Stars_1,top_1000_Stars_2,top_1000_Stars_3,top_1000_Stars_4
0,Avatar: The Way of Water,7.8,2022,12,192,"Action, Adventure, Fantasy",New Zealand,"$350,000,000","$2,267,946,983",United States,...,0,0,0,0,1,0,1,1,1,0
1,Guillermo del Toro's Pinocchio,7.6,2022,12,117,"Animation, Drama, Family",USA,"$35,000,000","$108,967","United States, Mexico, France",...,0,0,0,0,1,0,1,0,0,0


In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1897 entries, 0 to 1999
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Title              1897 non-null   object 
 1   Rating             1897 non-null   float64
 2   Year               1897 non-null   int64  
 3   Month              1897 non-null   int32  
 4   Runtime            1897 non-null   int32  
 5   Genre              1897 non-null   object 
 6   Filming_location   1897 non-null   object 
 7   Budget             1897 non-null   object 
 8   Income             1897 non-null   object 
 9   Country_of_origin  1897 non-null   object 
 10  cert_G             1897 non-null   uint8  
 11  cert_NC-17         1897 non-null   uint8  
 12  cert_PG            1897 non-null   uint8  
 13  cert_PG-13         1897 non-null   uint8  
 14  cert_R             1897 non-null   uint8  
 15  cert_TV-14         1897 non-null   uint8  
 16  cert_TV-G          1897 

### Genre

#### Convert Genre to numeric with One hot encoding

In [38]:
df = one_hot_encoding_column(df, "Genre", prefix = "genre_")

In [39]:
#Check result
df.head(2)

Unnamed: 0,Title,Rating,Year,Month,Runtime,Filming_location,Budget,Income,Country_of_origin,cert_G,...,genre_Horror,genre_Music,genre_Musical,genre_Mystery,genre_Romance,genre_Sci-Fi,genre_Sport,genre_Thriller,genre_War,genre_Western
0,Avatar: The Way of Water,7.8,2022,12,192,New Zealand,"$350,000,000","$2,267,946,983",United States,0,...,0,0,0,0,0,0,0,0,0,0
1,Guillermo del Toro's Pinocchio,7.6,2022,12,117,USA,"$35,000,000","$108,967","United States, Mexico, France",0,...,0,0,0,0,0,0,0,0,0,0


In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1897 entries, 0 to 1999
Data columns (total 45 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Title              1897 non-null   object 
 1   Rating             1897 non-null   float64
 2   Year               1897 non-null   int64  
 3   Month              1897 non-null   int32  
 4   Runtime            1897 non-null   int32  
 5   Filming_location   1897 non-null   object 
 6   Budget             1897 non-null   object 
 7   Income             1897 non-null   object 
 8   Country_of_origin  1897 non-null   object 
 9   cert_G             1897 non-null   uint8  
 10  cert_NC-17         1897 non-null   uint8  
 11  cert_PG            1897 non-null   uint8  
 12  cert_PG-13         1897 non-null   uint8  
 13  cert_R             1897 non-null   uint8  
 14  cert_TV-14         1897 non-null   uint8  
 15  cert_TV-G          1897 non-null   uint8  
 16  cert_TV-MA         1897 

### Filming location

In [41]:
df.head(2)

Unnamed: 0,Title,Rating,Year,Month,Runtime,Filming_location,Budget,Income,Country_of_origin,cert_G,...,genre_Horror,genre_Music,genre_Musical,genre_Mystery,genre_Romance,genre_Sci-Fi,genre_Sport,genre_Thriller,genre_War,genre_Western
0,Avatar: The Way of Water,7.8,2022,12,192,New Zealand,"$350,000,000","$2,267,946,983",United States,0,...,0,0,0,0,0,0,0,0,0,0
1,Guillermo del Toro's Pinocchio,7.6,2022,12,117,USA,"$35,000,000","$108,967","United States, Mexico, France",0,...,0,0,0,0,0,0,0,0,0,0


In [42]:
#### Unique values
unique_values = df['Filming_location'].unique()
print(unique_values)
unique_values_count = df['Filming_location'].value_counts()
print(unique_values_count)

['New Zealand' 'USA' 'Japan' 'Ireland' 'Unknown' 'Canada' 'UK' 'Norway'
 'Greece' 'Mexico' 'Spain' 'Italy' 'Bulgaria' 'Australia' 'Iceland'
 'Belgium' 'Germany' 'Turkey' 'South Africa' 'France' 'Egypt' 'Malta'
 'Indonesia' 'Jordan' 'Finland' 'Panama' 'Georgia' 'Dominican Republic'
 'Romania' 'Morocco' 'Saudi Arabia' 'Taiwan' 'Vietnam' 'United Kingdom'
 'Poland' 'Estonia' 'Netherlands' 'Bangladesh' 'Czech Republic' 'Guam'
 'Fiji' 'South Korea' 'Hungary' 'Serbia' 'Colombia' 'Argentina' 'Slovakia'
 'Singapore' 'Croatia' 'Denmark' 'Lebanon' 'Sweden' 'Bolivia'
 'The Netherlands' 'United Arab Emirates' 'Polynesia' 'Nepal'
 'Puerto Rico' 'Gabon' 'China' 'India' 'Namibia' 'Chile' 'Venezuela'
 'Switzerland' 'Thailand' 'Russia' 'Costa Rica' 'Austria' 'Norge' 'Space'
 'Peru' 'Michigan USA' 'Brazil' 'Iran' 'Monaco' 'Guatemala'
 'Official Facebook' 'French Polynesia' 'Vatican City' 'Luxembourg'
 'Slovenia' 'Israel' 'Lithuania' 'Ontario' 'Malaysia' 'Uruguay'
 'St Vincent and the Grenadines' 'Uganda'

#### Dropping column

In [43]:
df = df.drop('Filming_location', axis=1)

In [44]:
#Check the result
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1897 entries, 0 to 1999
Data columns (total 44 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Title              1897 non-null   object 
 1   Rating             1897 non-null   float64
 2   Year               1897 non-null   int64  
 3   Month              1897 non-null   int32  
 4   Runtime            1897 non-null   int32  
 5   Budget             1897 non-null   object 
 6   Income             1897 non-null   object 
 7   Country_of_origin  1897 non-null   object 
 8   cert_G             1897 non-null   uint8  
 9   cert_NC-17         1897 non-null   uint8  
 10  cert_PG            1897 non-null   uint8  
 11  cert_PG-13         1897 non-null   uint8  
 12  cert_R             1897 non-null   uint8  
 13  cert_TV-14         1897 non-null   uint8  
 14  cert_TV-G          1897 non-null   uint8  
 15  cert_TV-MA         1897 non-null   uint8  
 16  cert_TV-PG         1897 

### Country of Origin

In [45]:
df.head(2)

Unnamed: 0,Title,Rating,Year,Month,Runtime,Budget,Income,Country_of_origin,cert_G,cert_NC-17,...,genre_Horror,genre_Music,genre_Musical,genre_Mystery,genre_Romance,genre_Sci-Fi,genre_Sport,genre_Thriller,genre_War,genre_Western
0,Avatar: The Way of Water,7.8,2022,12,192,"$350,000,000","$2,267,946,983",United States,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Guillermo del Toro's Pinocchio,7.6,2022,12,117,"$35,000,000","$108,967","United States, Mexico, France",0,0,...,0,0,0,0,0,0,0,0,0,0


In [46]:
#### Unique values
unique_values = df['Country_of_origin'].unique()
print(unique_values)
unique_values_count = df['Country_of_origin'].value_counts()
print(unique_values_count)

['United States' 'United States, Mexico, France' 'Japan, United States'
 'Ireland, United Kingdom, United States' 'United States, Japan'
 'United States, Canada' 'Italy, United States'
 'United States, Canada, New Zealand, Hungary'
 'United Kingdom, United States' 'United States, Mexico, Hungary, Canada'
 'Norway'
 'Sweden, France, United Kingdom, Germany, Turkey, Greece, United States, Denmark, Switzerland, Mexico'
 'Spain' 'United States, United Kingdom'
 'United States, China, United Kingdom'
 'Germany, United States, United Kingdom' 'United States, Canada, Japan'
 'Australia, United States' 'Australia' 'United States, Australia'
 'United States, Czech Republic' 'Mexico' 'Spain, United States'
 'United States, China, Malta' 'Indonesia' 'Jordan, Saudi Arabia, Sweden'
 'United Kingdom, France' 'United Kingdom, China'
 'United States, Japan, China' 'United States, Mexico, Canada'
 'Norway, France, Sweden, Denmark' 'Sweden, Netherlands, France'
 'France, Canada, United States' 'United K

In [47]:
# split the values based on the comma and space separator, and explode them into individual rows
unique_country = df['Country_of_origin'].str.split(', ', expand=True).stack()
counts = unique_country.value_counts()
print(counts)

United States     1749
United Kingdom     432
Canada             206
Germany            169
France             167
                  ... 
Cyprus               1
Lebanon              1
Tunisia              1
Portugal             1
Kenya                1
Length: 63, dtype: int64


#### Converting Country_of_origin to Continent_of_origin

In [48]:
# Create a dictionary to map countries to continents
country_to_continent = {}
for index, row in df_continents.iterrows():
    country_to_continent[row["Country"]] = row["Continent"]

# Define a function to apply the mapping to each row in the column
def countries_to_continents(countries):
    continents = set()
    for country in countries.split(", "):
        if country in country_to_continent:
            continents.add(country_to_continent[country])
        else:
            continents.add("Unknown")
    return ", ".join(sorted(list(continents)))

# Apply the function to the column
df["Country_of_continent"] = df["Country_of_origin"].apply(countries_to_continents)

In [49]:
#Check result
df.head(2)

Unnamed: 0,Title,Rating,Year,Month,Runtime,Budget,Income,Country_of_origin,cert_G,cert_NC-17,...,genre_Music,genre_Musical,genre_Mystery,genre_Romance,genre_Sci-Fi,genre_Sport,genre_Thriller,genre_War,genre_Western,Country_of_continent
0,Avatar: The Way of Water,7.8,2022,12,192,"$350,000,000","$2,267,946,983",United States,0,0,...,0,0,0,0,0,0,0,0,0,North America
1,Guillermo del Toro's Pinocchio,7.6,2022,12,117,"$35,000,000","$108,967","United States, Mexico, France",0,0,...,0,0,0,0,0,0,0,0,0,"Europe, North America"


In [50]:
#Check for Unknows value
value_counts = df["Country_of_continent"].value_counts()

# Count the number of times "unknown" appears in the Country_of_origin column
unknown_count = df["Country_of_continent"].apply(lambda x: "Unknown" in x).sum()
unknown_rows = df[df["Country_of_continent"].str.contains("Unknown", regex=False)]

# Print the result
print(f"The number of times 'unknown' appears in Country_of_continent is: {unknown_count}")
unknown_rows.head()

The number of times 'unknown' appears in Country_of_continent is: 0


Unnamed: 0,Title,Rating,Year,Month,Runtime,Budget,Income,Country_of_origin,cert_G,cert_NC-17,...,genre_Music,genre_Musical,genre_Mystery,genre_Romance,genre_Sci-Fi,genre_Sport,genre_Thriller,genre_War,genre_Western,Country_of_continent


#### Convert Contry of continent to numeric with one hot encoding

In [51]:
df = one_hot_encoding_column(df, "Country_of_continent", prefix = "contient_")

In [52]:
#Check result
df.head(2)

Unnamed: 0,Title,Rating,Year,Month,Runtime,Budget,Income,Country_of_origin,cert_G,cert_NC-17,...,genre_Sport,genre_Thriller,genre_War,genre_Western,contient_Africa,contient_Asia,contient_Europe,contient_North America,contient_Oceania,contient_South America
0,Avatar: The Way of Water,7.8,2022,12,192,"$350,000,000","$2,267,946,983",United States,0,0,...,0,0,0,0,0,0,0,1,0,0
1,Guillermo del Toro's Pinocchio,7.6,2022,12,117,"$35,000,000","$108,967","United States, Mexico, France",0,0,...,0,0,0,0,0,0,1,1,0,0


In [53]:
# Drop original column
df = df.drop('Country_of_origin', axis=1)

In [54]:
#Check the result
df.head(2)

Unnamed: 0,Title,Rating,Year,Month,Runtime,Budget,Income,cert_G,cert_NC-17,cert_PG,...,genre_Sport,genre_Thriller,genre_War,genre_Western,contient_Africa,contient_Asia,contient_Europe,contient_North America,contient_Oceania,contient_South America
0,Avatar: The Way of Water,7.8,2022,12,192,"$350,000,000","$2,267,946,983",0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,Guillermo del Toro's Pinocchio,7.6,2022,12,117,"$35,000,000","$108,967",0,0,1,...,0,0,0,0,0,0,1,1,0,0


In [55]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1897 entries, 0 to 1999
Data columns (total 49 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Title                   1897 non-null   object 
 1   Rating                  1897 non-null   float64
 2   Year                    1897 non-null   int64  
 3   Month                   1897 non-null   int32  
 4   Runtime                 1897 non-null   int32  
 5   Budget                  1897 non-null   object 
 6   Income                  1897 non-null   object 
 7   cert_G                  1897 non-null   uint8  
 8   cert_NC-17              1897 non-null   uint8  
 9   cert_PG                 1897 non-null   uint8  
 10  cert_PG-13              1897 non-null   uint8  
 11  cert_R                  1897 non-null   uint8  
 12  cert_TV-14              1897 non-null   uint8  
 13  cert_TV-G               1897 non-null   uint8  
 14  cert_TV-MA              1897 non-null   

### Income / Budget

In [56]:
df.head(2)

Unnamed: 0,Title,Rating,Year,Month,Runtime,Budget,Income,cert_G,cert_NC-17,cert_PG,...,genre_Sport,genre_Thriller,genre_War,genre_Western,contient_Africa,contient_Asia,contient_Europe,contient_North America,contient_Oceania,contient_South America
0,Avatar: The Way of Water,7.8,2022,12,192,"$350,000,000","$2,267,946,983",0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,Guillermo del Toro's Pinocchio,7.6,2022,12,117,"$35,000,000","$108,967",0,0,1,...,0,0,0,0,0,0,1,1,0,0


#### Convert to USD and strip of non numeric characters

In [57]:
df['Budget'] = df['Budget'].apply(convert_to_usd)
df['Income'] = df['Income'].apply(convert_to_usd)

In [58]:
#Check result
df.head(2)

Unnamed: 0,Title,Rating,Year,Month,Runtime,Budget,Income,cert_G,cert_NC-17,cert_PG,...,genre_Sport,genre_Thriller,genre_War,genre_Western,contient_Africa,contient_Asia,contient_Europe,contient_North America,contient_Oceania,contient_South America
0,Avatar: The Way of Water,7.8,2022,12,192,350000000.0,2267947000.0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,Guillermo del Toro's Pinocchio,7.6,2022,12,117,35000000.0,108967.0,0,0,1,...,0,0,0,0,0,0,1,1,0,0


#### Calculation Inflation

In [59]:
df = adjust_for_inflation(df, "Budget", "Year", "Budget_inf", drop_original=True)
df = adjust_for_inflation(df, "Income", "Year", "Income_inf", drop_original=True)

In [60]:
#Check result
df.head(2)

Unnamed: 0,Title,Rating,Year,Month,Runtime,cert_G,cert_NC-17,cert_PG,cert_PG-13,cert_R,...,genre_War,genre_Western,contient_Africa,contient_Asia,contient_Europe,contient_North America,contient_Oceania,contient_South America,Budget_inf,Income_inf
0,Avatar: The Way of Water,7.8,2022,12,192,0,0,0,1,0,...,0,0,0,0,0,1,0,0,350000000,2267946983
1,Guillermo del Toro's Pinocchio,7.6,2022,12,117,0,0,1,0,0,...,0,0,0,0,1,1,0,0,35000000,108967


#### Missing Values

In [61]:
#Since the missing values are 0 instead of NaN at this point (and 0 would also be treated like missing value )

# count the number of occurrences of 0 in col1
count_col1 = (df['Budget_inf'] == 0).sum()

# count the number of occurrences of 0 in col2
count_col2 = (df['Income_inf'] == 0).sum()

# count the number of occurrences of 0 in both col1 and col2
count_both = ((df['Budget_inf'] == 0) & (df['Income_inf'] == 0)).sum()

# print the results
print('Number of zeros in col1:', count_col1)
print('Number of zeros in col2:', count_col2)
print('Number of zeros in both col1 and col2:', count_both)

Number of zeros in col1: 258
Number of zeros in col2: 118
Number of zeros in both col1 and col2: 87


In [62]:
#Dropping rows with missing values in both columns
df = df[(df['Budget_inf'] != 0) | (df['Income_inf'] != 0)]

In [63]:
#Check result
count_both = ((df['Budget_inf'] == 0) & (df['Income_inf'] == 0)).sum()
print('Number of zeros in both col1 and col2:', count_both)

Number of zeros in both col1 and col2: 0


#### Create Profit column

In [64]:
df['Profit_inf'] = df['Income_inf'] - df['Budget_inf']
mask = (df['Income_inf'] == 0) | (df['Budget_inf'] == 0)
df.loc[mask, 'Profit_inf'] = 0

In [65]:
#Check result
df.head(2)

Unnamed: 0,Title,Rating,Year,Month,Runtime,cert_G,cert_NC-17,cert_PG,cert_PG-13,cert_R,...,genre_Western,contient_Africa,contient_Asia,contient_Europe,contient_North America,contient_Oceania,contient_South America,Budget_inf,Income_inf,Profit_inf
0,Avatar: The Way of Water,7.8,2022,12,192,0,0,0,1,0,...,0,0,0,0,1,0,0,350000000,2267946983,1917946983
1,Guillermo del Toro's Pinocchio,7.6,2022,12,117,0,0,1,0,0,...,0,0,0,1,1,0,0,35000000,108967,-34891033


#### Create ROI column

In [66]:
df['ROI_inf'] = (df['Income_inf'] - df['Budget_inf']) / df['Income_inf']
mask = (df['Income_inf'] == 0) | (df['Budget_inf'] == 0)
df.loc[mask, 'ROI_inf'] = 0

In [67]:
#calc mean & median roi of the rows w/ values in both
mask = df['ROI_inf'] != 0
df_filtered = df[mask]
mean_roi = df_filtered['ROI_inf'].mean()
median_roi = df_filtered['ROI_inf'].median()

In [68]:
print("Mean_roi: " + str(mean_roi))
print("Median_roi: " + str(median_roi))

Mean_roi: -5.735946163355476
Median_roi: 0.6490712151065096


#### Replace missing values with mean ROI

In [69]:
df['ROI_inf'] = df['ROI_inf'].replace(0, median_roi)

In [70]:
#Check result
df.head(2)

Unnamed: 0,Title,Rating,Year,Month,Runtime,cert_G,cert_NC-17,cert_PG,cert_PG-13,cert_R,...,contient_Africa,contient_Asia,contient_Europe,contient_North America,contient_Oceania,contient_South America,Budget_inf,Income_inf,Profit_inf,ROI_inf
0,Avatar: The Way of Water,7.8,2022,12,192,0,0,0,1,0,...,0,0,0,1,0,0,350000000,2267946983,1917946983,0.845675
1,Guillermo del Toro's Pinocchio,7.6,2022,12,117,0,0,1,0,0,...,0,0,1,1,0,0,35000000,108967,-34891033,-320.198161


#### Replace missing values in Budget and Income

In [71]:
df.loc[df['Income_inf'].isna() | (df['Income_inf'] == 0), 'Income_inf'] = df['Budget_inf'] * (1 + median_roi)
df.loc[df['Budget_inf'].isna() | (df['Budget_inf'] == 0), 'Budget_inf'] = df['Income_inf'] / (1 + median_roi)

In [72]:
#Check result
df.head()

Unnamed: 0,Title,Rating,Year,Month,Runtime,cert_G,cert_NC-17,cert_PG,cert_PG-13,cert_R,...,contient_Africa,contient_Asia,contient_Europe,contient_North America,contient_Oceania,contient_South America,Budget_inf,Income_inf,Profit_inf,ROI_inf
0,Avatar: The Way of Water,7.8,2022,12,192,0,0,0,1,0,...,0,0,0,1,0,0,350000000.0,2267947000.0,1917946983,0.845675
1,Guillermo del Toro's Pinocchio,7.6,2022,12,117,0,0,1,0,0,...,0,0,1,1,0,0,35000000.0,108967.0,-34891033,-320.198161
2,Bullet Train,7.3,2022,8,127,0,0,0,0,1,...,0,1,0,1,0,0,85900000.0,239268600.0,153368602,0.640989
3,The Banshees of Inisherin,7.8,2022,11,114,0,0,0,0,1,...,0,0,1,1,0,0,11958750.0,19720820.0,0,0.649071
4,M3gan,6.4,2022,12,102,0,0,0,1,0,...,0,0,0,1,0,0,12000000.0,171253900.0,159253910,0.929929


In [73]:
#### Update Profit
df['Profit_inf'] = df['Income_inf'] - df['Budget_inf']

In [74]:
#Check result
df.head()

Unnamed: 0,Title,Rating,Year,Month,Runtime,cert_G,cert_NC-17,cert_PG,cert_PG-13,cert_R,...,contient_Africa,contient_Asia,contient_Europe,contient_North America,contient_Oceania,contient_South America,Budget_inf,Income_inf,Profit_inf,ROI_inf
0,Avatar: The Way of Water,7.8,2022,12,192,0,0,0,1,0,...,0,0,0,1,0,0,350000000.0,2267947000.0,1917947000.0,0.845675
1,Guillermo del Toro's Pinocchio,7.6,2022,12,117,0,0,1,0,0,...,0,0,1,1,0,0,35000000.0,108967.0,-34891030.0,-320.198161
2,Bullet Train,7.3,2022,8,127,0,0,0,0,1,...,0,1,0,1,0,0,85900000.0,239268600.0,153368600.0,0.640989
3,The Banshees of Inisherin,7.8,2022,11,114,0,0,0,0,1,...,0,0,1,1,0,0,11958750.0,19720820.0,7762077.0,0.649071
4,M3gan,6.4,2022,12,102,0,0,0,1,0,...,0,0,0,1,0,0,12000000.0,171253900.0,159253900.0,0.929929


#### Check df after dataprocessing

In [75]:
#Remove Title
df.drop('Title', axis=1, inplace=True)

In [76]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1810 entries, 0 to 1999
Data columns (total 50 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Rating                  1810 non-null   float64
 1   Year                    1810 non-null   int64  
 2   Month                   1810 non-null   int32  
 3   Runtime                 1810 non-null   int32  
 4   cert_G                  1810 non-null   uint8  
 5   cert_NC-17              1810 non-null   uint8  
 6   cert_PG                 1810 non-null   uint8  
 7   cert_PG-13              1810 non-null   uint8  
 8   cert_R                  1810 non-null   uint8  
 9   cert_TV-14              1810 non-null   uint8  
 10  cert_TV-G               1810 non-null   uint8  
 11  cert_TV-MA              1810 non-null   uint8  
 12  cert_TV-PG              1810 non-null   uint8  
 13  cert_TV-Y7              1810 non-null   uint8  
 14  top_50_director_1       1810 non-null   

In [77]:
#df.to_csv('01_dataprocessing_noTitle.csv', index=False)