### Importing Dependencies

In [None]:
import pandas as pd

### Loading Dataset

In [None]:
df = pd.read_csv("../data/movies.csv")
df_continents = pd.read_csv("../data/continents.csv")

### Functions

#### 1.1 --- check_non_numeric_values

In [None]:
def check_non_numeric_values(df, column):
    """Print the values of ``df[column]`` that cannot be parsed as numbers.

    Nothing is returned. If every value parses, a short message is printed;
    otherwise a table of the offending values and how often each one occurs
    is printed.

    df: the DataFrame to inspect.
    column: name of the column to check.
    """
    series = df[column]

    # Entries that fail numeric parsing are coerced to NaN, which marks them
    unparsable = pd.to_numeric(series, errors='coerce').isna()

    # Frequency of each offending value (most frequent first)
    offenders = series[unparsable].value_counts()

    if offenders.empty:
        print("No non numeric values in that column.")
        return

    # Two-column report: the raw value and the number of occurrences
    report = pd.DataFrame({'Non-Numeric Value': offenders.index,
                           'Count': offenders.values})
    print(report)

#### 2.1 --- convert_to_usd

In [None]:
def convert_to_usd(amount):
    """Convert a currency-prefixed amount string to a US-dollar float.

    The string must start with one of the supported currency markers
    (``$``, ``€``, ``¥``, ``₹``, ``SEK``, ``DKK``, ``£``). Whitespace
    (including non-breaking spaces) and thousands-separator commas are
    removed before the number is parsed, and the result is multiplied by a
    fixed, hard-coded exchange rate.

    amount: the raw amount, e.g. ``"$1,000,000"`` or ``"SEK 5 000 000"``.
    returns: the amount in USD as a float, or ``None`` when ``amount`` is not
        a string (e.g. NaN/None) or does not start with a supported marker.
    raises: ValueError if the text after the currency marker is not a number.
    """
    # Fixed exchange rates to USD, keyed by the currency marker
    rates_to_usd = {
        '$': 1.0,      # USD
        '€': 1.06,     # EUR
        '¥': 0.0075,   # JPY
        '₹': 0.012,    # INR
        'SEK': 0.094,  # SEK
        'DKK': 0.14,   # DKK
        '£': 1.21,     # GBP
    }

    # Missing values arrive as NaN/None rather than text
    if not isinstance(amount, str):
        return None

    # Strings are immutable, so the cleaned text has to be kept in a variable.
    # split() with no argument also splits on non-breaking spaces.
    cleaned = ''.join(amount.split()).replace(',', '')

    for marker, rate in rates_to_usd.items():
        if cleaned.startswith(marker):
            # Slice off exactly the marker instead of stripping a character set
            return float(cleaned[len(marker):]) * rate

    return None

#### 2.2 --- adjust_for_inflation

In [None]:
def adjust_for_inflation(df, column_name, year_column, new_column, drop_original=True):
    """Compound a monetary column forward through the built-in inflation table.

    Every non-missing value in ``column_name`` is multiplied by the product of
    ``1 + rate / 100`` for each year from the row's ``year_column`` value up to
    and including 2021 (the row's own year is part of the product), rounded to
    two decimals and written to ``new_column``. Years after 2021 have no rates
    to apply, so the value is only rounded.

    Rows whose value in ``column_name`` is missing get ``0.0`` in
    ``new_column``; they are not left as NaN.

    df: the DataFrame to process. ``new_column`` is added to this object in
        place, exactly as before.
    column_name: name of the column holding the amounts to adjust.
    year_column: name of the column holding the (integer) year of each row.
    new_column: name of the column that receives the adjusted amounts.
    drop_original: when True, the returned DataFrame no longer contains
        ``column_name``.
    returns: the DataFrame with the adjusted column (a new object when
        ``drop_original`` is True, otherwise ``df`` itself).
    raises: KeyError if a row with a value has a year earlier than 1990.
    """
    # Annual inflation rates in percent, keyed by year
    annual_rates = {
        1990: 5.398,
        1991: 4.235,
        1992: 3.0288,
        1993: 2.9517,
        1994: 2.6074,
        1995: 2.8054,
        1996: 2.9312,
        1997: 2.3377,
        1998: 1.5523,
        1999: 2.188,
        2000: 3.3769,
        2001: 2.8262,
        2002: 1.586,
        2003: 2.2701,
        2004: 2.6772,
        2005: 3.3927,
        2006: 3.2259,
        2007: 2.8527,
        2008: 3.8391,
        2009: -0.3555,
        2010: 1.64,
        2011: 3.1568,
        2012: 2.0693,
        2013: 1.4648,
        2014: 1.6222,
        2015: 0.1186,
        2016: 1.2616,
        2017: 2.1301,
        2018: 2.4426,
        2019: 1.8122,
        2020: 1.2336,
        2021: 4.6979,
    }
    end_year = max(annual_rates) + 1

    # The factor depends only on the year, so compute it once per distinct year
    factor_cache = {}

    def factor_from(year):
        if year not in factor_cache:
            factor = 1
            for yr in range(year, end_year):
                factor *= 1 + (annual_rates[yr] / 100)
            factor_cache[year] = factor
        return factor_cache[year]

    # Start from a float column so the adjusted (float) values never have to be
    # squeezed into an integer column.
    df[new_column] = 0.0

    # Only rows that actually carry a value are adjusted; the rest stay at 0.0
    has_value = df[column_name].notna()
    amounts = df.loc[has_value, column_name].to_numpy(dtype=float)
    factors = df.loc[has_value, year_column].map(factor_from).to_numpy(dtype=float)
    df.loc[has_value, new_column] = (amounts * factors).round(2)

    # Drop the original column if specified
    if drop_original:
        df = df.drop(columns=[column_name])

    return df


#### 3.1 --- one_hot_encoding_column

In [None]:
def one_hot_encoding_column(dataset, column, separator=", ", prefix=""):
    """
    Performs one-hot encoding on a column whose cells hold separated category lists.

    Each cell of ``column`` is split on ``separator``; every distinct category
    becomes a 0/1 indicator column. A category that appears more than once in
    the same cell still yields 1, and rows without any category (missing
    value) get 0 in every indicator column.

    dataset: The dataset to be processed. It is not modified.
    column: The name of the column to be one-hot encoded.
    separator: The separator used in the values of the specified column. Defaults to ", ".
    prefix: Optional string to be added in front of each new column name. Defaults to "".
    returns: a new dataset with the indicator columns appended and ``column`` removed.
    """

    # 1. One entry per (row, category) pair; the row label is index level 0
    categories = dataset[column].str.split(separator, expand=True).stack()

    # 2. Indicator per entry, then collapse back to one line per row.
    #    max() (rather than sum()) keeps the result strictly 0/1.
    indicators = pd.get_dummies(categories, dtype=int).groupby(level=0).max()

    # 3. Adding the prefix to the column names
    if prefix:
        indicators.columns = [prefix + str(col) for col in indicators.columns]

    # 4. Rows that produced no category get an explicit all-zero line
    indicators = indicators.reindex(dataset.index.unique(), fill_value=0)

    # 5. Merging the indicators with the main dataset and dropping the source column
    dataset_processed = pd.merge(dataset, indicators, left_index=True, right_index=True, how='left')
    return dataset_processed.drop(columns=[column])

#### 3.2 --- one_hot_coding_binary

In [None]:
def one_hot_coding_binary(dataset, original_column, prefix, file_column, file_location, separator=", ", num_categories=1, drop_original=True):
    """
    Flags whether the first entries of a separated list column appear in a reference list.

    Each cell of ``original_column`` is split on ``separator``. For each of the
    first ``num_categories`` positions a 0/1 column is added that is 1 when the
    entry at that position occurs in column ``file_column`` of the CSV file at
    ``file_location``. Positions a row does not have yield 0.

    The new columns are named ``prefix`` when ``num_categories`` is 1 and
    ``{prefix}_1`` ... ``{prefix}_{num_categories}`` otherwise.

    dataset: The dataset to be processed. It is modified in place and returned.
    original_column: The name of the column holding the separated entries.
    prefix: Base name for the new indicator columns.
    file_column: Column of the reference CSV that holds the known entries.
    file_location: Path of the reference CSV file.
    separator: The separator used in the values of ``original_column``. Defaults to ", ".
    num_categories: How many leading entries to encode (1 to 4). Defaults to 1.
    drop_original: When True, ``original_column`` is removed from the dataset.
    returns: the same dataset object with the indicator columns added.
    raises: ValueError if ``num_categories`` is not between 1 and 4.
    """
    if num_categories not in range(1, 5):
        raise ValueError("num_categories must be between 1 and 4")

    reference = pd.read_csv(file_location)[file_column]

    # Split once; column k of the result holds the (k+1)-th entry of every row
    entries = dataset[original_column].str.split(separator, expand=True)

    for position in range(num_categories):
        if num_categories == 1:
            target = f"{prefix}"
        else:
            target = f"{prefix}_{position + 1}"

        if position in entries.columns:
            found = entries[position].isin(reference)
        else:
            # No row has this many entries, so nothing can match
            found = pd.Series(False, index=dataset.index)

        dataset[target] = found.astype(int)

    if drop_original:
        dataset.drop(columns=[original_column], inplace=True)

    return dataset


### Dataset

In [None]:
df.info()

In [None]:
df.head(2)

### Title

#### Duplicates

In [None]:
duplicates = df[df.duplicated(['Title'], keep=False)].sort_values(by=['Title'])

In [None]:
print("Duplicates: " + str(len(duplicates)))

In [None]:
duplicates

### Rating

#### Missing values

In [None]:
#dropping all rows that have missing values in the column Rating
df = df.dropna(subset=['Rating'])

In [None]:
#A quick check to see that the row was removed from the dataset
df.info()

### Year

#### Outliers

In [None]:
# Count how many Year values fall between 2002 and 2023 (inclusive)
# NOTE(review): `count` is computed but never displayed in this cell
column_values = df['Year']
count = column_values.between(2002, 2023).sum()

### Month

#### Unique values

In [None]:
# Check all unique values in the column Month
column_values = df['Month'].unique()
print(column_values)

In [None]:
# Count how often the stray year strings '2014' and '2008' occur in the Month column.
# A boolean comparison is used instead of value_counts()[...] so that the cell
# prints 0 rather than raising KeyError when a value is absent
# (for example when the cell is re-run after the rows were dropped).
count_2014 = (df['Month'] == '2014').sum()
print("2014: " + str(count_2014))
count_2008 = (df['Month'] == '2008').sum()
print("2008: " + str(count_2008))

In [None]:
# Remove the rows whose Month entry is one of the stray year strings
year_like = df['Month'].isin(['2014', '2008'])
df = df.drop(index=df.index[year_like])

In [None]:
#checking the rows have been dropped
df.info()
column_values = df['Month'].unique()
print(column_values)

#### Converting non-numeric to numeric values

In [None]:
# Month names in calendar order; the position (counted from 1) is the month number
month_names = ['January', 'February', 'March', 'April', 'May', 'June',
               'July', 'August', 'September', 'October', 'November', 'December']
month_to_int = {name: number for number, name in enumerate(month_names, start=1)}

# Translate each month name into its number
df['Month'] = df['Month'].map(month_to_int)

# Store the month numbers as integers
df['Month'] = df['Month'].astype(int)

In [None]:
# Check the new column
column_values = df['Month'].unique()
column_values.sort()
print("Unique values: " + str(column_values))
df.info()

### Certificate

#### Unique values

In [None]:
unique_values = df['Certificate'].unique()
print(unique_values)
unique_values_count = df['Certificate'].value_counts()
print(unique_values_count)

In [None]:
# Keep only rows with a real certificate: drop missing values as well as
# the 'Not Rated' and 'Unrated' entries
has_certificate = df['Certificate'].notna()
is_unrated = df['Certificate'].isin(['Not Rated', 'Unrated'])
df = df[has_certificate & ~is_unrated]

In [None]:
# Check that the rows have been dropped
unique_values = df['Certificate'].unique()
print(unique_values)
unique_values_count = df['Certificate'].value_counts()
print(unique_values_count)
df.info()

#### Convert Certificate to numeric with One hot encoding

In [None]:
df = one_hot_encoding_column(df, "Certificate", prefix="cert_")

In [None]:
#Check the result
df.info()

In [None]:
df.head(2)

### Runtime

In [None]:
#Checking non numeric values
check_non_numeric_values(df, "Runtime")

In [None]:
# Convert to integers
df['Runtime'] = df['Runtime'].astype(int)

In [None]:
#Check the result
df.info()

### Directors

In [None]:
df.head(2)

#### Convert Directors to numeric with One hot encoding

In [None]:
num_directors = 2
prefix_col = "top_50_director"
df = one_hot_coding_binary(df, "Directors", prefix_col, "Name", "../data/top_50_directors.csv", num_categories=num_directors, drop_original=True)

In [None]:
#Check the result
df.head(2)

#### Stars

In [None]:
df.head(2)

#### Convert Stars to numeric with One hot encoding

In [None]:
num_stars = 4
prefix_col = "top_1000_Stars"
df = one_hot_coding_binary(df, "Stars", prefix_col, "Name", "../data/top_1000_actors.csv", num_categories=num_stars, drop_original=True)

In [None]:
#Check the result
df.head(2)

In [None]:
df.info()

### Genre

#### Convert Genre to numeric with One hot encoding

In [None]:
df = one_hot_encoding_column(df, "Genre", prefix = "genre_")

In [None]:
#Check result
df.head(2)

In [None]:
df.info()

### Filming location

In [None]:
df.head(2)

In [None]:
#### Unique values
unique_values = df['Filming_location'].unique()
print(unique_values)
unique_values_count = df['Filming_location'].value_counts()
print(unique_values_count)

#### Dropping column

In [None]:
df = df.drop('Filming_location', axis=1)

In [None]:
#Check the result
df.info()

### Country of Origin

In [None]:
df.head(2)

In [None]:
#### Unique values
unique_values = df['Country_of_origin'].unique()
print(unique_values)
unique_values_count = df['Country_of_origin'].value_counts()
print(unique_values_count)

In [None]:
# split the values based on the comma and space separator, and explode them into individual rows
unique_country = df['Country_of_origin'].str.split(', ', expand=True).stack()
counts = unique_country.value_counts()
print(counts)

#### Converting Country_of_origin to Continent_of_origin

In [None]:
# Lookup table built from the continents file: country name -> continent name
country_to_continent = dict(zip(df_continents["Country"], df_continents["Continent"]))


def countries_to_continents(countries):
    """Translate a ", "-separated country list into a sorted, de-duplicated continent list.

    Countries missing from the lookup table are reported as "Unknown".
    """
    continents = {
        country_to_continent.get(country, "Unknown")
        for country in countries.split(", ")
    }
    return ", ".join(sorted(continents))


# Derive the continent list for every row
df["Country_of_continent"] = df["Country_of_origin"].apply(countries_to_continents)

In [None]:
#Check result
df.head(2)

In [None]:
# Check for "Unknown" values
# NOTE(review): value_counts is assigned here but not used in this cell
value_counts = df["Country_of_continent"].value_counts()

# Count the rows whose Country_of_continent value contains "Unknown",
# and collect those rows for inspection
unknown_count = df["Country_of_continent"].apply(lambda x: "Unknown" in x).sum()
unknown_rows = df[df["Country_of_continent"].str.contains("Unknown", regex=False)]

# Print the count and show the first affected rows
print(f"The number of times 'unknown' appears in Country_of_continent is: {unknown_count}")
unknown_rows.head()

#### Convert Country_of_continent to numeric with one-hot encoding

In [None]:
df = one_hot_encoding_column(df, "Country_of_continent", prefix = "contient_")

In [None]:
#Check result
df.head(2)

In [None]:
# Drop original column
df = df.drop('Country_of_origin', axis=1)

In [None]:
#Check the result
df.head(2)

In [None]:
df.info()

### Income / Budget

In [None]:
df.head(2)

#### Convert to USD and strip non-numeric characters

In [None]:
df['Budget'] = df['Budget'].apply(convert_to_usd)
df['Income'] = df['Income'].apply(convert_to_usd)

In [None]:
#Check result
df.head(2)

#### Adjusting for inflation

In [None]:
df = adjust_for_inflation(df, "Budget", "Year", "Budget_inf", drop_original=True)
df = adjust_for_inflation(df, "Income", "Year", "Income_inf", drop_original=True)

In [None]:
#Check result
df.head(2)

#### Missing Values

In [None]:
# adjust_for_inflation stores 0 (not NaN) for rows without a value, so at this
# point a 0 in Budget_inf / Income_inf is treated as a missing value.

# count the rows where Budget_inf is 0
count_col1 = (df['Budget_inf'] == 0).sum()

# count the rows where Income_inf is 0
count_col2 = (df['Income_inf'] == 0).sum()

# count the rows where both Budget_inf and Income_inf are 0
count_both = ((df['Budget_inf'] == 0) & (df['Income_inf'] == 0)).sum()

# print the results (col1 = Budget_inf, col2 = Income_inf)
print('Number of zeros in col1:', count_col1)
print('Number of zeros in col2:', count_col2)
print('Number of zeros in both col1 and col2:', count_both)

In [None]:
#Dropping rows with missing values in both columns
df = df[(df['Budget_inf'] != 0) | (df['Income_inf'] != 0)]

In [None]:
#Check result
count_both = ((df['Budget_inf'] == 0) & (df['Income_inf'] == 0)).sum()
print('Number of zeros in both col1 and col2:', count_both)

#### Create Profit column

In [None]:
df['Profit_inf'] = df['Income_inf'] - df['Budget_inf']
mask = (df['Income_inf'] == 0) | (df['Budget_inf'] == 0)
df.loc[mask, 'Profit_inf'] = 0

In [None]:
#Check result
df.head(2)

#### Create ROI column

In [None]:
# ROI is the profit relative to the budget: (income - budget) / budget.
# The later imputation Income = Budget * (1 + ROI) only holds for this
# definition, so the denominator has to be the budget, not the income.
df['ROI_inf'] = (df['Income_inf'] - df['Budget_inf']) / df['Budget_inf']
# Rows where either amount is missing (stored as 0) have no meaningful ROI;
# mark them with 0 so they can be imputed afterwards.
mask = (df['Income_inf'] == 0) | (df['Budget_inf'] == 0)
df.loc[mask, 'ROI_inf'] = 0

In [None]:
#calc mean & median roi of the rows w/ values in both
mask = df['ROI_inf'] != 0
df_filtered = df[mask]
mean_roi = df_filtered['ROI_inf'].mean()
median_roi = df_filtered['ROI_inf'].median()

In [None]:
print("Mean_roi: " + str(mean_roi))
print("Median_roi: " + str(median_roi))

#### Replace missing values with median ROI

In [None]:
df['ROI_inf'] = df['ROI_inf'].replace(0, median_roi)

In [None]:
#Check result
df.head(2)

#### Replace missing values in Budget and Income

In [None]:
df.loc[df['Income_inf'].isna() | (df['Income_inf'] == 0), 'Income_inf'] = df['Budget_inf'] * (1 + median_roi)
df.loc[df['Budget_inf'].isna() | (df['Budget_inf'] == 0), 'Budget_inf'] = df['Income_inf'] / (1 + median_roi)

In [None]:
#Check result
df.head()

In [None]:
#### Update Profit
df['Profit_inf'] = df['Income_inf'] - df['Budget_inf']

In [None]:
#Check result
df.head()

#### Check df after dataprocessing

In [None]:
#Remove Title
df.drop('Title', axis=1, inplace=True)

In [None]:
df.info()

In [None]:
#df.to_csv('01_dataprocessing_noTitle.csv', index=False)