### Importing Libraries

In [None]:
import pandas as pd
#import cpi #library for inflation-data
df = pd.read_csv('C:/Users/admin1/Documents/GitHub/ds22_project/data/movies.csv')

### Functions

#### 1.1 --- check_non_numeric_values

In [None]:
def check_non_numeric_values(df, column):
    """Print the values in ``df[column]`` that cannot be parsed as numbers.

    Every entry is passed through ``pd.to_numeric(..., errors='coerce')``;
    entries that come back as NaN are tallied with ``value_counts()`` and
    printed as a two-column table (value, count). Entries that were already
    missing are not listed, because ``value_counts()`` drops NaN by default.

    df: the DataFrame to inspect.
    column: name of the column to check.
    returns: None. The result is only printed.
    """
    series = df[column]

    # True wherever the entry could not be converted to a number
    unparseable = pd.to_numeric(series, errors='coerce').isna()

    # Distinct offending values with the number of times each one occurs
    tally = series[unparseable].value_counts()

    if not tally.empty:
        report = pd.DataFrame({'Non-Numeric Value': tally.index,
                               'Count': tally.values})
        print(report)
        return

    print("No non numeric values in that column.")

#### 1.2 --- one_hot_encoding_column

In [None]:
#function
def one_hot_encoding_column(dataset, column, separator=", ", prefix=""):
    """
    Performs one-hot encoding on a column whose cells hold separator-joined categories.

    dataset: The DataFrame to be processed. It is not modified.
    column: The name of the column to be one-hot encoded.
    separator: The separator used between categories inside a cell. Defaults to ", ".
    prefix: Optional string to be added in front of each new column name. Defaults to "".
    returns: a new DataFrame with `column` removed and one indicator column per
        distinct category appended. A cell that lists the same category more
        than once yields that count rather than 1 (the indicators are summed
        per row). Rows whose cell is missing get 0 in every indicator column.
    """

    # 1. Split each cell into its categories and stack them into one long
    #    Series indexed by (original row label, position inside the cell).
    stacked = dataset[column].str.split(separator, expand=True).stack()

    # 2. One indicator column per category, then collapse back to one row per
    #    original row label by summing the per-position indicators.
    value_subtable = pd.get_dummies(stacked).reset_index(level=1, drop=True)
    value_subtable = value_subtable.groupby(value_subtable.index).sum()

    # 3. Rows with a missing cell may have contributed nothing above; give them
    #    explicit zeros so the merge below does not introduce NaN (which would
    #    also turn the indicator columns into floats).
    value_subtable = value_subtable.reindex(dataset.index.unique(), fill_value=0)

    # 4. Adding the prefix to the column names
    if prefix:
        value_subtable.columns = [prefix + str(col) for col in value_subtable.columns]

    # 5. Merging the subtable with the main dataset and dropping the encoded column
    dataset_processed = pd.merge(dataset, value_subtable, left_index=True, right_index=True, how='left')
    return dataset_processed.drop(columns=[column])

#### 1.3 --- convert_to_usd

In [None]:
#converting all currency to USD and removing all commas.

# Fixed, hard-coded conversion rates to USD, keyed by the currency marker that
# prefixes the amount string.
_USD_RATES = {
    '$': 1.0,      # US dollar
    '€': 1.06,     # euro
    '¥': 0.0075,   # yen
    '₹': 0.012,    # rupee
    'SEK': 0.094,  # Swedish krona
    'DKK': 0.14,   # Danish krone
    '£': 1.21,     # pound sterling
}


def convert_to_usd(amount):
    """Convert a currency-prefixed amount string to a float value in USD.

    Spaces, non-breaking spaces and thousands-separator commas are removed
    before parsing, so '$1,000', '$1 000' and 'SEK\xa01,000' are all accepted.

    amount: string such as '$1,000,000' or '€250,000'.
    returns: the amount in USD as a float, or None when `amount` is not a
        string (e.g. NaN) or does not start with a known currency marker.
    raises: ValueError when the text after the currency marker is not a number.
    """
    if not isinstance(amount, str):
        return None

    # str.replace returns a new string, so the result must be kept.
    cleaned = amount.replace(' ', '').replace('\xa0', '').replace(',', '')

    for marker, rate in _USD_RATES.items():
        if cleaned.startswith(marker):
            # Slice the marker off as a prefix; str.strip would treat a
            # multi-character marker like 'SEK' as a set of characters.
            return float(cleaned[len(marker):]) * rate

    return None

#### 1.4 --- one_hot_coding_binary

In [None]:
import pandas as pd

def one_hot_coding_binary(dataset, original_column, prefix, file_column, file_location, separator=", ", num_categories=1, drop_original=True):
    """Flag whether the first entries of a separator-joined column appear in a reference list.

    The cell text of `original_column` is split on `separator`. For each of the
    first `num_categories` positions a 0/1 column is added that is 1 when the
    entry at that position is found in column `file_column` of the CSV file at
    `file_location`, and 0 otherwise (including when the row has no entry at
    that position).

    dataset: DataFrame to process. It is modified in place and also returned.
    original_column: name of the column holding the separator-joined entries.
    prefix: name stem for the new columns. With num_categories == 1 the single
        new column is called `prefix`; otherwise they are `prefix_1` ... `prefix_N`.
    file_column: column of the reference CSV that holds the values to match.
    file_location: path of the reference CSV file.
    separator: separator between entries inside a cell. Defaults to ", ".
    num_categories: how many leading positions to encode (1 to 4). Defaults to 1.
    drop_original: when True, `original_column` is removed. Defaults to True.
    returns: the same `dataset` object with the new columns appended.
    raises: ValueError when num_categories is not between 1 and 4.
    """
    if num_categories not in range(1, 5):
        raise ValueError("num_categories must be between 1 and 4")

    # Split once, then make sure there is a column for every requested position
    # even when no row has that many entries (those positions become all-NaN
    # and therefore all 0 below).
    parts = dataset[original_column].str.split(separator, expand=True)
    parts = parts.reindex(columns=range(num_categories))

    reference_values = pd.read_csv(file_location)[file_column]

    # Drop before adding the new columns so a `prefix` equal to
    # `original_column` cannot remove the freshly created column.
    if drop_original:
        dataset.drop(columns=[original_column], inplace=True)

    for position in range(num_categories):
        new_name = prefix if num_categories == 1 else f"{prefix}_{position + 1}"
        dataset[new_name] = parts[position].isin(reference_values).astype(int)

    return dataset


# Dataprocessing

In [None]:
df.head(2)

In [None]:
df.info()

In [None]:
df.describe()

#### Dataprocessing Overview

create a table / column / object / comment

### 1.Title

#### 1.1 Unique/Duplicate values

In [None]:
num_unique_values = df['Title'].nunique()

In [None]:
print(num_unique_values)

My first instinct was to expect 2000 unique values for the movie titles.
So I thought I would just remove the repeated titles from the dataset, since I expected them to be duplicate data.
But.

In [None]:
duplicates = df[df.duplicated(['Title'], keep=False)].sort_values(by=['Title'])

duplicates

Looking at the duplicate movie titles, we can quickly see that they are totally different movies that only share the title and nothing else; therefore they are not duplicate data and we can keep them.
I do expect that we will drop this column since we can't make it numeric.
Well, you could keep it and count the length of the title, but I would say that shouldn't have an effect on the model and would just be in the way.

### 2.Rating

Rating of the movie: users rate it 0-10 and this is the average of those votes, to one decimal. This is our y, the value that we want to predict.

#### 2.1 Missing data
According to the info function, there is one movie that doesn't have a value. We can't do more than just drop that one from the dataset.



In [None]:
df = df.dropna(subset=['Rating'])


#A quick check to see that the row was removed from the dataset
df.info()


Since this is what our model is going to predict, I don't want to do more here. The type is float64, which tells us that all values are numeric and can be decimal, and since IMDb ratings go from 0-10 with decimals this seems correct. One thing to consider would be to multiply all the ratings by 10 and convert the column to int, since that will be quicker to work with.
Actually, I want to quickly check that all the values are between 0 and 10.

#### 2.2 Check the data is within expected range

In [None]:
# Count how many values are between 0 and 10 (inclusive)
column_values = df['Rating']
count = column_values.between(0, 10).sum()

In [None]:
print(count)

Our original dataset consisted of 2000 rows and we dropped one with a missing value, so 1999 was what we expected and hoped for.


### 3.Year

#### 3.1 Check the data is within expected range

In [None]:
# Count how many values are between 2002 and 2023 (inclusive)
column_values = df['Year']
count = column_values.between(2002, 2023).sum()

In [None]:
print(count)

The type of the column is int, which makes sense as well.

### 4.Month

#### 4.1 Unique/Duplicate values



In [None]:
column_values = df['Month'].unique()
print(column_values)

I'm seeing two values that I didn't expect, 2014 and 2008. Start by checking the number of times they appear.

In [None]:
# Count the number of times a specific string value occurs in a column.
# A boolean-mask sum yields 0 when the value is absent, whereas indexing
# value_counts() with a missing label raises KeyError (e.g. when this cell is
# re-run after the rows have been dropped).
count = (df['Month'] == '2014').sum()
print(count)

In [None]:
# Count the number of times a specific string value occurs in a column.
# A boolean-mask sum yields 0 when the value is absent, whereas indexing
# value_counts() with a missing label raises KeyError (e.g. when this cell is
# re-run after the rows have been dropped).
count = (df['Month'] == '2008').sum()
print(count)

Since they each occur only once in the Month column, I suggest we drop them: 2 of 1999 rows won't impact the size of our dataset much, and I don't see it being worth the time to save them.

In [None]:
df = df.drop(index=df.loc[df['Month'] == '2014'].index)
df = df.drop(index=df.loc[df['Month'] == '2008'].index)

In [None]:
#checking the rows have been dropped
df.info()

In [None]:
column_values = df['Month'].unique()
print(column_values)

#### 4.2 Converting non-numeric to numeric values



In [None]:
# Define a dictionary to map months to integers
month_to_int = {'January': 1, 'February': 2, 'March': 3, 'April': 4, 'May': 5, 'June': 6,
                'July': 7, 'August': 8, 'September': 9, 'October': 10, 'November': 11, 'December': 12}

# Apply the map() method to convert the values
df['Month'] = df['Month'].map(month_to_int)

# Convert the type of the column to int
df['Month'] = df['Month'].astype(int)

In [None]:
#checking the month-values has been rplaced by 1-12 and the column converted to int
df.info()

In [None]:
column_values = df['Month'].unique()
column_values.sort()
print(column_values)

### 5.Certificate

In [None]:
df.info()

In [None]:
### 5.1 Unique values

In [None]:
# Get all unique values of a column
unique_values = df['Certificate'].unique()

In [None]:
print(unique_values)

In [None]:
# Get the count of each unique value in the column, including missing values
value_counts = df['Certificate'].value_counts(dropna=False)

# Print the value counts
print(value_counts)

# Calculate the sum of the counts and print the total
total = value_counts.sum()
print(f'Total: {total}')

A couple of things. Three values stand out.
Not Rated     61   --- it probably takes a while from when a movie is released until it gets rated. It makes sense that this mostly consists of movies from the past year
NaN           32   --- not sure how to replace
Unrated        6   --- not sure how to replace

At this point I'm not sure how to replace the missing values, and with about 100 rows with different kinds of missing values it's a bit much to drop them.
My conclusion is to drop the whole column.


### 6.Runtime

#### 6.1 Check non numeric

In [None]:
#Checking non numeric values
check_non_numeric_values(df, "Runtime")

In [None]:
df = df.drop(index=df.loc[df['Runtime'] == 'Unknown'].index)

In [None]:
#check that the row has been dropped
df.info()

In [None]:
#converting the type of the column to int
df['Runtime'] = df['Runtime'].astype(int)

In [None]:
#checking the type has been changed
df.info()

Check for outliers, since movies should have a runtime of roughly 30-250 min.
So if I find values under 30 or over 300 min I can assume that something is wrong and drop them.

In [None]:
# Count how many values are between 30 and 300 minutes (inclusive)
column_values = df['Runtime']
count = column_values.between(30, 300).sum()

In [None]:
print(count)

It seems all remaining rows have a value between 30-300 min, which is good news.

### 7.Directors

In [None]:
df.info()

In [None]:
#calling the function
df = one_hot_coding_binary(df, "Directors", "top_50_director", "Name", "C:/Users/admin1/Documents/GitHub/ds22_project/data/top_50_directors.csv", num_categories=4, drop_original=True)


In [None]:
df.head()

In [None]:
df.info()

In [None]:
# Total of each new director flag column, i.e. how many movies have a listed
# top-50 director in position 1, 2, 3 and 4 respectively.
director_one = df['top_50_director_1'].sum()
director_two = df['top_50_director_2'].sum()
director_three = df['top_50_director_3'].sum()
director_four = df['top_50_director_4'].sum()


# Collect the four totals in a single-row table for display
df_directors = pd.DataFrame({'director_one': [director_one], 'director_two': [director_two], 'director_three': [director_three], 'director_four': [director_four]})

df_directors.head()


### 8.Stars

In [None]:
df.head(2)

In [None]:
df.info()

In [None]:
#calling the function
df = one_hot_coding_binary(df, "Stars", "top_1000_Stars", "Name", "C:/Users/admin1/Documents/GitHub/ds22_project/data/top_1000_actors.csv", num_categories=4, drop_original=True)


In [None]:
df.head(2)

In [None]:
df.info()

### 9.Genre

In [None]:
df.info()

In [None]:
#one hot encoding the column Genre
df = one_hot_encoding_column(df, "Genre", separator=", ", prefix = "")

In [None]:
#checking new dataset
df.info()
df.head(2)

### 10.Filming_location

In [None]:
df.info()

In [None]:
df.head(2)

In [None]:
# assume that `df` is your pandas DataFrame object
column_values = df['Filming_location'].value_counts().sort_values(ascending=False)
print(column_values)

In [None]:
#Seeing 75 movies with Unknown filming_location. How can we replace them? Also seeing 97 unique filming locations.
#We were discussing whether movies are mostly being made with green screen.
#Maybe remove the whole column?

#one hot encoding the column Filming_location
#df = one_hot_encoding_column(df, "Filming_location", separator=", ", prefix = "")

### 11.Budget

### 12.Income

### 13.Country_of_origin

In [None]:
df.info()

In [None]:
df.head(2)

In [None]:
#one hot encoding the column Country_of_origin
df = one_hot_encoding_column(df, "Country_of_origin", separator=", ", prefix = "")

In [None]:
df.info()

In [None]:
df.head(2)