In [1]:
import pandas as pd

In [2]:
dataset_raw = pd.read_csv('movies.csv')
dataset_raw.head(2)

Unnamed: 0,Title,Rating,Year,Month,Certificate,Runtime,Directors,Stars,Genre,Filming_location,Budget,Income,Country_of_origin
0,Avatar: The Way of Water,7.8,2022,December,PG-13,192,James Cameron,"Sam Worthington, Zoe Saldana, Sigourney Weaver...","Action, Adventure, Fantasy",New Zealand,"$350,000,000","$2,267,946,983",United States
1,Guillermo del Toro's Pinocchio,7.6,2022,December,PG,117,"Guillermo del Toro, Mark Gustafson","Ewan McGregor, David Bradley, Gregory Mann, Bu...","Animation, Drama, Family",USA,"$35,000,000","$108,967","United States, Mexico, France"


In [4]:
def one_hot_encoding_dummy(dataset, column, separator=",", y_column= "Title"):
    """
    Performs one-hot encoding on the specified column of the given dataset.
    dataset: The dataset to be processed.
    column: The name of the column to be one-hot encoded.
    separator: The separator used in the values of the specified column. Defaults to ",".
    returns: the new dataset with the specified column one-hot encoded.
    """

    # 1. Creating a list with all the values mentioned in the dataset
    value_list = []
    for values in dataset[column]:
        value_list.append(values.split(separator))

    # 2. Creating a list with value categories
    unique_v= set()
    for values in value_list:
        for value in values:
            unique_v.add(value)

    unique_values = list(unique_v)
    amount = len(unique_values)

    # 3. Creating a subtable with y_column as the first column followed by one 
    # column per value
    value_subtable = pd.DataFrame(columns=[y_column] + unique_values)

    # 4. For each row, assign 1 to those values it has been assigned 
    for i, row in dataset.iterrows():
        new_row = {y_column: row[y_column]}
        for value in unique_values:
            new_row[value] = 0
        for value in row[column].split(separator):
            new_row[value] = 1
        value_subtable = value_subtable.append(new_row, ignore_index=True)

    # 5. Merging the subtable with the main dataset
    pivot_table = value_subtable.melt(id_vars=y_column, var_name=column)
    pivot_table = pivot_table[pivot_table['value'] == 1]
    pivot_table = pivot_table.drop(columns=['value'])
    pivot_table = pivot_table.pivot_table(index=y_column, columns=column, aggfunc='size', fill_value=0).reset_index()

    # 7. Merge the pivot table with the original DataFrame
    dataset_processed = pd.merge(dataset, pivot_table, on=y_column, how='left')
    dataset_processed.drop(columns=[column], inplace=True)

    # 8. Returning the new dataset
    return dataset_processed


dataset_processed = one_hot_encoding_dummy(dataset_raw, "Genre")


In [None]:
dataset_processed.head()

In [5]:
#dataset_processed.to_csv('C:/zlatte1/dataset_encoding.csv', index=False)

Unnamed: 0,Title,Rating,Year,Month,Certificate,Runtime,Directors,Stars,Filming_location,Budget,...,Comedy,Crime,Drama,Family,Fantasy,Horror,Mystery,Romance,Sci-Fi,Thriller
0,Avatar: The Way of Water,7.8,2022,December,PG-13,192,James Cameron,"Sam Worthington, Zoe Saldana, Sigourney Weaver...",New Zealand,"$350,000,000",...,0,0,0,0,0,0,0,0,0,0
1,Guillermo del Toro's Pinocchio,7.6,2022,December,PG,117,"Guillermo del Toro, Mark Gustafson","Ewan McGregor, David Bradley, Gregory Mann, Bu...",USA,"$35,000,000",...,0,0,0,0,0,0,0,0,0,0
2,Bullet Train,7.3,2022,August,R,127,David Leitch,"Brad Pitt, Joey King, Aaron Taylor Johnson, Br...",Japan,"$85,900,000",...,0,0,0,0,0,0,0,0,0,0
3,The Banshees of Inisherin,7.8,2022,November,R,114,Martin McDonagh,"Colin Farrell, Brendan Gleeson, Kerry Condon, ...",Ireland,Unknown,...,1,0,0,0,0,0,0,0,0,0
4,M3gan,6.4,2022,December,PG-13,102,Gerard Johnstone,"Jenna Davis, Amie Donald, Allison Williams, Vi...",New Zealand,"$12,000,000",...,0,0,0,0,0,1,0,0,0,0


In [None]:
#dataset_encoding.to_csv('C:/zlatte1/dataset_encoding.csv', index=False)

In [None]:
#dataset_encoding2 =  one_hot_encoding_dummy("Country_of_origin", ", ", dataset_encoding, "Title" )

In [None]:
#dataset_encoding2.head()

In [None]:
dataset_processed2 = one_hot_encoding_dummy(dataset_processed, "Country_of_origin")

In [None]:
dataset_processed2.head()