### Data Preparation for Base Model

In [9]:
import pandas as pd
import ast
import numpy as np

In [None]:
# Read the training label table from its CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer="data/train_classification_labels.csv")

In [5]:
df.head()

Unnamed: 0,id,categories
0,3b6f01ae-5bde-434d-9b06-79b269421ed6,[1.0]
1,dce21f7c-20e5-482b-bd90-c038f8464c03,[1.0]
2,4a7f2199-772d-486d-b8e2-b651246316b5,[1.0]
3,3bddedf6-4ff8-4e81-876a-564d2b03b364,"[1.0, 9.0, 11.0, 88.0]"
4,3f735021-f5de-4168-b139-74bf2859d12a,"[1.0, 37.0, 51.0, 119.0]"


In [8]:
def parse_categories(cat_str):
    """Turn the text form of a label list into a list of ints.

    ``cat_str`` is evaluated with ``ast.literal_eval``, which accepts only
    Python literals, and every element of the result is passed through
    ``int`` (so ``"[1.0, 9.0]"`` becomes ``[1, 9]``).
    """
    return list(map(int, ast.literal_eval(cat_str)))

# Parse every raw `categories` string into a list of integer labels
df['category_list'] = df['categories'].map(parse_categories)

df.head()

Unnamed: 0,id,categories,category_list
0,3b6f01ae-5bde-434d-9b06-79b269421ed6,[1.0],[1]
1,dce21f7c-20e5-482b-bd90-c038f8464c03,[1.0],[1]
2,4a7f2199-772d-486d-b8e2-b651246316b5,[1.0],[1]
3,3bddedf6-4ff8-4e81-876a-564d2b03b364,"[1.0, 9.0, 11.0, 88.0]","[1, 9, 11, 88]"
4,3f735021-f5de-4168-b139-74bf2859d12a,"[1.0, 37.0, 51.0, 119.0]","[1, 37, 51, 119]"


In [10]:
# Total number of classes; this is the length of every multi-hot vector
# stored in the `multi_hot` column.
num_classes = 290

def multi_hot_vector(categories, num_classes):
    """Build a multi-hot float32 vector of length ``num_classes``.

    Each value in ``categories`` is treated as a 1-based class id (an
    assumption about the CSV labels), so id ``k`` sets position ``k - 1``
    to 1.0. Ids whose shifted position falls outside
    ``0 .. num_classes - 1`` are skipped without raising.
    """
    encoded = np.zeros(num_classes, dtype=np.float32)
    for label in categories:
        position = label - 1  # shift the 1-based id to a 0-based slot
        if not (0 <= position < num_classes):
            continue  # out-of-range ids are ignored
        encoded[position] = 1.0
    return encoded

# Encode each row's label list as a multi-hot vector in a new column
df['multi_hot'] = df['category_list'].apply(multi_hot_vector, args=(num_classes,))

df.head()

Unnamed: 0,id,categories,category_list,multi_hot
0,3b6f01ae-5bde-434d-9b06-79b269421ed6,[1.0],[1],"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,dce21f7c-20e5-482b-bd90-c038f8464c03,[1.0],[1],"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,4a7f2199-772d-486d-b8e2-b651246316b5,[1.0],[1],"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,3bddedf6-4ff8-4e81-876a-564d2b03b364,"[1.0, 9.0, 11.0, 88.0]","[1, 9, 11, 88]","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ..."
4,3f735021-f5de-4168-b139-74bf2859d12a,"[1.0, 37.0, 51.0, 119.0]","[1, 37, 51, 119]","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


#### Checking an Example

In [13]:
# Show the full record at position 3 (a row with several labels)
df.iloc[3]

id                            3bddedf6-4ff8-4e81-876a-564d2b03b364
categories                                  [1.0, 9.0, 11.0, 88.0]
category_list                                       [1, 9, 11, 88]
multi_hot        [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...
Name: 3, dtype: object

In [12]:
# Show the complete multi-hot vector for the row at position 3
df["multi_hot"].iloc[3]

array([1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

Looks good! This will be run in the image loading pipeline.