In [None]:
import json
import pickle as pkl
import pandas as pd
import uuid
import random

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so that the
# dataset directory used by the following cells is reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
cd drive/MyDrive/AVE/BERT_multimodal_transformer/custom_dataset

/content/drive/MyDrive/AVE/BERT_multimodal_transformer/custom_dataset


In [None]:
# Read the three tab-separated splits from the current working directory.
# Note that the validation file is named 'val.tsv' but is bound to `dev`.
train, dev, test = (
    pd.read_csv(tsv_name, sep='\t')
    for tsv_name in ('train.tsv', 'val.tsv', 'test.tsv')
)

In [None]:
def attr_spec_sample(df, specified_attr, sample_num=None, sample_ratio=None, random_state=None):
    """Restrict a split to the given attributes and optionally sample per attribute value.

    Args:
        df: DataFrame with at least the columns 'attribute_names' and
            'attribute_values'.
        specified_attr: Iterable of attribute names to keep (matched against
            'attribute_names').
        sample_num: If truthy, keep at most this many rows per distinct
            'attribute_values' entry. Takes precedence over `sample_ratio`.
        sample_ratio: If truthy (and `sample_num` is not), keep
            round(len(group) * sample_ratio) rows per distinct value, capped at
            the group size.
        random_state: Forwarded to `DataFrame.sample`; pass an int to make the
            sampling reproducible. The default (None) keeps it non-deterministic.

    Returns:
        A new DataFrame that is never a view/slice of `df`. When sampling is
        requested the index is reset; otherwise the original index labels are kept.

    Side effects:
        Prints a short summary of the selection to stdout.
    """
    # Keep only the rows whose attribute name was requested.
    df_selected = df[df['attribute_names'].isin(specified_attr)]

    # Distinct attribute values (column 'attribute_values') among the selected rows.
    sampled_attribute_value = df_selected['attribute_values'].unique()

    if sample_num or sample_ratio:
        sampled_groups = []
        for _, group in df_selected.groupby('attribute_values'):
            if sample_num:
                n_samples = min(len(group), sample_num)
            else:
                # Cap at the group size so a ratio above 1 cannot make sample() raise.
                n_samples = min(len(group), round(len(group) * sample_ratio))
            sampled_groups.append(group.sample(n=n_samples, random_state=random_state))

        if sampled_groups:
            df_samples = pd.concat(sampled_groups, ignore_index=True)
        else:
            # pd.concat raises on an empty sequence; return an empty frame with the same columns.
            df_samples = df_selected.iloc[0:0].copy()
    else:
        # Return an independent copy so callers can add columns without
        # triggering SettingWithCopyWarning.
        df_samples = df_selected.copy()

    print('Specified Attribute list: ', specified_attr)
    if sample_num:
        print('Specified Sample Num Per Attribute Value: ', sample_num)
    elif sample_ratio:
        print('Specified Sample Ratio Per Attribute Value: ', sample_ratio)
    print('Split size: ', len(df))
    print('Split size after specifying attributes: ', len(df_selected))
    print('Split size after sampling: ', len(df_samples))
    print('Numumber of Unique Attribute Value: ', len(sampled_attribute_value))
    print('Attribute Value List: ', sampled_attribute_value)

    return df_samples

In [None]:
def bert_preprocess(df, id_to_visual_embeddings, attribute_value_to_label=None):
    """Turn attribute rows into ((text, visual), label, segment) training tuples.

    Args:
        df: DataFrame with the columns 'id', 'attribute_names',
            'attribute_values', 'category' and 'value_absent_texts'.
            The frame is not modified.
        id_to_visual_embeddings: Mapping from a row's 'id' to its visual
            embedding. Rows whose id is missing from it are skipped.
        attribute_value_to_label: Optional mapping from attribute value to
            integer label. Pass the same mapping for every split to guarantee
            identical label ids across splits. When omitted, labels are
            assigned over the sorted distinct values of `df`, so splits that
            contain the same set of values get the same ids regardless of row
            order.

    Returns:
        A list of ((prompt_text, visual_embedding), label, segment) tuples,
        ordered by id and then by row order within each id. `segment` is a
        freshly generated random UUID4 string per example.

    Raises:
        ValueError: If an explicit `attribute_value_to_label` does not cover
            every attribute value present in `df`.
    """
    if df.empty:
        return []

    # Work on a private copy: callers often pass a filtered slice, and adding
    # columns to it would raise SettingWithCopyWarning and leak helper columns.
    df = df.copy()

    df['prompt'] = df.apply(lambda row: f"Question: What is {row['attribute_names']} of this product?\nContext: [Category] {row['category']} {row['value_absent_texts']}.", axis=1)

    if attribute_value_to_label is None:
        # Sort the values so the label ids do not depend on which value happens
        # to appear first in this particular split (key=str tolerates mixed types).
        unique_attribute_values = sorted(df['attribute_values'].unique(), key=str)
        attribute_value_to_label = {value: label for label, value in enumerate(unique_attribute_values)}
    else:
        known = df['attribute_values'].isin(list(attribute_value_to_label))
        if not known.all():
            missing = df.loc[~known, 'attribute_values'].unique().tolist()
            raise ValueError(f"attribute_value_to_label has no entry for: {missing}")

    df['label'] = df['attribute_values'].map(attribute_value_to_label)

    data = []
    # groupby iterates ids in sorted order and keeps row order within each id.
    for item_id, group in df.groupby('id'):
        if item_id not in id_to_visual_embeddings:
            continue

        visual = id_to_visual_embeddings[item_id]
        for text, label in zip(group['prompt'], group['label']):
            segment = str(uuid.uuid4())
            data.append(((text, visual), label, segment))

    return data


In [None]:
# Load the pickled object stored in 'ave_image_embeddings.pkl'. Later cells use it
# as a mapping keyed by row id (`.keys()` and `in` lookups); presumably the values
# are per-product image embeddings — TODO confirm against the file that produced it.
# NOTE(review): pickle.load can execute arbitrary code while deserializing, so
# only load this file from a trusted source.
with open('ave_image_embeddings.pkl', 'rb') as handle:
    id_to_visual_embeddings = pkl.load(handle)

In [None]:
# Drop every row whose id has no entry in the visual-embedding mapping, for all three splits.
ids_to_keep = set(id_to_visual_embeddings)
train, dev, test = (
    split[split['id'].isin(ids_to_keep)]
    for split in (train, dev, test)
)

In [None]:
# Attribute names grouped into named subsets; each key selects one group of
# 'attribute_names' values and is also used in the output file name.
subset_mapping = dict(
    Clothing=["Neckline", "Length", "Sleeve Style", "Shoulder Style"],
    Footwear=["Athletic Shoe Style", "Boot Style", "Shaft Height", "Heel"],
    General=["Pattern", "Material", "Shape"],
)

In [None]:
# For every attribute subset, write one pickle per shot count containing the
# sampled train split together with the full dev and test splits.
for key in subset_mapping:
  # dev/test do not depend on the shot count, so they are built once per subset.
  curr_dev = attr_spec_sample(dev, subset_mapping[key])
  curr_test = attr_spec_sample(test, subset_mapping[key])

  curr_dev = bert_preprocess(curr_dev,id_to_visual_embeddings)
  curr_test = bert_preprocess(curr_test,id_to_visual_embeddings)

  for i in [5,10,15,25,50,100,1000]:
    # Sample at most `i` training rows per attribute value. (A previous extra
    # call with sample_num=100 was removed: its result was overwritten right
    # away, so it only cost time and printed a misleading log block.)
    subset = attr_spec_sample(train, subset_mapping[key], sample_num=i)
    curr_train = bert_preprocess(subset,id_to_visual_embeddings)
    split_data = {"train":curr_train,
                  "dev": curr_dev,
                  "test": curr_test}

    save_name = f'bert_{key}_{i}_shot.pkl'
    with open(save_name, 'wb') as file:
      pkl.dump(split_data, file)

    print(f'Done with {save_name}')

Specified Attribute list:  ['Neckline', 'Length', 'Sleeve Style', 'Shoulder Style']
Specified Sample Ratio Per Attribute Value:  None
Split size:  8438
Split size after specifying attributes:  2957
Split size after sampling:  2957
Numumber of Unique Attribute Value:  30
Attribute Value List:  ['Halter' 'Gown' '3/4 Sleeve' 'Mini' 'Round Neck' 'Polo' 'Button Down'
 'Pencil' 'Midi' 'Cold Shoulder' 'One Shoulder' 'Crew Neck' 'Strapless'
 'Skater Skirt' 'Short Dress' 'Henley' 'Strappy' 'Scoop Neck'
 'Long Sleeve' 'Long Dress' 'High Neck' 'Capri' 'V-Neck' 'Cowl Neck'
 'Off Shoulder' 'Short Sleeve' 'Square Neck' 'Turtleneck' 'Sleeveless'
 'Cap Sleeve']
Specified Attribute list:  ['Neckline', 'Length', 'Sleeve Style', 'Shoulder Style']
Specified Sample Ratio Per Attribute Value:  None
Split size:  8349
Split size after specifying attributes:  3075
Split size after sampling:  3075
Numumber of Unique Attribute Value:  30
Attribute Value List:  ['Pencil' 'Long Dress' 'Round Neck' 'Turtleneck' 'Sh

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['prompt'] = df.apply(lambda row: f"Question: What is {row['attribute_names']} of this product?\nContext: [Category] {row['category']} {row['value_absent_texts']}.", axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['label'] = df['attribute_values'].map(attribute_value_to_label)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.

Specified Attribute list:  ['Neckline', 'Length', 'Sleeve Style', 'Shoulder Style']
Specified Sample Num Per Attribute Value:  100
Split size:  38903
Split size after specifying attributes:  13931
Split size after sampling:  3000
Numumber of Unique Attribute Value:  30
Attribute Value List:  ['Long Sleeve' 'Crew Neck' 'Off Shoulder' '3/4 Sleeve' 'Long Dress'
 'Pencil' 'Cold Shoulder' 'Midi' 'Cowl Neck' 'Square Neck' 'Skater Skirt'
 'Polo' 'Halter' 'Henley' 'Cap Sleeve' 'Short Sleeve' 'Strapless' 'Capri'
 'Mini' 'Sleeveless' 'Short Dress' 'Round Neck' 'One Shoulder' 'V-Neck'
 'Button Down' 'Gown' 'Scoop Neck' 'Turtleneck' 'Strappy' 'High Neck']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['label'] = df['attribute_values'].map(attribute_value_to_label)
