# Library requirements

In [None]:
!pip install -q transformers datasets

In [None]:
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import random
import numpy as np
from tqdm import tqdm

# Text data preprocessing

In [None]:
# Load the "ashraq/fashion-product-images-small" dataset through `datasets.load_dataset`.
dataset = load_dataset("ashraq/fashion-product-images-small")
# Bare expression so the notebook shows the dataset summary.
dataset

In [None]:
# Build a dataframe holding only the columns needed downstream (train split).
train_split = dataset['train']

# dataframe column name -> source column in the dataset
# ('addition1' / subCategory is currently disabled)
column_sources = {
    'id': 'id',
    'gender': 'gender',
    'category': 'masterCategory',
    'text': 'productDisplayName',
    'addition2': 'season',
    'addition3': 'usage',
}
data_for_df = {
    target_name: train_split[source_name]
    for target_name, source_name in column_sources.items()
}

df = pd.DataFrame(data_for_df)
df

In [None]:
# Show how often each value occurs in the 'gender' and 'category' columns.
for column_name in ('gender', 'category'):
    display(df[column_name].value_counts())

In [None]:
# Keep only rows whose gender is 'Men' or 'Women' and whose category is 'Apparel'.
# The explicit .copy() makes the result an independent dataframe, so later
# column assignments on `df` do not trigger pandas' SettingWithCopyWarning.
is_men_or_women = df['gender'].isin(['Men', 'Women'])
is_apparel = df['category'] == 'Apparel'
df = df.loc[is_men_or_women & is_apparel].copy()

In [None]:
# Re-check the value distribution of both columns after filtering.
for column_name in ('gender', 'category'):
    display(df[column_name].value_counts())

In [None]:
# Encode the 'gender' class labels as integers: 'Men' -> 0, 'Women' -> 1.
map_to_num = {'Men': 0, 'Women': 1}
df['gender'] = df['gender'].map(map_to_num)
# Bare expression so the notebook shows the updated dataframe.
df

In [None]:
# Stratification target: the numeric gender label of every row.
y = df['gender'].tolist()

# Shrink the dataframe to a stratified 10% sample (the 90% "test" part is
# discarded): decision trees can work with less data, and a smaller frame
# makes the analysis easier. The fixed random_state keeps the sample reproducible.
df_reduced, _ = train_test_split(
    df,
    test_size=0.9,
    random_state=42,
    stratify=y,
)
df_reduced

In [None]:
# Continue with the reduced sample. Copy it so that later column assignments
# on `df` neither raise SettingWithCopyWarning nor silently modify `df_reduced`.
df = df_reduced.copy()
display(df['gender'].value_counts())

In [None]:
# Combine the product name with its season ('addition2') and usage ('addition3')
# into a single description per product.
#
# A plain `a + " " + b` turns the whole description into NaN as soon as one
# part is missing, which later breaks the string handling of 'text'. Missing
# parts are therefore skipped; rows with all parts present are unchanged.
description = df['text'].fillna('')
for extra_column in ('addition2', 'addition3'):
    # (' ' + NaN) is NaN, so fillna('') appends nothing for a missing value
    description = description + (' ' + df[extra_column]).fillna('')

df['text'] = description
df

In [None]:
# The helper columns have been merged into 'text' (and 'category' has a single
# value after filtering), so remove them from the dataframe in place.
redundant_columns = ['category', 'addition2', 'addition3']
df.drop(columns=redundant_columns, inplace=True)
df

In [None]:
text_list = df['text'].tolist()
id_list = df['id'].tolist()

# Tokens that directly reveal the target label and must not reach the model.
words = ['men', 'women', 'mens', 'womens', 'man', 'woman', 'Men', 'Women', 'Mens', 'Womens', 'Man', 'Woman', 'Men\'s', 'Women\'s', 'men\'s', 'women\'s']
# Lower-cased set: O(1) membership test and matching regardless of capitalisation.
gender_words = {word.lower() for word in words}

# Map product id -> description with every gender token removed.
#
# Tokens are filtered as whole words. The previous str.replace approach
#   * restarted from the original text for every match, so only the last
#     matched word was actually removed, and
#   * replaced substrings, so removing 'men' also mangled 'Women' into 'Wo'.
# Side effect of split/join: runs of whitespace collapse to a single space.
dictionary = {}
for key, value in zip(id_list, text_list):
    # a missing description (NaN/None) is treated as empty text
    tokens = value.split() if isinstance(value, str) else []
    dictionary[key] = ' '.join(
        token for token in tokens if token.lower() not in gender_words
    )

In [None]:
# Labels in dataframe row order; `dictionary` was filled in that same order.
gender_list = df['gender'].tolist()

# Column-wise layout used to rebuild the dataframe: ids, cleaned text, labels.
temp_dictionary = dict(
    id=dictionary.keys(),
    text=dictionary.values(),
    gender=gender_list,
)

In [None]:
# Rebuild the dataframe from the cleaned id / text / gender columns.
df = pd.DataFrame(temp_dictionary)
df

In [None]:
# Order the rows by product id. The index is not reset here, so each row keeps
# the index label it had before sorting.
df = df.sort_values(by=["id"])
df

# Text data feature extraction

In [None]:
# Split the dataset into the feature frame (X) and the target labels (y).
feature_columns = ['id', 'text']
X = df[feature_columns]

# Target variable: the numeric gender label of each product.
y = df['gender'].tolist()

# Keep the preprocessed text under its own name and write it to disk,
# because it is needed later for the text explanation.
X_for_text_explanation = X
X_for_text_explanation.to_csv("X_for_text_explanation.csv")

display(X)
display(y)

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

In [None]:
# Load the pretrained clothing text classifier together with its tokenizer,
# then move the model onto the selected device.
text_checkpoint = "Showroom/clothing_subcategory_classifier"
tokenizer = AutoTokenizer.from_pretrained(text_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(text_checkpoint).to(device)

In [None]:
# Run a single description through the text model to collect the names of all
# classes (features) it predicts.
# .iloc[0] takes the first row by position; a label lookup with [0] would
# depend on which row happens to carry index label 0.
first_sent = X['text'].iloc[0]

inputs = tokenizer(first_sent, return_tensors="pt").to(device)
with torch.no_grad():  # inference only, no gradients needed
    outputs = model(**inputs)
logits = outputs.logits

# Request as many entries as the model has output classes instead of
# hard-coding the count. The names are ordered by the model's ranking
# for this sentence.
topk = torch.topk(logits, logits.shape[-1]).indices
feature_list = [model.config.id2label[each_value.item()] for each_value in topk[0]]

display(feature_list)

In [None]:
# Empty frame with an 'id' column followed by one column per model feature.
# Unpacking feature_list works for any number of features, not only eight.
new_df = pd.DataFrame(columns=['id', *feature_list])
new_df.head()

In [None]:
# Compute the class probabilities (the text features) for every product
# description, one row per product, in the order of X.
all_sents = X['text'].tolist()

# Rows are collected in a list and turned into a dataframe once at the end:
# DataFrame.append was removed in pandas 2.0, and appending row by row
# copies the whole frame on every iteration.
feature_rows = []

for each_sent in tqdm(all_sents):
    inputs = tokenizer(each_sent, return_tensors="pt").to(device)
    with torch.no_grad():  # inference only, no gradients needed
        outputs = model(**inputs)
    logits = outputs.logits
    probs = logits.softmax(dim=1)[0]  # label probabilities of this sentence

    # probability of every class the model outputs, keyed by its label name
    temp_dictionary = {
        model.config.id2label[class_index]: probs[class_index].item()
        for class_index in range(probs.shape[0])
    }
    feature_rows.append(temp_dictionary)

# Reuse the existing column layout of new_df; columns without a value in the
# rows ('id') stay empty (NaN).
new_df = pd.DataFrame(feature_rows, columns=new_df.columns)

In [None]:
# Attach the product ids to the feature rows. The ids are assigned from a plain
# list, i.e. by position, so new_df must have one row per row of X in the same order.
idl = X['id'].tolist()
new_df['id'] = idl
new_df

In [None]:
# Write the product ids to id.csv as a single-column dataframe.
id_df = pd.DataFrame({'id': idl})
id_df.to_csv('id.csv')

id_df

In [None]:
# X becomes the pure text-feature matrix (all columns of new_df except 'id').
# A non in-place drop returns a new frame: new_df keeps its 'id' column and the
# cell can be re-run without a KeyError for the already removed column.
X = new_df.drop(columns=['id'])
# saving only the text features X
X.to_csv('text_features.csv')
X

In [None]:
# Wrap the gender labels in a single-column dataframe named 'labels'.
y = pd.DataFrame(y, columns = ['labels'])

# Save the labels to labels.csv.
y.to_csv('labels.csv')
y

In [None]:
# Keep the text features and labels under dedicated names (original note:
# "for examples for PPT"). These are references to the same objects, not copies.
# X_text is combined with the image features further down.
X_text = X
y_text = y

# Image data feature extraction

In [None]:
!pip install opendatasets

In [None]:
import opendatasets as od
import pandas as pd

# Download the image dataset from Kaggle into the working directory.
# Use your own Kaggle API key for the step below
# (presumably opendatasets asks for the credentials when run; confirm).

od.download("https://www.kaggle.com/datasets/paramaggarwal/fashion-product-images-small")

In [None]:
from transformers import AutoImageProcessor, ResNetForImageClassification, AutoModelForImageClassification, AutoFeatureExtractor
import torch
from PIL import Image

# Product ids in the same order as the text features, so rows line up later.
temp_idl = idl

# One {label name: probability} dict per product image, in the order of temp_idl.
temp_list_of_dictionaries = []

# get the model for image feature extraction
extractor = AutoFeatureExtractor.from_pretrained("aalonso-developer/vit-base-clothing-leafs-example-full-simple_highres")
model = AutoModelForImageClassification.from_pretrained("aalonso-developer/vit-base-clothing-leafs-example-full-simple_highres")
model = model.to(device)

# calculate the features for all images
for product_id in tqdm(temp_idl):
    image_name = f"fashion-product-images-small/images/{product_id}.jpg"

    # The context manager closes the image file as soon as the pixels have
    # been copied out, instead of leaving one open handle per product.
    with Image.open(image_name) as image:
        im = np.asarray(image.convert('RGB'))

    inputs = extractor(im, return_tensors="pt").to(device)

    with torch.no_grad():  # inference only, no gradients needed
        logits = model(**inputs).logits

    probs = logits.softmax(dim=1)  # softmax turns the logits into label probabilities
    topk = torch.topk(logits, 2).indices  # only the two highest-scoring labels are kept

    temp_dictionary = {
        model.config.id2label[each_value.item()]: probs[0][each_value.item()].item()
        for each_value in topk[0]
    }
    temp_list_of_dictionaries.append(temp_dictionary)


In [None]:
# One row per image; labels that were not recorded for an image get the value 0.
X_image = pd.DataFrame(temp_list_of_dictionaries).fillna(0)
X_image

In [None]:
# Save the image features separately as image_features.csv.
X_image.to_csv('image_features.csv')

In [None]:
# Put the text features and the image features side by side (column-wise concat).
X_all_features = pd.concat([X_text, X_image], axis=1)
X_all_features

In [None]:
# Save the combined (text + image) features as all_features.csv.
X_all_features.to_csv('all_features.csv')

# Download all result files

In [None]:
from google.colab import files

# Hand every result file written by this notebook to the Colab download helper.
for result_file in (
    "text_features.csv",
    "image_features.csv",
    "all_features.csv",
    "labels.csv",
    "id.csv",
    "X_for_text_explanation.csv",
):
    files.download(result_file)