In [1]:
##importing libraries
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
##loading the articles dataset
articles=pd.read_csv('articles.csv',
                    usecols=['prod_name','product_type_name','product_group_name','graphical_appearance_name',
                              'colour_group_name','perceived_colour_value_name'])
articles.head()

Unnamed: 0,prod_name,product_type_name,product_group_name,graphical_appearance_name,colour_group_name,perceived_colour_value_name
0,Strap top,Vest top,Garment Upper body,Solid,Black,Dark
1,Strap top,Vest top,Garment Upper body,Solid,White,Light
2,Strap top (1),Vest top,Garment Upper body,Stripe,Off White,Dusty Light
3,OP T-shirt (Idro),Bra,Underwear,Solid,Black,Dark
4,OP T-shirt (Idro),Bra,Underwear,Solid,White,Light


In [3]:
##creating dictionaries for categories in 'product_group_name' and 'product_type_name'
type_dict={v:k for k,v in dict(enumerate(articles['product_type_name'].unique())).items()}
group_dict={v:k for k,v in dict(enumerate(articles['product_group_name'].unique())).items()}

##creating the reverse of these dictionaries for final decoding
reverse_type_dict={v:k for k,v in type_dict.items()}
reverse_group_dict={v:k for k,v in group_dict.items()}

##replacing values in articles with placeholders
articles['product_type_name']=articles['product_type_name'].apply(lambda x:str(type_dict[x]))
articles['product_group_name']=articles['product_group_name'].apply(lambda x:str(group_dict[x]))

##checking the head of the dataset
articles.head()

Unnamed: 0,prod_name,product_type_name,product_group_name,graphical_appearance_name,colour_group_name,perceived_colour_value_name
0,Strap top,0,0,Solid,Black,Dark
1,Strap top,0,0,Solid,White,Light
2,Strap top (1),0,0,Stripe,Off White,Dusty Light
3,OP T-shirt (Idro),1,1,Solid,Black,Dark
4,OP T-shirt (Idro),1,1,Solid,White,Light


In [4]:
def make_df(ordered_cols):
    df=articles
    df['input_text']=df['prod_name']
    ##adding attributes to the product name to generate queries
    ##reversing ordered_cols
    ordered_cols.reverse()
    for col in ordered_cols:
        df['input_text']=df[col]+' '+df['input_text']
    ##combining product_group_name and product_type_name into a single output column
    df['target_text']=df['product_group_name']+' '+df['product_type_name']
    ##creating the prefix column
    df['prefix']='translate input_text to target_text'
    ##reduce to the input and output columns
    df=df[['prefix','input_text','target_text']]
    ##reducing all text to lowercase
    for col in df.columns:
        df[col]=df[col].apply(lambda x:x.lower())
    ##return the created dataframe
    return df

In [5]:
##creating combinations
set_1=make_df(['graphical_appearance_name'])
set_2=make_df(['colour_group_name'])
set_3=make_df(['perceived_colour_value_name'])
set_4=make_df(['graphical_appearance_name','colour_group_name'])
set_5=make_df(['perceived_colour_value_name','colour_group_name'])
set_6=make_df(['graphical_appearance_name','perceived_colour_value_name','colour_group_name'])

##deleting the articles dataset
##del(articles)

##concatenating all dataframes into a single one
data=pd.concat([set_1,set_2,set_3,set_4,set_5,set_6])

#resetting index
data.reset_index(inplace=True)
data.drop('index',axis=1,inplace=True)

##deleting all the sets
del(set_1,set_2,set_3,set_4,set_5,set_6)

data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col]=df[col].apply(lambda x:x.lower())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col]=df[col].apply(lambda x:x.lower())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col]=df[col].apply(lambda x:x.lower())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .l

Unnamed: 0,prefix,input_text,target_text
0,translate input_text to target_text,solid strap top,0 0
1,translate input_text to target_text,solid strap top,0 0
2,translate input_text to target_text,stripe strap top (1),0 0
3,translate input_text to target_text,solid op t-shirt (idro),1 1
4,translate input_text to target_text,solid op t-shirt (idro),1 1


In [6]:
def jumble_words(sent):
    sent_list=sent.split(' ')
    np.random.shuffle(sent_list)
    return ' '.join(sent_list)

data['input_text']=data['input_text'].apply(lambda x:jumble_words(x))

In [7]:
##transformer model imports
import logging
from simpletransformers.t5 import T5Model,T5Args

In [8]:
##<need to understand this part>
logging.basicConfig(level=logging.INFO)
transformers_logger=logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

In [9]:
##maximum sentence length
max_seq_len=max(data['input_text'].apply(lambda x:len(x.split())))

In [10]:
##creating a data subset for reducing training time
from sklearn.model_selection import ShuffleSplit
ss=ShuffleSplit(train_size=0.1,random_state=42)
subset=data.iloc[next(ss.split(data))[0]]

In [11]:
subset.head()

Unnamed: 0,prefix,input_text,target_text
389217,translate input_text to target_text,tank+ loose black conscious audrey solid,0 0
410171,translate input_text to target_text,light dress beige janeiro solid pow,11 25
462739,translate input_text to target_text,bandana 2 red bree bright cool,4 47
40556,translate input_text to target_text,floyd print funnelneck front,0 5
559780,translate input_text to target_text,tvp trouser dark melange willow green dark,3 7


In [12]:
##specifying model arguments
model_args = T5Args()
model_args.max_seq_length=max_seq_len
model_args.train_batch_size=128
model_args.eval_batch_size=128
model_args.num_train_epochs=1
##model_args.evaluate_during_training = True
##model_args.evaluate_during_training_steps = 30000
model_args.use_multiprocessing=False
model_args.fp16=False
model_args.save_steps=-1
model_args.save_eval_checkpoints=False
model_args.no_cache=True
model_args.reprocess_input_data=True
model_args.overwrite_output_dir=True
model_args.preprocess_inputs=False
model_args.num_return_sequences=1

##instantiating the model
model = T5Model("t5","t5-base",args=model_args,use_cuda=False)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


##creating make-do train and eval sets
train_set=data.iloc[:80]
test_set=data.iloc[80:100]

In [13]:
model.train_model(subset)

INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


  0%|          | 0/63325 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.

INFO:simpletransformers.t5.t5_model: Training started


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/495 [00:00<?, ?it/s]

INFO:simpletransformers.t5.t5_model: Training of t5-base model complete. Saved to outputs/.


(495, 0.8679719136218832)

In [24]:
def get_hier():
    query=input()
    cat,subcat=[int(i) for i in model.predict(query)[0].split()]
    return {'product_group_name':reverse_group_dict[int(cat)],
           'product_type_name':reverse_type_dict[int(subcat)]}

In [25]:
get_hier()

black shirt


Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

Decoding outputs:   0%|          | 0/1 [00:00<?, ?it/s]

{'product_group_name': 'Garment Upper body', 'product_type_name': 'Shirt'}

In [26]:
get_hier()

skirt


Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

Decoding outputs:   0%|          | 0/1 [00:00<?, ?it/s]

{'product_group_name': 'Garment Lower body', 'product_type_name': 'Skirt'}

In [27]:
get_hier()

kurta


Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

Decoding outputs:   0%|          | 0/1 [00:00<?, ?it/s]

{'product_group_name': 'Garment Upper body', 'product_type_name': 'T-shirt'}

In [28]:
get_hier()

kurti


Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

Decoding outputs:   0%|          | 0/1 [00:00<?, ?it/s]

{'product_group_name': 'Garment Upper body', 'product_type_name': 'Jacket'}

In [29]:
get_hier()

sport shoes


Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

Decoding outputs:   0%|          | 0/1 [00:00<?, ?it/s]

{'product_group_name': 'Shoes', 'product_type_name': 'Sneakers'}

In [30]:
get_hier()

black jacket for men


Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



Decoding outputs:   0%|          | 0/1 [00:00<?, ?it/s]

{'product_group_name': 'Garment Upper body', 'product_type_name': 'Vest top'}

In [31]:
get_hier()

denims


Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

Decoding outputs:   0%|          | 0/1 [00:00<?, ?it/s]

{'product_group_name': 'Garment Lower body', 'product_type_name': 'Trousers'}

In [32]:
get_hier()

red dress


Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

Decoding outputs:   0%|          | 0/1 [00:00<?, ?it/s]

{'product_group_name': 'Garment Full body', 'product_type_name': 'Dress'}

In [33]:
get_hier()

saree for women


Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

Decoding outputs:   0%|          | 0/1 [00:00<?, ?it/s]

{'product_group_name': 'Garment Upper body', 'product_type_name': 'T-shirt'}

In [34]:
get_hier()

party wear men


Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

Decoding outputs:   0%|          | 0/1 [00:00<?, ?it/s]

{'product_group_name': 'Garment Lower body', 'product_type_name': 'Trousers'}

In [35]:
get_hier()

kurtas


Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

Decoding outputs:   0%|          | 0/1 [00:00<?, ?it/s]

{'product_group_name': 'Garment Upper body', 'product_type_name': 'T-shirt'}

In [36]:
get_hier()

formal shoes for men


Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

Decoding outputs:   0%|          | 0/1 [00:00<?, ?it/s]

{'product_group_name': 'Shoes', 'product_type_name': 'Other shoe'}