In [1]:
import argparse
import json

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

from fastchat.conversation import get_default_conv_template, compute_skip_echo_len
from fastchat.serve.inference import load_model


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Directory of the Vicuna-13B checkpoint; also handed to the conversation helpers in later cells.
model_path = "/data/jason.tangth/model/vicuna_data/vicuna-13b/"

In [3]:
# Load the model and tokenizer once for the whole notebook.
# The checkpoint location comes from `model_path` (defined in the previous cell)
# instead of a second copy of the literal, so the weights and the conversation
# helpers that also receive `model_path` can never point at different paths.
model, tokenizer = load_model(
    model_path=model_path,
    device='cuda',
    num_gpus='1',
    max_gpu_memory='28Gib',
    load_8bit=False,
    debug=False,
)

init_kwargs {'torch_dtype': torch.float16}


Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:41<00:00, 13.73s/it]


In [6]:
# One hand-written classification prompt, used to sanity-check the loaded model.
msg = '''
Choose a category for the product "dek lumpur dan tutup mesin beat karbu 2008 - 2012 terbaru!!" from below options:

Motorcycles - Motorcycle Accessories - Mud Flaps & Splash Guards
Motorcycles - Motorcycle Spare Parts - Body & Frame
Motorcycles - Motorcycle Spare Parts - Others
Motorcycles - Motorcycle Accessories - Others
Motorcycles - Motorcycles
Motorcycles - Others
Motorcycles - Motorcycle Accessories - Boxes & Cases
Motorcycles - Motorcycle Accessories - Carpets
Motorcycles - Motorcycle Spare Parts - Cables & Tubes
Automobiles - Automobile Exterior Accessories - Mud Flaps & Splash Guards
Motorcycles - Motorcycle Accessories - Covers
Motorcycles - Motorcycle Spare Parts - Tires & Accessories
Motorcycles - Motorcycle Spare Parts - Drivetrain, Transmission & Clutches - Chains & Gears
Motorcycles - Motorcycle Accessories - Stickers, Logos & Emblems
Motorcycles - Motorcycle Spare Parts - Batteries & Accessories
None of above

Choose a correct answer from above options in English. Do not provide answer not shown above.
'''

# Fill a copy of the default conversation template: the question under the
# first role, then an entry for the second role whose content is None.
conv = get_default_conv_template(model_path).copy()
first_role, second_role = conv.roles[0], conv.roles[1]
conv.append_message(first_role, msg)
conv.append_message(second_role, None)
prompt = conv.get_prompt()

# Tokenize the prompt as a batch of one and move the ids to the GPU.
inputs = tokenizer([prompt])
prompt_ids = torch.as_tensor(inputs.input_ids).cuda()

# Sampling is enabled, but at a very low temperature.
generation_options = dict(do_sample=True, temperature=0.05, max_new_tokens=512)
output_ids = model.generate(prompt_ids, **generation_options)

# Decode the only sequence in the batch, then drop its first `skip_echo_len`
# characters (presumably the echoed prompt -- confirm against fastchat).
decoded_batch = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
skip_echo_len = compute_skip_echo_len(model_path, conv, prompt)
outputs = decoded_batch[0][skip_echo_len:]

print(outputs)

Motorcycles - Motorcycle Accessories - Others


##### Appendix

In [None]:
# Shell out to curl: send one JSON chat request (model "vicuna-13b-v1.1", a single
# user message "Hello!") to the /v1/chat/completions endpoint on localhost:8945.
! curl http://localhost:8945/v1/chat/completions -H "Content-Type: application/json" -d '{"model": "vicuna-13b-v1.1","messages": [{"role": "user", "content": "Hello!"}]}'

In [None]:
# Test on monthly QC: read the evaluation sheet into `df`.
import pandas as pd
import pickle

monthly_qc_csv = './CSPU_Cluster_Monthly_wrong_category_12_sorted_gpt4_gpt35_gpt35top15_gpt35top30_gpt4top15_gpt35top1510in1.csv'
df = pd.read_csv(monthly_qc_csv)

In [None]:
# Load the pickled lookup tables and expose the three mappings used below.
# NOTE(review): pickle.load can execute code embedded in the file -- only point
# this at a data_dict.pkl you trust.
data_dict_path = '/data/jason.tangth/codebase/offline-and-online-data-filter-and-category-prediction-v1.9.7/assets/dictionary/data_dict.pkl'
with open(data_dict_path, 'rb') as f:
    data_dict = pickle.load(f)

leaf_to_L1_dict, ID_mapping_dict, name_mapping_dict = (
    data_dict[key]
    for key in ('leaf_to_L1_dict', 'ID_mapping_dict', 'name_mapping_dict')
)

In [None]:
## prompt 1
def get_vicuna_result(title, categories, model_path='/data/jason.tangth/model/vicuna_data/vicuna-13b/'):
    """Ask the loaded model to choose a category for one product title.

    Uses the notebook-level ``model`` and ``tokenizer``. The question is placed
    in a copy of the default conversation template, generated from with
    sampling at a very low temperature, and the decoded text is returned with
    its first ``skip_echo_len`` characters removed.

    Args:
        title: Product title inserted into the prompt.
        categories: Option list as one string, one option per line (the format
            produced by ``get_categories``).
        model_path: Forwarded to ``get_default_conv_template`` and
            ``compute_skip_echo_len``.

    Returns:
        The model's answer as a string with surrounding whitespace stripped.
    """
    # Build the prompt from column-0 pieces. An indented triple-quoted template
    # would carry the function's indentation into the prompt and indent only
    # the first option line, so the model would see a ragged list.
    option_block = categories.strip()
    msg = (
        '\n'
        'Choose a category for the product "{product_title}" from below options:\n'
        '\n'
        '{categories}\n'
        '\n'
        'Just output the option.\n'
    ).format(product_title=title, categories=option_block)

    # The question goes under the first role; the second role's entry is None.
    conv = get_default_conv_template(model_path).copy()
    conv.append_message(conv.roles[0], msg)
    conv.append_message(conv.roles[1], None)
    prompt = conv.get_prompt()

    # Batch of one prompt, moved to the GPU for generation.
    inputs = tokenizer([prompt])
    output_ids = model.generate(
        torch.as_tensor(inputs.input_ids).cuda(),
        do_sample=True,
        temperature=0.05,
        max_new_tokens=128,
    )
    decoded = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]

    # Drop the leading `skip_echo_len` characters (presumably the echoed
    # prompt -- confirm against fastchat), then trim stray whitespace so the
    # answer can be used directly as a lookup key downstream.
    skip_echo_len = compute_skip_echo_len(model_path, conv, prompt)
    return decoded[skip_echo_len:].strip()

def get_categories(candidates, id_to_name=None):
    """Render candidate ids as a newline-terminated option list for the prompt.

    Each candidate is looked up in ``id_to_name``, every ``'BE - '`` occurrence
    is removed from the resulting name, and a final ``'None of above'`` option
    is appended.

    Args:
        candidates: Iterable of ids to look up.
        id_to_name: Mapping from id to display name. Defaults to the
            notebook-level ``ID_mapping_dict``.

    Returns:
        One option per line, each line ending in ``'\\n'``; the last line is
        always ``'None of above'``.

    Raises:
        KeyError: If a candidate has no entry in the mapping. This replaces the
            opaque ``AttributeError`` that calling ``.replace`` on ``None``
            would otherwise produce.
    """
    if id_to_name is None:
        id_to_name = ID_mapping_dict

    option_lines = []
    for candidate in candidates:
        name = id_to_name.get(candidate)
        if name is None:
            raise KeyError(
                'candidate id {!r} not found in the ID mapping'.format(candidate)
            )
        option_lines.append(name.replace('BE - ', ''))
    option_lines.append('None of above')

    return ''.join(line + '\n' for line in option_lines)

import ast


def _parse_candidates(raw):
    """Parse a whitespace-separated bracketed list such as '[1 2 3]' into a Python list.

    Runs of whitespace (including line breaks and padding just inside the
    brackets) are collapsed before the separators are turned into commas, and
    the text is parsed with ``ast.literal_eval`` so nothing read from the CSV
    is ever executed.
    """
    collapsed = ' '.join(raw.split()).replace('[ ', '[').replace(' ]', ']')
    return ast.literal_eval(collapsed.replace(' ', ','))


# Query the model once per row and collect the raw answers in row order.
gpt_results = []
for _, row in df.iterrows():
    candidates = _parse_candidates(row['candidates'])
    categories = get_categories(candidates)
    title = row['title']
    gpt_results.append(get_vicuna_result(title, categories))

In [None]:
# Display the raw answers collected by the loop above, one entry per row of `df`.
gpt_results

In [None]:
# Attach the raw model answers to `df` as a new column (modifies `df` in place).
df['vicuna_top15_results'] = gpt_results

In [None]:
import numpy as np

# Every key of name_mapping_dict that contains 'BE - ', with that marker removed.
leaf_node = np.array([
    name.replace('BE - ', '')
    for name in name_mapping_dict
    if 'BE - ' in name
])

In [None]:
def _leaves_mentioned(answer):
    """Return the entries of `leaf_node` that occur as substrings of `answer`."""
    return leaf_node[[leaf in answer for leaf in leaf_node]]


# NOTE(review): this mask reads the `gpt_id` column as it exists at this point,
# i.e. before the later cell recomputes it from the vicuna answers -- confirm
# that this is the intended set of rows.
unresolved_rows = df['gpt_id'] == 999999
rest = df.loc[unresolved_rows, 'vicuna_top15_results'].apply(_leaves_mentioned)

In [None]:
def _first_match_or_sentinel(matches):
    """Return the first matched leaf name, or the string '999999' when there is none."""
    if len(matches) > 0:
        return matches[0]
    return '999999'


# Overwrite the answers of the rows with gpt_id == 999999, in row order, with
# their first substring match (or the sentinel string).
fallback_rows = df['gpt_id'] == 999999
df.loc[fallback_rows, 'vicuna_top15_results'] = rest.apply(_first_match_or_sentinel).tolist()

In [None]:
def _answer_to_id(answer):
    """Look up 'BE - ' + the first line of `answer` in name_mapping_dict (999999 if absent)."""
    first_line = answer.split('\n')[0]
    return name_mapping_dict.get('BE - ' + first_line, 999999)


# Replace the `gpt_id` column with ids derived from the vicuna answers.
df['gpt_id'] = df['vicuna_top15_results'].apply(_answer_to_id)

In [None]:
# Ground-truth sheet: keep only rows that have a gt_name, then cast gt_id to int.
df_local_pic = (
    pd.read_csv('CSPU_Cluster_Monthly_wrong_category_12_sorted_local_gt.csv')
    .loc[lambda frame: frame['gt_name'].notna()]
    .assign(gt_id=lambda frame: frame['gt_id'].astype(int))
)

In [None]:
gpt_id_col = 'gpt_id'

# Join the predictions onto the ground-truth rows by Itemid.
prediction_cols = ['Itemid', gpt_id_col, 'candidates', 'gpt35_top15_results']
m_df = df_local_pic.merge(df[prediction_cols], on='Itemid')

# Fraction of merged rows whose predicted id equals gt_id.
is_correct = m_df[gpt_id_col] == m_df['gt_id']
is_correct.mean()

In [None]:
# Write `df` (now including the vicuna columns) to a new CSV, without the index column.
vicuna_output_csv = './CSPU_Cluster_Monthly_wrong_category_12_sorted_gpt4_gpt35_gpt35top15_gpt35top30_gpt4top15_gpt35top1510in1_vicuna.csv'
df.to_csv(vicuna_output_csv, index=False)