# Import Libraries

In [3]:
import pandas as pd
import openai
import os

openai.organization = os.environ.get('OPENAI_ORG')
openai.api_key = os.environ.get('OPENAI_API_KEY')

# Dataframe Preparation

In [30]:
# load full data
amazon = pd.read_csv("data/Reviews_withURL.csv")
scrap = pd.read_csv("results/combined/20230415_094354_scrap_results_combined.csv")

  scrap = pd.read_csv("results/combined/20230415_094354_scrap_results_combined.csv")


In [None]:
amazon['Review'] = amazon['Summary'] + ": " + amazon['Text']

In [None]:
amazon_full = pd.merge(amazon, scrap, how='left')  # join tables
df = amazon_full[['Review', 'ProductCategories_1']]  # select relevant columns
df = df[df['ProductCategories_1'].notna()]  # drop missing values
df = df.drop_duplicates().reset_index(drop=True)  # drop duplicates
df.columns = ['prompt', 'completion']
df.shape

(349621, 2)

In [None]:
# drop category with less than min_occurrence
min_occurrence = 10
df_count = df['completion'].value_counts()
subset_df = df[df['completion'].isin(df_count[df_count > min_occurrence].index)]
subset_df = subset_df.groupby('completion').head(1000)
subset_df.shape

(8824, 2)

In [None]:
subset_df.to_json("data/review_product_category.jsonl", orient="records", lines=True)

# Data Preparation Tool

Fine tuning reference: https://github.com/openai/openai-cookbook/blob/main/examples/Fine-tuned_classification.ipynb

In [None]:
!openai tools fine_tunes.prepare_data -f data/review_product_category.jsonl -q

Analyzing...

- Your file contains 8824 prompt-completion pairs
- Based on your data it seems like you're trying to fine-tune a model for classification
- For classification, we recommend you try one of the faster and cheaper models, such as `ada`
- For classification, you can estimate the expected model performance by keeping a held out dataset, which is not used for training
- Your data does not contain a common separator at the end of your prompts. Having a separator string appended to the end of the prompt makes it clearer to the fine-tuned model where the completion should begin. See https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset for more detail and examples. If you intend to do open-ended generation, then you should leave the prompts empty
- The completion should start with a whitespace character (` `). This tends to produce better results due to the tokenization we use. See https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset for

# Fine Tune

In [None]:
!openai api fine_tunes.create -t "data/review_product_category_prepared_train.jsonl" -v "data/review_product_category_prepared_valid.jsonl" --compute_classification_metrics --classification_n_classes 22 -m ada

Uploaded file from data/review_product_category_prepared_train.jsonl: file-xnwxKWa6iajNvMwxn0pTOsK9
Uploaded file from data/review_product_category_prepared_valid.jsonl: file-sG7fSIdUn12OMT6h8cUqiwQP
Created fine-tune: ft-a4QcUayEegXpPTLdXZnm7daO
Streaming events until fine-tuning is complete...



Upload progress:   0%|          | 0.00/4.34M [00:00<?, ?it/s]
Upload progress: 100%|██████████| 4.34M/4.34M [00:00<00:00, 4.31Git/s]

Upload progress:   0%|          | 0.00/565k [00:00<?, ?it/s]
Upload progress: 100%|██████████| 565k/565k [00:00<00:00, 276Mit/s]



(Ctrl-C will interrupt the stream, but not cancel the fine-tune)
[2023-04-15 10:04:10] Created fine-tune: ft-a4QcUayEegXpPTLdXZnm7daO
[2023-04-15 10:04:29] Fine-tune costs $1.55
[2023-04-15 10:04:30] Fine-tune enqueued. Queue number: 0



In [45]:
import requests

fine_tune_id = "ft-a4QcUayEegXpPTLdXZnm7daO"

headers = {
    'Authorization': 'Bearer ' + os.getenv('OPENAI_API_KEY', ''),
}

response = requests.get(f'https://api.openai.com/v1/fine-tunes/{fine_tune_id}/events', headers=headers)
response_df = pd.DataFrame(response.json()['data'])
response_df['created_at'] = pd.to_datetime(response_df['created_at'], unit='s')
response_df[['created_at', 'message']]

Unnamed: 0,created_at,message
0,2023-04-15 02:04:10,Created fine-tune: ft-a4QcUayEegXpPTLdXZnm7daO
1,2023-04-15 02:04:29,Fine-tune costs $1.55
2,2023-04-15 02:04:30,Fine-tune enqueued. Queue number: 0
3,2023-04-15 02:24:33,Fine-tune started
4,2023-04-15 02:31:04,Completed epoch 1/4
5,2023-04-15 02:44:35,Completed epoch 3/4
6,2023-04-15 02:52:12,Uploaded model: ada:ft-personal-2023-04-15-02-...
7,2023-04-15 02:52:14,Uploaded result file: file-FwR6RxCuXHI6BPlNDIs...
8,2023-04-15 02:52:14,Fine-tune succeeded


In [52]:
response_df.iloc[6]['message']

'Uploaded model: ada:ft-personal-2023-04-15-02-52-12'

# Model Performance

In [46]:
!openai api fine_tunes.results -i ft-a4QcUayEegXpPTLdXZnm7daO > data/result.csv

In [47]:
results = pd.read_csv('data/result.csv')
results[results['classification/accuracy'].notnull()].tail(1)

Unnamed: 0,step,elapsed_tokens,elapsed_examples,training_loss,training_sequence_accuracy,training_token_accuracy,validation_loss,validation_sequence_accuracy,validation_token_accuracy,classification/accuracy,classification/weighted_f1_score
3913,3914,9994576,31312,0.014639,0.75,0.826087,,,,0.81,0.79736


# Predict

Predict the product category based on product's review

In [223]:
scrap = pd.read_csv("results/combined/scrap_final_result.csv")

# filter url that the category needs to be predict
url_to_predict = scrap[scrap['ProductMajorCategory'].isna()]['ProductURL']

# consider review with dead urls
review_to_predict = amazon[amazon['ProductURL'].isin(url_to_predict)]

# sort review based on helpfulness percentage
review_to_predict['Helpfulness'] = review_to_predict['HelpfulnessNumerator'] / review_to_predict['HelpfulnessDenominator']
review_to_predict = review_to_predict.sort_values('Helpfulness', ascending=False)

# take the top three most helpful review to predict product category
review_to_predict = review_to_predict.groupby('ProductURL')['Text'].apply(lambda x: x.head(3).str.cat(sep='\n')).reset_index()
review_to_predict

  scrap = pd.read_csv("results/combined/scrap_final_result.csv")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  review_to_predict['Helpfulness'] = review_to_predict['HelpfulnessNumerator'] / review_to_predict['HelpfulnessDenominator']


Unnamed: 0,ProductURL,Text
0,https://www.amazon.com/dp/2841233731,This book is easy to read and the ingredients ...
1,https://www.amazon.com/dp/B0000CER0J,This is the best Earl Grey I've ever had--so i...
2,https://www.amazon.com/dp/B0000CERCD,This tea is very delicious and flavorful. The...
3,https://www.amazon.com/dp/B0000CFH1O,Wilton has amazing products. The color of thi...
4,https://www.amazon.com/dp/B0000CFNGU,"This frosting is great, especially if you want..."
...,...,...
4545,https://www.amazon.com/dp/B0099X2AE8,IT HAS THE BEST CRUNCH<br /><br />I LOVE IT IT...
4546,https://www.amazon.com/dp/B009AH958W,"If you like Indian spices and peanuts, you'll ..."
4547,https://www.amazon.com/dp/B009AVCXVY,The product description is very accurate. I r...
4548,https://www.amazon.com/dp/B009NTCOFI,This review is for the boneless ham. A little ...


In [224]:
# find out max words of product category
subset_df['completion'].apply(lambda x: len(x.split(' '))).max()

5

In [225]:
# sample review
sample_review = review_to_predict['Text'].iloc[333]
sample_review

'I love fun dip, it was one of my favorite candies as a kid. But this time, I chipped my tooth while eating one of the dip sticks. Just be sure you have licked it alot before you bite down on it. I still love fun dip, but my teeth arent what they used to be as a kid. Just be careful!'

In [226]:
ft_model = 'ada:ft-personal-2023-04-15-02-52-12'
res = openai.Completion.create(model=ft_model, prompt=sample_review + ' ->', max_tokens=5, temperature=0)
res['choices'][0]['text']

' Grocery & Gourmet'

In [227]:
res

<OpenAIObject text_completion id=cmpl-75YOepm0weZbODBDLLTSzymKV6Ao3 at 0x1d4f499dbd0> JSON: {
  "choices": [
    {
      "finish_reason": "length",
      "index": 0,
      "logprobs": null,
      "text": " Grocery & Gourmet"
    }
  ],
  "created": 1681558216,
  "id": "cmpl-75YOepm0weZbODBDLLTSzymKV6Ao3",
  "model": "ada:ft-personal-2023-04-15-02-52-12",
  "object": "text_completion",
  "usage": {
    "completion_tokens": 5,
    "prompt_tokens": 75,
    "total_tokens": 80
  }
}

Let's loop!

`tiktoken`: https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb

In [228]:
from tqdm import tqdm
import time
import tiktoken
encoding = tiktoken.encoding_for_model("text-embedding-ada-002")

In [231]:
sample_review = "tiktoken is great!"
sample_review_encode = encoding.encode(sample_review)
print(len(sample_review_encode))
print(encoding.decode(sample_review_encode[:3]))

6
tiktoken


In [232]:
ft_model = 'ada:ft-personal-2023-04-15-02-52-12'
list_to_iter = list(review_to_predict.itertuples())

predict_result = []
for row in tqdm(list_to_iter):
    # delay (limitation from openai)
    time.sleep(1)

    url = row.ProductURL
    review = row.Text

    # if the review too long, cut the embedding
    thresh = 1900
    review_encode = encoding.encode(review)
    if len(review_encode) >= thresh:
        review = encoding.decode(review_encode[:thresh])

    # predict
    res = openai.Completion.create(
        model=ft_model,
        prompt=review + ' ->',
        max_tokens=5,
        temperature=0
    )
    label = res['choices'][0]['text']

    row = {
        'ProductURL': url,
        'Text': review,
        'ProductCategories_1_raw_prediction': label
    }

    # save to temporary list, just in case
    predict_result.append(row)

    # immediately save to csv
    csv_file_path = "results/openai-predict/url_review_category.csv"
    result_df = pd.read_csv(csv_file_path)
    result_df = pd.concat([result_df, pd.DataFrame([row])]).reset_index(drop=True)
    result_df.to_csv(csv_file_path, index=False)

100%|██████████| 4550/4550 [2:22:23<00:00,  1.88s/it]  


Map the prediction result into proper (existing) product category

In [233]:
import pandas as pd
from thefuzz import process, fuzz

# read scraping result
scrap_result = pd.read_csv("results/combined/20230415_094354_scrap_results_combined.csv")
unique_category_list = scrap_result['ProductCategories_1'].unique()

# read openai prediction
predict_result = pd.read_csv("results/openai-predict/url_review_category.csv")

# create mapping
mapping_category = {}
for category in predict_result['ProductCategories_1_raw_prediction'].unique():
    # try several approaches
    result_list = [
        process.extractOne(category, unique_category_list),
        process.extractOne(category, unique_category_list, scorer=fuzz.token_sort_ratio)
    ]

    # get best match
    sim_category, sim_score = max(result_list, key=lambda x: x[1] if x is not None else 0)

    # save mapping
    mapping_category[category] = sim_category

# mapping
predict_result['ProductCategories_1_prediction'] = predict_result['ProductCategories_1_raw_prediction'].apply(lambda x: mapping_category[x])

# save to csv
predict_result.to_csv("results/openai-predict/url_review_category.csv", index=False)


  scrap_result = pd.read_csv("results/combined/20230415_094354_scrap_results_combined.csv")


Final result: merge the scrap result with prediction result

In [234]:
scrap = pd.read_csv("results/combined/20230415_094354_scrap_results_combined.csv")
predict_result = pd.read_csv("results/openai-predict/url_review_category.csv")
predict_result = predict_result.drop(columns=['Text'])
final_result = pd.merge(scrap, predict_result, how="left", on="ProductURL")
final_result.insert(
    loc=3,
    column='ProductMajorCategory',
    value=final_result['ProductCategories_1'].fillna(final_result['ProductCategories_1_prediction'])
)
final_result.to_csv("results/combined/scrap_final_result.csv", index=False)

  scrap = pd.read_csv("results/combined/20230415_094354_scrap_results_combined.csv")


Sanity check

In [4]:
final_result = pd.read_csv("results/combined/scrap_final_result.csv")
# no missing values
final_result[final_result['ProductMajorCategory'].isna()]

  final_result = pd.read_csv("results/combined/scrap_final_result.csv")


Unnamed: 0,ProductURL,ProductTitle,ProductBrand,ProductMajorCategory,ProductCategories_1,ProductCategories_2,ProductCategories_3,ProductCategories_4,ProductCategories_5,ProductCategories_6,ProductCategories_7,ProductCategories_8,ProductCategories_1_raw_prediction,ProductCategories_1_prediction
