## Review Correlation Analysis - LLM

In [None]:
!pip install transformers datasets scikit-learn pandas numpy matplotlib tqdm evaluate

In [None]:
!pip install numpy==1.21.5

In [None]:
pip install torch==2.2.0 torchvision==0.17.0 torchaudio==2.2.0 --index-url https://download.pytorch.org/whl/cu118

In [1]:
import transformers
from transformers import pipeline
from datasets import load_dataset
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix, classification_report
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
from tqdm.notebook import tqdm
import torch

import evaluate
import os
import json

In [2]:
!nvidia-smi

Thu Apr 11 05:19:52 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 545.23.08              Driver Version: 545.23.08    CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-80GB          On  | 00000000:01:00.0 Off |                    0 |
| N/A   37C    P0              72W / 500W |   6722MiB / 81920MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [3]:
classes = ["Books", "Music", "Video", "Toys", "Tools", "Office Products",
        "Electronics", "Kitchen", "Sports", "Shoes",
        "Health & Personal Care"]

id2label = {0: "Books", 1: "Music", 2: "Video", 3: "Toys", 4: "Tools", 
        5: "Office Products", 6: "Electronics", 7: "Kitchen", 8: "Sports",
        9: "Shoes", 10: "Health & Personal Care"}

label2id = {"Books": 0, "Music": 1, "Video":2, "Toys": 3, "Tools": 4, "Office Products": 5,
        "Electronics": 6, "Kitchen": 7, "Sports": 8, "Shoes": 9,
        "Health & Personal Care": 10}

In [4]:
# load test dataset
data_files = {
        "test": 'amazon_prod_review_cls_test.json'    # test json path
    }
    
dataset = load_dataset("json", data_files=data_files)

test_dataset = dataset['test']

### define LLM correlation anaylsis pipeline

In [5]:
import torch
import torch.nn.functional as F

def pipeline_corr_eval_llm(pipe, data, template, label2id):
    res_dict = {}
    correct = [0 for _ in range(11)]
    correct_top2 = [0 for _ in range(11)]
    correct_top3 = [0 for _ in range(11)]
    
    total = [0 for _ in range(11)]
    preds = []
    labels = []
    
    incorrect = []
    fp = [0 for _ in range(11)]
    fn = [0 for _ in range(11)]

    start_time = time.time()
    for i, row in tqdm(enumerate(data), total=len(data)):
        text = row['text']
        label = int(row['label'])
        prompt = template.format(text = text)
        result = pipe(prompt)[0]

        # Extracting prediction types from generated_text
        pred_types_raw = result['generated_text']
    
        # Keep only the part after ':' for each prediction type and join them
        pred_types_joined = ', '.join([pt.split(':', 1)[1].strip() if ':' in pt else pt for pt in pred_types_raw.split(', ')])

        # Now split the joined string by ','
        pred_types = pred_types_joined.split(', ')

        pred_ids = []
        
        # get rid of pred_type which is not in the label_space
        for pt in pred_types:
            try:
                pred_id = int(label2id[pt])
                pred_ids.append(pred_id)
            except KeyError:
                pred_ids.append(99)

        
        result['Id'] = row['Id']
        
        res_dict[i] = {'Id': row['Id'], 'pred':result['generated_text']}
        
        preds.append(pred_ids)
        labels.append(label)
        
        total[label] += 1
        # Check for correct predictions
        # Check for correct top 1 prediction
        if pred_ids and label == pred_ids[0]:
            correct[label] += 1

        # Check for correct top 2 predictions
        if len(pred_ids) > 1 and label in pred_ids[:2]:
            correct_top2[label] += 1

        # Check for correct top 3 predictions
        if len(pred_ids) > 2 and label in pred_ids[:3]:
            correct_top3[label] += 1
        
        # if label not in pred_ids[:3]:
        #     d = {"pred": [id2label[pred] for pred in list(correct_top3)], 
        #            "label": label, "review": text, "probability": str(probs)}
        #     incorrect.append(d)
        
        if label != pred_ids[0]:
            fp[pred_ids[0]] += 1
            fn[label] += 1
    
    end_time = time.time()
    runtime = end_time - start_time
    
    out_dict = {}
    out_dict['res_corr'] = res_dict
    out_dict['correct_corr'] = correct
    out_dict['correct_corr_top2'] = correct_top2
    out_dict['correct_corr_top3'] = correct_top3
    out_dict['total_corr'] = total
    out_dict['runtime_corr'] = runtime
    out_dict['preds'] = preds
    out_dict['labels'] = labels
    out_dict['incorrect'] = incorrect
    out_dict['fp'] = fp
    out_dict['fn'] = fn
    # return res_dict, correct, correct_top2, correct_top3, total, runtime, preds, labels, incorrect
    return out_dict




### zero-shot inference on FLAN-T5

In [6]:
# prompt
task_name = 'multi labels classification'
label_space = ["Books", "Music", "Video", "Toys", "Tools", "Office Products", "Electronics", "Kitchen", "Sports", "Shoes", "Health & Personal Care"]
task_definition = f'Select the three categories that best fit the content of the review from {label_space}.'
# output_format = "top1, top2, top3"

example_output = "Customer review: 'I loved this thriller novel! It kept me on the edge of my seat.'\nOutput: Books, Electronics, Sports"

template = f"Please perform {task_name} task.\n{task_definition}\nExample: \n{example_output}\n\nCustomer review:\n" + "{text}."

print(template)

Please perform multi labels classification task.
Select the three categories that best fit the content of the review from ['Books', 'Music', 'Video', 'Toys', 'Tools', 'Office Products', 'Electronics', 'Kitchen', 'Sports', 'Shoes', 'Health & Personal Care'].
Example: 
Customer review: 'I loved this thriller novel! It kept me on the edge of my seat.'
Output: Books, Electronics, Sports

Customer review:
{text}.


In [7]:
# Flan-T5
pipeFlanT5 = pipeline(
    "text2text-generation", 
    model="google/flan-t5-xxl", 
    # device='cuda'
)

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [9]:
import gc

# delete the pipeFlanT5 and free up some memory
del pipeFlanT5
#del pipeFlanUL2
gc.collect()
torch.cuda.empty_cache()

NameError: name 'pipeFlanT5' is not defined

In [8]:
indices_to_test = [101, 201, 301, 401, 501]

for index in range(53, 55):
    example = test_dataset[index]['text']
    actual_label_id = test_dataset[index]['label']
    # turn the label into the string according to id2label
    actual_label_name = id2label[actual_label_id]
    start_time = time.time()
    prompt = template.format(text=example)
    output = pipeFlanT5(prompt)
    end_time = time.time()

    print(f'Index: {index}')
    print(f'Input: {example}')
    print(f'Actual Label: {actual_label_name}')
    # print(f'Prompt: {prompt}')
    print(f'Output: {output}')
    print(f'Runtime: {end_time - start_time}\n')

Token indices sequence length is longer than the specified maximum sequence length for this model (574 > 512). Running this sequence through the model will result in indexing errors


Index: 53
Input: This is the remixes for Madonna's newest single \\"American Life\\" and  it vastly improves on the original version. The mixes range from hip hop to trance/electronica to flat out house music. The best mixes of the song are by Missy Elliott,Paul Oakenfold and Part 2 of the Peter Rauhofer American Anthem mix. The Missy Elliott American Dream mix(4:49) is typical Missy and that is saying a lot becuase she is so much a genius. The Paul Oakenfold Downtempo mix(6:32) is the best mix here. It gives Madonna the chance like \\"Justify My Love\\" and  \\"Erotica\\" to get more r&b/rap influenced than normally done. It is very different from his normal type of mix but it works. The Felix Da Housecat Devin Dazzle Mix(6:10) is crap. It tries to move the vocals to a more upbeat type flow and makes her sound weird. The Peter Rauhofer part 1 American Anthem mix(10:41) is electro and does the same vocally as the Felix Da Housecat Mix with a tad better results. His part 2 Mix(9:06) is 

In [11]:
if not os.path.exists("./FlanT5_eval_results_task2.json"):
    flant5_out = pipeline_corr_eval_llm(pipeFlanT5, test_dataset, template, label2id)
    with open("./FlanT5_eval_results_task2.json", "w") as f:
        json.dump(flant5_out,f)
    print('=> save results to flant5_eval_results_task2.json')
else:
    with open("./FlanT5_eval_results_task2.json", "r") as f:
        data = json.load(f)
        FlanT5_res_corr = data['res_corr']
        FlanT5_correct_corr = data['correct_corr']
        FlanT5_correct_corr_top2 = data['correct_corr_top2']
        FlanT5_correct_corr_top3 = data['correct_corr_top3']
        FlanT5_total_corr = data['total_corr']
        FlanT5_runtime_corr = data['runtime_corr']
        FlanT5_preds = data['preds']
        FlanT5_labels = data['labels']
        Flant5_incorrect = data['incorrect']
        Flant5_fp = data['fp']
        Flant5_fn = data['fn']
    print('=> load results from FlanT5_eval_results_task2.json')
    

=> load results from FlanT5_eval_results_task2.json


In [25]:
import numpy as np

correct_corr = flant5_out['correct_corr']  
fp = flant5_out['fp']  
fn = flant5_out['fn'] 

precision_per_class = {}
recall_per_class = {}

for category in range(11): 
    TP = correct_corr[category]
    FP = fp[category]
    FN = fn[category]
    
    precision = TP / (TP + FP)
    recall = TP / (TP + FN) 
    
    precision_per_class[category] = precision
    recall_per_class[category] = recall

for category in range(11):
    print(f"Category {category} - Precision: {precision_per_class[category]:.2f}, Recall: {recall_per_class[category]:.2f}")


Category 0 - Precision: 0.96, Recall: 1.00
Category 1 - Precision: 0.96, Recall: 0.98
Category 2 - Precision: 0.98, Recall: 1.00
Category 3 - Precision: 0.85, Recall: 0.92
Category 4 - Precision: 0.47, Recall: 0.82
Category 5 - Precision: 0.77, Recall: 0.54
Category 6 - Precision: 0.38, Recall: 0.82
Category 7 - Precision: 0.74, Recall: 0.40
Category 8 - Precision: 1.00, Recall: 0.12
Category 9 - Precision: 0.98, Recall: 0.98
Category 10 - Precision: 0.59, Recall: 0.32


In [12]:
# calculate the top-1 accuracy
for key,val in id2label.items():
    print(f'==> {val} Accuracy: {FlanT5_correct_corr[int(key)]/FlanT5_total_corr[int(key)]}')
print(f'==> Total Accuracy: {np.sum(FlanT5_correct_corr) / np.sum(FlanT5_total_corr)}')
print(f'==> Total Runtime: {FlanT5_runtime_corr}')

==> Books Accuracy: 1.0
==> Music Accuracy: 0.98
==> Video Accuracy: 1.0
==> Toys Accuracy: 0.92
==> Tools Accuracy: 0.82
==> Office Products Accuracy: 0.54
==> Electronics Accuracy: 0.82
==> Kitchen Accuracy: 0.4
==> Sports Accuracy: 0.12
==> Shoes Accuracy: 0.98
==> Health & Personal Care Accuracy: 0.32
==> Total Accuracy: 0.7181818181818181
==> Total Runtime: 7028.367094993591


In [11]:
# calculate the top-2 accuracy
for key,val in id2label.items():
    print(f'==> {val} Accuracy: {FlanT5_correct_corr_top2[int(key)]/FlanT5_total_corr[int(key)]}')
print(f'==> Total Accuracy: {np.sum(FlanT5_correct_corr_top2) / np.sum(FlanT5_total_corr)}')
print(f'==> Total Runtime: {FlanT5_runtime_corr}')

==> Books Accuracy: 0.96
==> Music Accuracy: 1.0
==> Video Accuracy: 0.92
==> Toys Accuracy: 0.98
==> Tools Accuracy: 1.0
==> Office Products Accuracy: 0.68
==> Electronics Accuracy: 0.96
==> Kitchen Accuracy: 0.78
==> Sports Accuracy: 0.42
==> Shoes Accuracy: 1.0
==> Health & Personal Care Accuracy: 0.64
==> Total Accuracy: 0.8490909090909091
==> Total Runtime: 212.081848859787


In [13]:
# calculate the top-3 accuracy
for key,val in id2label.items():
    print(f'==> {val} Accuracy: {FlanT5_correct_corr_top3[int(key)]/FlanT5_total_corr[int(key)]}')
print(f'==> Total Accuracy: {np.sum(FlanT5_correct_corr_top3) / np.sum(FlanT5_total_corr)}')
print(f'==> Total Runtime: {FlanT5_runtime_corr}')

==> Books Accuracy: 0.96
==> Music Accuracy: 1.0
==> Video Accuracy: 0.92
==> Toys Accuracy: 0.98
==> Tools Accuracy: 1.0
==> Office Products Accuracy: 0.84
==> Electronics Accuracy: 1.0
==> Kitchen Accuracy: 0.8
==> Sports Accuracy: 0.86
==> Shoes Accuracy: 1.0
==> Health & Personal Care Accuracy: 0.68
==> Total Accuracy: 0.9127272727272727
==> Total Runtime: 212.081848859787


## zero-shot inference on FLAN-UL2

In [7]:
# prompt
task_name = 'multi labels classification'
label_space = ["Books", "Music", "Video", "Toys", "Tools", "Office Products", "Electronics", "Kitchen", "Sports", "Shoes", "Health & Personal Care"]
task_definition = f'Select the three categories that best fit the content of the review from {label_space}.'
output_format = "Output format: first label, second label, third label"
example_output = "Customer review: 'I loved this thriller novel! It kept me on the edge of my seat.'\nOutput: Books, Electronics, Video"

template = f"Please perform {task_name} task.\n{task_definition}\n{output_format}\nExample: \n{example_output}\n\nCustomer review:\n" + "{text}."

print(template)

Please perform multi labels classification task.
Select the three categories that best fit the content of the review from ['Books', 'Music', 'Video', 'Toys', 'Tools', 'Office Products', 'Electronics', 'Kitchen', 'Sports', 'Shoes', 'Health & Personal Care'].
Output format: first label, second label, third label
Example: 
Customer review: 'I loved this thriller novel! It kept me on the edge of my seat.'
Output: Books, Electronics, Video

Customer review:
{text}.


In [8]:
# flan-UL2
pipeFlanUL2 = pipeline("text2text-generation", model="google/flan-ul2", device='cuda')

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [35]:
import gc

# delete the pipeFlanT5 and free up some memory
# del pipeFlanT5
# del pipeFlanUL2
gc.collect()
torch.cuda.empty_cache()

In [2]:
!nvidia-smi

Thu Apr 11 04:44:52 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 545.23.08              Driver Version: 545.23.08    CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-80GB          On  | 00000000:01:00.0 Off |                    0 |
| N/A   39C    P0              66W / 500W |      4MiB / 81920MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [9]:
indices_to_test = [101, 201, 301, 401, 501]

for index in indices_to_test:
    example = test_dataset[index]['text']
    actual_label_id = test_dataset[index]['label']
    # turn the label into the string according to id2label
    actual_label_name = id2label[actual_label_id]
    start_time = time.time()
    prompt = template.format(text=example)
    output = pipeFlanUL2(prompt)
    end_time = time.time()

    print(f'Index: {index}')
    print(f'Input: {example}')
    print(f'Actual Label: {actual_label_name}')
    # print(f'Prompt: {prompt}')
    print(f'Output: {output}')
    print(f'Runtime: {end_time - start_time}\n')



Index: 101
Input: Julie Andrews and Dick Van Dyke did outstanding jobs in<BR>this timeless Disney classic!Andrews is wonderful as the<BR>kind-hearted yet strict nanny,and Van Dyke's performance<BR>of Burt is very good;Van Dyke's Cockney accent is very well-<BR>performed and convincing!The scenery is colorful and very<BR>well-done,and the performers are superb!David Tomlisin is<BR>perfect as the no-nonsense banker George Banks,and Reginald<BR>Owen is wonderful as the elderly Admiral Boom!And who<BR>can forget songs like the touching&quot;Feed the Birds&quot;or the <BR>lively &quot;Step in Time&quot;?Young and old alike will be entertained<BR>by this film.A must-have for any Disney collection!
Actual Label: Video
Output: [{'generated_text': 'Output: Video, Video, Video'}]
Runtime: 2.044362783432007

Index: 201
Input: I am happy with this purchase. This would truly be a life-saver in a survival scenario. This is a decent multitool, good flashlight and firesteel.
Actual Label: Tools
Output

In [10]:
if not os.path.exists("./FlanUL2_eval_results_task2.json"):
    flanul2_out = pipeline_corr_eval_llm(pipeFlanUL2, test_dataset, template, label2id)
    with open("./FlanUL2_eval_results_task2.json", "w") as f:
        json.dump(flanul2_out,f)
    print('=> save results to FlanUL2_eval_results_task2.json')
else:
    with open("./FlanUL2_eval_results_task2.json", "r") as f:
        data = json.load(f)
        FlanUL2_res_corr = data['res_corr']
        FlanUL2_correct_corr = data['correct_corr']
        FlanUL2_correct_corr_top2 = data['correct_corr_top2']
        FlanUL2_correct_corr_top3 = data['correct_corr_top3']
        FlanUL2_total_corr = data['total_corr']
        FlanUL2_runtime_corr = data['runtime_corr']
        FlanUL2_preds = data['preds']
        FlanUL2_labels = data['labels']
        FlanUL2_incorrect = data['incorrect']
        FlanUL2_fp = data['fp']
        FlanUL2_fn = data['fn']
    print('=> load results from FlanUL2_eval_results_task2.json')


  0%|          | 0/550 [00:00<?, ?it/s]

--- Logging error ---
Traceback (most recent call last):
  File "/shared/centos7/anaconda3/2022.05/lib/python3.9/logging/__init__.py", line 1083, in emit
    msg = self.format(record)
  File "/shared/centos7/anaconda3/2022.05/lib/python3.9/logging/__init__.py", line 927, in format
    return fmt.format(record)
  File "/shared/centos7/anaconda3/2022.05/lib/python3.9/logging/__init__.py", line 663, in format
    record.message = record.getMessage()
  File "/shared/centos7/anaconda3/2022.05/lib/python3.9/logging/__init__.py", line 367, in getMessage
    msg = msg % self.args
TypeError: not all arguments converted during string formatting
Call stack:
  File "/shared/centos7/anaconda3/2022.05/lib/python3.9/runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/shared/centos7/anaconda3/2022.05/lib/python3.9/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/shared/centos7/anaconda3/2022.05/lib/python3.9/site-packages/ipykernel_l

=> save results to FlanUL2_eval_results_task2.json


In [11]:
import numpy as np

correct_corr = flanul2_out['correct_corr']  
fp = flanul2_out['fp']  
fn = flanul2_out['fn'] 

precision_per_class = {}
recall_per_class = {}

for category in range(11): 
    TP = correct_corr[category]
    FP = fp[category]
    FN = fn[category]
    
    precision = TP / (TP + FP)
    recall = TP / (TP + FN) 
    
    precision_per_class[category] = precision
    recall_per_class[category] = recall

for category in range(11):
    print(f"Category {category} - Precision: {precision_per_class[category]:.2f}, Recall: {recall_per_class[category]:.2f}")


Category 0 - Precision: 0.96, Recall: 1.00
Category 1 - Precision: 1.00, Recall: 0.98
Category 2 - Precision: 0.98, Recall: 1.00
Category 3 - Precision: 0.84, Recall: 0.92
Category 4 - Precision: 0.49, Recall: 0.90
Category 5 - Precision: 0.65, Recall: 0.56
Category 6 - Precision: 0.42, Recall: 0.90
Category 7 - Precision: 1.00, Recall: 0.32
Category 8 - Precision: 0.78, Recall: 0.14
Category 9 - Precision: 0.96, Recall: 0.98
Category 10 - Precision: 0.78, Recall: 0.42


In [13]:
# calculate the top-1 accuracy
for key,val in id2label.items():
    print(f'==> {val} Accuracy: {FlanUL2_correct_corr[int(key)]/FlanUL2_total_corr[int(key)]}')
print(f'==> Total Accuracy: {np.sum(FlanUL2_correct_corr) / np.sum(FlanUL2_total_corr)}')
print(f'==> Total Runtime: {FlanUL2_runtime_corr}')

==> Books Accuracy: 0.96
==> Music Accuracy: 1.0
==> Video Accuracy: 0.96
==> Toys Accuracy: 0.98
==> Tools Accuracy: 0.88
==> Office Products Accuracy: 0.56
==> Electronics Accuracy: 0.92
==> Kitchen Accuracy: 0.5
==> Sports Accuracy: 0.24
==> Shoes Accuracy: 1.0
==> Health & Personal Care Accuracy: 0.5
==> Total Accuracy: 0.7727272727272727
==> Total Runtime: 375.2274160385132


In [14]:
# calculate the top-2 accuracy
for key,val in id2label.items():
    print(f'==> {val} Accuracy: {FlanUL2_correct_corr_top2[int(key)]/FlanUL2_total_corr[int(key)]}')
print(f'==> Total Accuracy: {np.sum(FlanUL2_correct_corr_top2) / np.sum(FlanUL2_total_corr)}')
print(f'==> Total Runtime: {FlanUL2_runtime_corr}')

==> Books Accuracy: 0.96
==> Music Accuracy: 1.0
==> Video Accuracy: 0.98
==> Toys Accuracy: 0.98
==> Tools Accuracy: 0.94
==> Office Products Accuracy: 0.92
==> Electronics Accuracy: 0.96
==> Kitchen Accuracy: 0.62
==> Sports Accuracy: 0.58
==> Shoes Accuracy: 1.0
==> Health & Personal Care Accuracy: 0.62
==> Total Accuracy: 0.8690909090909091
==> Total Runtime: 375.2274160385132


In [15]:
# calculate the top-3 accuracy
for key,val in id2label.items():
    print(f'==> {val} Accuracy: {FlanUL2_correct_corr_top3[int(key)]/FlanUL2_total_corr[int(key)]}')
print(f'==> Total Accuracy: {np.sum(FlanUL2_correct_corr_top3) / np.sum(FlanUL2_total_corr)}')
print(f'==> Total Runtime: {FlanUL2_runtime_corr}')

==> Books Accuracy: 0.98
==> Music Accuracy: 1.0
==> Video Accuracy: 0.98
==> Toys Accuracy: 0.98
==> Tools Accuracy: 1.0
==> Office Products Accuracy: 0.96
==> Electronics Accuracy: 0.98
==> Kitchen Accuracy: 0.64
==> Sports Accuracy: 0.84
==> Shoes Accuracy: 1.0
==> Health & Personal Care Accuracy: 0.68
==> Total Accuracy: 0.9127272727272727
==> Total Runtime: 375.2274160385132


### Results
|Model|top-1 acc|top-2 acc|top-3 acc|
|-|-|-|-|
|Flan-T5|0.724|0.849|0.913|
|Flan-UL2|0.773|0.869|0.913|