In [1]:
import json
import numpy as np
from tqdm import tqdm
from FlagEmbedding import FlagReranker

import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Baseline reranker loaded from the non-fine-tuned bge-reranker-v2-m3 directory;
# the experiment cells below compare each fine-tuned checkpoint against it.
based_reranker = FlagReranker(
    '/project/lt200301-edubot/Capstone-TamTanai/models/bge-reranker-v2-m3',
    use_fp16=True,
)

----------using 4*GPUs----------


In [3]:
def run_evaluate(model, dataset):
    """Count the examples whose positive passage is ranked strictly first.

    For each example the query is paired with its first positive passage
    (``data['pos'][0]``) and with every passage in ``data['neg']``. All pairs
    of one example are scored by a single
    ``model.compute_score(pairs, normalize=True)`` call.

    An example counts as correct only when the positive score is strictly
    greater than every negative score. A plain ``np.argmax(scores) == 0``
    check would also accept ties (argmax returns the first maximum, and the
    positive always sits at index 0) and a NaN positive score, both of which
    inflate the reported accuracy.

    Args:
        model: Object exposing ``compute_score(pairs, normalize=True)`` that
            returns one score per input pair.
        dataset: Iterable of dicts with the keys ``'query'`` (str),
            ``'pos'`` (non-empty list of passages) and ``'neg'`` (list of
            passages).

    Returns:
        int: Number of examples in which the positive passage won. Divide by
        ``len(dataset)`` to get top-1 accuracy.
    """
    correct = 0
    for data in tqdm(dataset):
        query = data['query']
        pairs = [[query, data['pos'][0]]]
        pairs.extend([query, negative] for negative in data['neg'])

        raw_scores = model.compute_score(pairs, normalize=True)
        # atleast_1d: a scorer may hand back a bare scalar when given a single
        # pair (NOTE(review): confirm for the FlagEmbedding version in use).
        scores = np.atleast_1d(np.asarray(raw_scores, dtype=float))
        positive_score, negative_scores = scores[0], scores[1:]

        if negative_scores.size == 0:
            # Nothing to outrank: the positive is trivially top-1.
            correct += 1
        elif positive_score > np.max(negative_scores):
            # Strict comparison: ties and NaN scores are not counted as hits.
            correct += 1
    return correct

In [4]:
# Compare the base reranker with the checkpoint fine-tuned on the
# Negative={n_negative} dataset, on both the train and validation splits.
n_negative = 1

dataset_name = f'reranker_training_dataset_with_Negative={n_negative}_train.jsonl'
dataset_path = f'/project/lt200301-edubot/Capstone-TamTanai/reranker_training_dataset/{dataset_name}'

# JSON Lines: one JSON object per line. Decode explicitly as UTF-8 instead of
# relying on the locale default, and skip blank lines that json.loads rejects.
with open(dataset_path, encoding='utf-8') as file:
    train_dataset = [json.loads(line) for line in file if line.strip()]

dataset_name = f'reranker_training_dataset_with_Negative={n_negative}_validate.jsonl'
dataset_path = f'/project/lt200301-edubot/Capstone-TamTanai/reranker_training_dataset/{dataset_name}'

with open(dataset_path, encoding='utf-8') as file:
    validate_dataset = [json.loads(line) for line in file if line.strip()]

model_path = f'/project/lt200301-edubot/Capstone-TamTanai/models/bge-reranker-v2-m3-finetune-with_Negative={n_negative}'
reranker = FlagReranker(model_path, use_fp16=True)

based_train_correct = run_evaluate(based_reranker, train_dataset)
based_validate_correct = run_evaluate(based_reranker, validate_dataset)
finetuned_train_correct = run_evaluate(reranker, train_dataset)
finetuned_validate_correct = run_evaluate(reranker, validate_dataset)
# Unbind the fine-tuned model; presumably so it can be freed before the next
# experiment cell loads another checkpoint.
del reranker

print('Based model')
print('Training set')
print(f'Top 1 accuracy: {based_train_correct/len(train_dataset)*100:.02f}')
print('Validation set')
print(f'Top 1 accuracy: {based_validate_correct/len(validate_dataset)*100:.02f}')
print()
print('Finetuned model')
print('Training set')
print(f'Top 1 accuracy: {finetuned_train_correct/len(train_dataset)*100:.02f}')
print('Validation set')
print(f'Top 1 accuracy: {finetuned_validate_correct/len(validate_dataset)*100:.02f}')

----------using 4*GPUs----------


100%|██████████| 3296/3296 [02:30<00:00, 21.84it/s]
100%|██████████| 825/825 [00:37<00:00, 22.20it/s]
100%|██████████| 3296/3296 [02:27<00:00, 22.28it/s]
100%|██████████| 825/825 [00:37<00:00, 22.18it/s]


Based model
Training set
Top 1 accuracy: 98.67
Validation set
Top 1 accuracy: 98.55

Finetuned model
Training set
Top 1 accuracy: 100.00
Validation set
Top 1 accuracy: 100.00


In [4]:
# Compare the base reranker with the checkpoint fine-tuned on the
# Negative={n_negative} dataset, on both the train and validation splits.
n_negative = 5

dataset_name = f'reranker_training_dataset_with_Negative={n_negative}_train.jsonl'
dataset_path = f'/project/lt200301-edubot/Capstone-TamTanai/reranker_training_dataset/{dataset_name}'

# JSON Lines: one JSON object per line. Decode explicitly as UTF-8 instead of
# relying on the locale default, and skip blank lines that json.loads rejects.
with open(dataset_path, encoding='utf-8') as file:
    train_dataset = [json.loads(line) for line in file if line.strip()]

dataset_name = f'reranker_training_dataset_with_Negative={n_negative}_validate.jsonl'
dataset_path = f'/project/lt200301-edubot/Capstone-TamTanai/reranker_training_dataset/{dataset_name}'

with open(dataset_path, encoding='utf-8') as file:
    validate_dataset = [json.loads(line) for line in file if line.strip()]

model_path = f'/project/lt200301-edubot/Capstone-TamTanai/models/bge-reranker-v2-m3-finetune-with_Negative={n_negative}'
reranker = FlagReranker(model_path, use_fp16=True)

based_train_correct = run_evaluate(based_reranker, train_dataset)
based_validate_correct = run_evaluate(based_reranker, validate_dataset)
finetuned_train_correct = run_evaluate(reranker, train_dataset)
finetuned_validate_correct = run_evaluate(reranker, validate_dataset)
# Unbind the fine-tuned model; presumably so it can be freed before the next
# experiment cell loads another checkpoint.
del reranker

print('Based model')
print('Training set')
print(f'Top 1 accuracy: {based_train_correct/len(train_dataset)*100:.02f}')
print('Validation set')
print(f'Top 1 accuracy: {based_validate_correct/len(validate_dataset)*100:.02f}')
print()
print('Finetuned model')
print('Training set')
print(f'Top 1 accuracy: {finetuned_train_correct/len(train_dataset)*100:.02f}')
print('Validation set')
print(f'Top 1 accuracy: {finetuned_validate_correct/len(validate_dataset)*100:.02f}')

----------using 4*GPUs----------


100%|██████████| 3296/3296 [03:51<00:00, 14.25it/s]
100%|██████████| 825/825 [00:57<00:00, 14.46it/s]
100%|██████████| 3296/3296 [03:51<00:00, 14.22it/s]
100%|██████████| 825/825 [00:57<00:00, 14.45it/s]


Based model
Training set
Top 1 accuracy: 96.94
Validation set
Top 1 accuracy: 95.64

Finetuned model
Training set
Top 1 accuracy: 99.97
Validation set
Top 1 accuracy: 99.88


In [5]:
# Compare the base reranker with the checkpoint fine-tuned on the
# Negative={n_negative} dataset, on both the train and validation splits.
n_negative = 10

dataset_name = f'reranker_training_dataset_with_Negative={n_negative}_train.jsonl'
dataset_path = f'/project/lt200301-edubot/Capstone-TamTanai/reranker_training_dataset/{dataset_name}'

# JSON Lines: one JSON object per line. Decode explicitly as UTF-8 instead of
# relying on the locale default, and skip blank lines that json.loads rejects.
with open(dataset_path, encoding='utf-8') as file:
    train_dataset = [json.loads(line) for line in file if line.strip()]

dataset_name = f'reranker_training_dataset_with_Negative={n_negative}_validate.jsonl'
dataset_path = f'/project/lt200301-edubot/Capstone-TamTanai/reranker_training_dataset/{dataset_name}'

with open(dataset_path, encoding='utf-8') as file:
    validate_dataset = [json.loads(line) for line in file if line.strip()]

model_path = f'/project/lt200301-edubot/Capstone-TamTanai/models/bge-reranker-v2-m3-finetune-with_Negative={n_negative}'
reranker = FlagReranker(model_path, use_fp16=True)

based_train_correct = run_evaluate(based_reranker, train_dataset)
based_validate_correct = run_evaluate(based_reranker, validate_dataset)
finetuned_train_correct = run_evaluate(reranker, train_dataset)
finetuned_validate_correct = run_evaluate(reranker, validate_dataset)
# Unbind the fine-tuned model; presumably so it can be freed before the next
# experiment cell loads another checkpoint.
del reranker

print('Based model')
print('Training set')
print(f'Top 1 accuracy: {based_train_correct/len(train_dataset)*100:.02f}')
print('Validation set')
print(f'Top 1 accuracy: {based_validate_correct/len(validate_dataset)*100:.02f}')
print()
print('Finetuned model')
print('Training set')
print(f'Top 1 accuracy: {finetuned_train_correct/len(train_dataset)*100:.02f}')
print('Validation set')
print(f'Top 1 accuracy: {finetuned_validate_correct/len(validate_dataset)*100:.02f}')

----------using 4*GPUs----------


100%|██████████| 3296/3296 [05:32<00:00,  9.93it/s]
100%|██████████| 825/825 [01:22<00:00,  9.95it/s]
100%|██████████| 3296/3296 [05:29<00:00, 10.00it/s]
100%|██████████| 825/825 [01:22<00:00,  9.95it/s]


Based model
Training set
Top 1 accuracy: 95.33
Validation set
Top 1 accuracy: 94.79

Finetuned model
Training set
Top 1 accuracy: 99.88
Validation set
Top 1 accuracy: 99.76


# Similar

## 5

In [10]:
# Compare the base reranker with the checkpoint fine-tuned on the
# similar={n_negative} dataset, on both the train and validation splits.
n_negative = 5

dataset_name = f'reranker_training_dataset_with_similar={n_negative}_train.jsonl'
dataset_path = f'/project/lt200301-edubot/Capstone-TamTanai/reranker_training_dataset/{dataset_name}'

# JSON Lines: one JSON object per line. Decode explicitly as UTF-8 instead of
# relying on the locale default, and skip blank lines that json.loads rejects.
with open(dataset_path, encoding='utf-8') as file:
    train_dataset = [json.loads(line) for line in file if line.strip()]

dataset_name = f'reranker_training_dataset_with_similar={n_negative}_validate.jsonl'
dataset_path = f'/project/lt200301-edubot/Capstone-TamTanai/reranker_training_dataset/{dataset_name}'

with open(dataset_path, encoding='utf-8') as file:
    validate_dataset = [json.loads(line) for line in file if line.strip()]

model_path = f'/project/lt200301-edubot/Capstone-TamTanai/models/bge-reranker-v2-m3-finetune-with_similar={n_negative}'
reranker = FlagReranker(model_path, use_fp16=True)

based_train_correct = run_evaluate(based_reranker, train_dataset)
based_validate_correct = run_evaluate(based_reranker, validate_dataset)
finetuned_train_correct = run_evaluate(reranker, train_dataset)
finetuned_validate_correct = run_evaluate(reranker, validate_dataset)
# Unbind the fine-tuned model; presumably so it can be freed before the next
# experiment cell loads another checkpoint.
del reranker

print('Based model')
print('Training set')
print(f'Top 1 accuracy: {based_train_correct/len(train_dataset)*100:.02f}')
print('Validation set')
print(f'Top 1 accuracy: {based_validate_correct/len(validate_dataset)*100:.02f}')
print()
print('Finetuned model')
print('Training set')
print(f'Top 1 accuracy: {finetuned_train_correct/len(train_dataset)*100:.02f}')
print('Validation set')
print(f'Top 1 accuracy: {finetuned_validate_correct/len(validate_dataset)*100:.02f}')

----------using 4*GPUs----------


100%|██████████| 3296/3296 [04:11<00:00, 13.12it/s]
100%|██████████| 825/825 [01:00<00:00, 13.67it/s]
100%|██████████| 3296/3296 [04:34<00:00, 12.01it/s]
100%|██████████| 825/825 [01:10<00:00, 11.76it/s]


Based model
Training set
Top 1 accuracy: 70.57
Validation set
Top 1 accuracy: 73.45

Finetuned model
Training set
Top 1 accuracy: 99.76
Validation set
Top 1 accuracy: 89.21


### Other validation set (similar=5 model evaluated on the Negative=10 data)

In [12]:
# Cross-evaluation: score the checkpoint fine-tuned on the similar={n_negative}
# dataset against the Negative=10 train/validation splits. `n_negative` only
# selects the model here; the dataset file names are fixed to Negative=10.
n_negative = 5

dataset_name = 'reranker_training_dataset_with_Negative=10_train.jsonl'
dataset_path = f'/project/lt200301-edubot/Capstone-TamTanai/reranker_training_dataset/{dataset_name}'

# JSON Lines: one JSON object per line. Decode explicitly as UTF-8 instead of
# relying on the locale default, and skip blank lines that json.loads rejects.
with open(dataset_path, encoding='utf-8') as file:
    train_dataset = [json.loads(line) for line in file if line.strip()]

dataset_name = 'reranker_training_dataset_with_Negative=10_validate.jsonl'
dataset_path = f'/project/lt200301-edubot/Capstone-TamTanai/reranker_training_dataset/{dataset_name}'

with open(dataset_path, encoding='utf-8') as file:
    validate_dataset = [json.loads(line) for line in file if line.strip()]

model_path = f'/project/lt200301-edubot/Capstone-TamTanai/models/bge-reranker-v2-m3-finetune-with_similar={n_negative}'
reranker = FlagReranker(model_path, use_fp16=True)

finetuned_train_correct = run_evaluate(reranker, train_dataset)
finetuned_validate_correct = run_evaluate(reranker, validate_dataset)
# Unbind the fine-tuned model; presumably so it can be freed before the next
# experiment cell loads another checkpoint.
del reranker

print('Finetuned model')
print('Training set')
print(f'Top 1 accuracy: {finetuned_train_correct/len(train_dataset)*100:.02f}')
print('Validation set')
print(f'Top 1 accuracy: {finetuned_validate_correct/len(validate_dataset)*100:.02f}')

----------using 4*GPUs----------


100%|██████████| 3296/3296 [05:34<00:00,  9.85it/s]
100%|██████████| 825/825 [01:23<00:00,  9.91it/s]


Finetuned model
Training set
Top 1 accuracy: 98.36
Validation set
Top 1 accuracy: 93.70


# Similar + Keyword

## 5

In [5]:
# Compare the base reranker with the checkpoint fine-tuned on the
# similar={n_negative}_keyword={n_negative} dataset, on both the train and
# validation splits.
n_negative = 5

dataset_name = f'reranker_training_dataset_with_similar={n_negative}_keyword={n_negative}_train.jsonl'
dataset_path = f'/project/lt200301-edubot/Capstone-TamTanai/reranker_training_dataset/{dataset_name}'

# JSON Lines: one JSON object per line. Decode explicitly as UTF-8 instead of
# relying on the locale default, and skip blank lines that json.loads rejects.
with open(dataset_path, encoding='utf-8') as file:
    train_dataset = [json.loads(line) for line in file if line.strip()]

dataset_name = f'reranker_training_dataset_with_similar={n_negative}_keyword={n_negative}_validate.jsonl'
dataset_path = f'/project/lt200301-edubot/Capstone-TamTanai/reranker_training_dataset/{dataset_name}'

with open(dataset_path, encoding='utf-8') as file:
    validate_dataset = [json.loads(line) for line in file if line.strip()]

model_path = f'/project/lt200301-edubot/Capstone-TamTanai/models/bge-reranker-v2-m3-finetune-with_similar={n_negative}_keyword={n_negative}'
reranker = FlagReranker(model_path, use_fp16=True)

based_train_correct = run_evaluate(based_reranker, train_dataset)
based_validate_correct = run_evaluate(based_reranker, validate_dataset)
finetuned_train_correct = run_evaluate(reranker, train_dataset)
finetuned_validate_correct = run_evaluate(reranker, validate_dataset)
# Unbind the fine-tuned model; presumably so it can be freed before the next
# experiment cell loads another checkpoint.
del reranker

print('Based model')
print('Training set')
print(f'Top 1 accuracy: {based_train_correct/len(train_dataset)*100:.02f}')
print('Validation set')
print(f'Top 1 accuracy: {based_validate_correct/len(validate_dataset)*100:.02f}')
print()
print('Finetuned model')
print('Training set')
print(f'Top 1 accuracy: {finetuned_train_correct/len(train_dataset)*100:.02f}')
print('Validation set')
print(f'Top 1 accuracy: {finetuned_validate_correct/len(validate_dataset)*100:.02f}')

----------using 4*GPUs----------


100%|██████████| 6513/6513 [08:10<00:00, 13.29it/s]
100%|██████████| 1620/1620 [01:59<00:00, 13.59it/s]
100%|██████████| 6513/6513 [07:54<00:00, 13.71it/s]
100%|██████████| 1620/1620 [01:58<00:00, 13.67it/s]


Based model
Training set
Top 1 accuracy: 80.38
Validation set
Top 1 accuracy: 81.98

Finetuned model
Training set
Top 1 accuracy: 99.92
Validation set
Top 1 accuracy: 92.22


## Reranking result

In [4]:
# Compare the base reranker with the checkpoint fine-tuned on the
# similar={n_negative}_keyword={n_negative}_2nd dataset, on both the train and
# validation splits.
n_negative = 5

dataset_name = f'reranker_training_dataset_with_similar={n_negative}_keyword={n_negative}_2nd_train.jsonl'
dataset_path = f'/project/lt200301-edubot/Capstone-TamTanai/reranker_training_dataset/{dataset_name}'

# JSON Lines: one JSON object per line. Decode explicitly as UTF-8 instead of
# relying on the locale default, and skip blank lines that json.loads rejects.
with open(dataset_path, encoding='utf-8') as file:
    train_dataset = [json.loads(line) for line in file if line.strip()]

dataset_name = f'reranker_training_dataset_with_similar={n_negative}_keyword={n_negative}_2nd_validate.jsonl'
dataset_path = f'/project/lt200301-edubot/Capstone-TamTanai/reranker_training_dataset/{dataset_name}'

with open(dataset_path, encoding='utf-8') as file:
    validate_dataset = [json.loads(line) for line in file if line.strip()]

model_path = f'/project/lt200301-edubot/Capstone-TamTanai/models/bge-reranker-v2-m3-finetune-with_similar={n_negative}_keyword={n_negative}_2nd'
reranker = FlagReranker(model_path, use_fp16=True)

based_train_correct = run_evaluate(based_reranker, train_dataset)
based_validate_correct = run_evaluate(based_reranker, validate_dataset)
finetuned_train_correct = run_evaluate(reranker, train_dataset)
finetuned_validate_correct = run_evaluate(reranker, validate_dataset)
# Unbind the fine-tuned model; presumably so it can be freed before the next
# experiment cell loads another checkpoint.
del reranker

print('Based model')
print('Training set')
print(f'Top 1 accuracy: {based_train_correct/len(train_dataset)*100:.02f}')
print('Validation set')
print(f'Top 1 accuracy: {based_validate_correct/len(validate_dataset)*100:.02f}')
print()
print('Finetuned model')
print('Training set')
print(f'Top 1 accuracy: {finetuned_train_correct/len(train_dataset)*100:.02f}')
print('Validation set')
print(f'Top 1 accuracy: {finetuned_validate_correct/len(validate_dataset)*100:.02f}')

----------using 4*GPUs----------


100%|██████████| 9809/9809 [12:31<00:00, 13.06it/s]
100%|██████████| 2445/2445 [02:54<00:00, 13.97it/s]
100%|██████████| 9809/9809 [11:42<00:00, 13.97it/s]
100%|██████████| 2445/2445 [02:55<00:00, 13.93it/s]


Based model
Training set
Top 1 accuracy: 77.64
Validation set
Top 1 accuracy: 79.30

Finetuned model
Training set
Top 1 accuracy: 99.93
Validation set
Top 1 accuracy: 91.37
