In [1]:
from datasets import load_dataset, Value
import transformers
from transformers import AutoTokenizer, AutoModel
import torch
import argparse
import json
from tqdm import tqdm
import os
import numpy as np
import faiss
from multiprocessing import Process, Queue

In [9]:
def mean_pooling(token_embeddings, mask):
    """Average token embeddings over dim 1, ignoring masked-out positions.

    Args:
        token_embeddings: Tensor whose dim 1 is the token axis that gets reduced.
        mask: Tensor aligned with the leading dims of `token_embeddings`;
            non-zero entries mark the tokens to keep.

    Returns:
        Tensor with the token axis reduced to the mean of the kept token
        vectors. A row whose mask is entirely zero yields an all-zero vector
        instead of NaN.
    """
    # Zero out ignored positions so they contribute nothing to the sum.
    masked = token_embeddings.masked_fill(~mask[..., None].bool(), 0.)
    # Clamp the count to >= 1: an all-zero mask row would otherwise compute 0 / 0.
    token_counts = mask.sum(dim=1).clamp(min=1)[..., None]
    return masked.sum(dim=1) / token_counts

def encode(text, model, tokenizer):
    """Embed `text` with `model` and return the mean-pooled vectors as a NumPy array.

    Args:
        text: Input forwarded to `tokenizer` (padding and truncation enabled).
        model: Callable model exposing `.device`; its output must provide
            `last_hidden_state`.
        tokenizer: Callable returning a batch with an `attention_mask` entry and
            a `.to(device)` method.

    Returns:
        numpy.ndarray holding the mean-pooled embeddings on the CPU.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(model.device)
    # Inference only: skip building the autograd graph, which would just be discarded.
    with torch.no_grad():
        outputs = model(**inputs)
        pooled = mean_pooling(outputs.last_hidden_state, inputs['attention_mask'])
    return pooled.detach().cpu().numpy()

In [2]:
# Restrict this process to GPUs 0-3 (comma-separated device ids).
os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(str(gpu) for gpu in range(4))

In [3]:
# Download the `corpus` split of BeIR/nq and export it as JSON Lines so the
# indexing cells can reload it from disk.
nq_corpus = load_dataset('BeIR/nq','corpus')['corpus']
# Make sure the output directory exists so the export does not fail on a fresh checkout.
os.makedirs('data', exist_ok=True)
nq_corpus.to_json('data/nq_corpus.jsonl', orient="records", lines=True)

Creating json from Arrow format:   0%|          | 0/2682 [00:00<?, ?ba/s]

1447661927

In [4]:
# Reload the exported corpus from its JSON Lines file.
dataset = load_dataset('json', data_files='data/nq_corpus.jsonl')["train"]
# Passages with a missing text become empty strings so every row can still be tokenized.
data = ['' if passage is None else passage for passage in dataset['text']]

Generating train split: 0 examples [00:00, ? examples/s]

In [5]:
def process_chunk(gpu_id, chunk, tokenizer_path, model_path, queue, batch_size):
    """Embed one shard of texts on a single GPU and hand back a local FAISS index.

    Args:
        gpu_id: CUDA device ordinal used for this worker; also tags the result.
        chunk: Sliceable sequence of texts to embed.
        tokenizer_path: Name or path passed to `AutoTokenizer.from_pretrained`.
        model_path: Name or path passed to `AutoModel.from_pretrained`.
        queue: Queue that receives the `(gpu_id, index)` result tuple.
        batch_size: Number of texts tokenized and encoded per forward pass.
    """
    # Pin this worker to its GPU.
    torch.cuda.set_device(gpu_id)
    device = f'cuda:{gpu_id}'

    # Each worker loads its own tokenizer and model copy onto its device.
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    model = AutoModel.from_pretrained(model_path).to(device)
    model.eval()

    # Inner-product index local to this worker; vectors are added in chunk order.
    local_index = faiss.IndexFlatIP(768)
    batch_starts = range(0, len(chunk), batch_size)

    with torch.no_grad():
        for start in tqdm(batch_starts, desc=f"GPU {gpu_id}"):
            texts = chunk[start:start + batch_size]
            encoded = tokenizer(texts, return_tensors="pt", padding=True, truncation=True).to(device)
            hidden_states = model(**encoded).last_hidden_state
            pooled = mean_pooling(hidden_states, encoded['attention_mask'])
            local_index.add(np.array(pooled.cpu()))

    # Tag the index with the GPU id so the consumer can restore shard order.
    queue.put((gpu_id, local_index))

def merge_indexes(index_list, save_path):
    """Merge FAISS indexes, in the given order, into one flat inner-product index and save it.

    Args:
        index_list: Iterable of FAISS indexes to merge; each is passed to
            `merge_from` in iteration order.
        save_path: Destination file path for the merged index.
    """
    index_list = list(index_list)
    # Use the shards' own dimensionality instead of assuming 768, so indexes
    # built with a different encoder can be merged. Fall back to 768 when there
    # is nothing to merge, matching the previous behaviour.
    dim = index_list[0].d if index_list else 768
    merged_index = faiss.IndexFlatIP(dim)
    for idx in index_list:
        merged_index.merge_from(idx)
    faiss.write_index(merged_index, save_path)

In [6]:

# Build the NQ corpus index: embed one contiguous shard of `data` per GPU in its
# own process, then merge the per-GPU FAISS indexes back in corpus order.
dataset_path = 'data/nq_corpus.jsonl'
tokenizer_path = 'facebook/contriever'
model_path = 'facebook/contriever'
save_path = 'data/nq_corpus.index'
batch_size = 32
num_gpus = torch.cuda.device_count()
num_processes = num_gpus
if num_processes < 1:
    raise RuntimeError("No CUDA device is visible; at least one GPU is required to build the index.")

# Split data into contiguous chunks. Ceiling division yields at most
# `num_processes` chunks, so no trailing remainder is silently dropped when
# len(data) is not a multiple of the GPU count (floor division would create an
# extra, unprocessed chunk). max(1, ...) keeps the range step valid for empty data.
chunk_size = max(1, -(-len(data) // num_processes))
chunks = [data[i:i + chunk_size] for i in range(0, len(data), chunk_size)]

# Multiprocessing setup
processes = []
queue = Queue()

for gpu_id, chunk in enumerate(chunks):
    p = Process(target=process_chunk, args=(gpu_id, chunk, tokenizer_path, model_path, queue,batch_size))
    processes.append(p)
    p.start()

# Collect results before joining: each worker puts exactly one (gpu_id, index)
# tuple, and draining the queue first avoids waiting on a child that is still
# flushing its result.
index_list = [queue.get() for _ in processes]

# Wait for all processes to finish
for p in processes:
    p.join()

# Workers finish in arbitrary order; sort by gpu_id so the merged index rows
# follow the order of `data`.
a = [shard_index for _, shard_index in sorted(index_list, key=lambda pair: pair[0])]
merge_indexes(a, save_path)
print(f"Index saved at {save_path}")

GPU 3: 100%|██████████| 20949/20949 [44:30<00:00,  7.84it/s] 
GPU 1: 100%|██████████| 20949/20949 [45:22<00:00,  7.69it/s]
GPU 2: 100%|██████████| 20949/20949 [46:12<00:00,  7.56it/s]
GPU 0: 100%|██████████| 20949/20949 [47:40<00:00,  7.32it/s]


TypeError: sorted expected 1 argument, got 2

In [7]:

# Order the collected (gpu_id, index) pairs by gpu_id, keep only the indexes,
# then merge them and write the result to disk.
a = [shard_index for _, shard_index in sorted(index_list, key=lambda pair: pair[0])]
merge_indexes(a, save_path)
print(f"Index saved at {save_path}")

Index saved at data/nq_corpus.index


In [2]:
# FiQA queries and their relevance judgments.
dataset = load_dataset('BeIR/fiqa', 'queries')['queries']
dataset_qrels = load_dataset('BeIR/fiqa-qrels')['train']
# Cast both id columns to strings so they can be compared with the string
# `_id` values of the query and corpus records.
for id_column in ('query-id', 'corpus-id'):
    dataset_qrels = dataset_qrels.cast_column(id_column, Value('string'))

In [3]:
# Alternative source: a local JSON Lines export instead of the Hub dataset.
# dataset_corpus = load_dataset('json', data_files='data/fiqa_corpus.jsonl')["train"]
dataset_corpus = load_dataset('BeIR/fiqa', 'corpus')['corpus']
# Map each passage `_id` to its text for direct lookup.
corpus_dict = dict(zip(dataset_corpus['_id'], dataset_corpus['text']))

In [4]:
# Group the judged corpus ids by query id: {query-id: [corpus-id, ...]}.
qrels_dict = {}
qrels_passage_dict = {}
for row in dataset_qrels:
    qrels_dict.setdefault(row['query-id'], []).append(row['corpus-id'])

In [29]:
# Load the FAISS index from disk; `search` below reads this module-level `index`.
# NOTE(review): presumably built from the same corpus, in the same row order, as
# `dataset_corpus` — confirm, since `search` maps index rows to corpus rows by position.
index = faiss.read_index('data/fiqa_corpus.index')

def search(query_vectors, query_id, top_k):
    """Retrieve up to `top_k` corpus passages for the first query vector.

    Reads the module-level `index`, `dataset_corpus` and `qrels_dict`.

    Args:
        query_vectors: Query embeddings passed to `index.search`; only the
            results for row 0 are returned.
        query_id: Key into `qrels_dict` used to flag relevant hits. A query
            with no judgments is treated as having no relevant passages.
        top_k: Number of neighbours requested from the index.

    Returns:
        List of `(corpus_id, is_relevant, text, score)` tuples in the order
        returned by the index.
    """
    scores, indices = index.search(query_vectors, top_k)
    # Resolve the relevant ids once; a set gives constant-time membership tests
    # and `.get` avoids a KeyError for queries without judgments.
    relevant_ids = set(qrels_dict.get(query_id, ()))
    ret = []
    for s, i in zip(scores[0], indices[0]):
        row = int(i)
        # FAISS pads the labels with -1 when fewer than top_k neighbours exist;
        # indexing the corpus with -1 would silently return its last row.
        if row < 0:
            continue
        t = dataset_corpus[row]
        ret.append((t['_id'], t['_id'] in relevant_ids, t['text'], s))
    return ret

In [6]:
# Restrict this session to GPU 0, then load the Contriever tokenizer and encoder onto it.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
retriever_name = 'facebook/contriever'
tokenizer = AutoTokenizer.from_pretrained(retriever_name)
model = AutoModel.from_pretrained(retriever_name).to('cuda')



In [52]:
# Peek at one query record to see which fields are available.
print(dataset[10])

{'_id': '16', 'title': '', 'text': 'Business Investment Loss from prior year'}


In [7]:
res = faiss.StandardGpuResources()  # Initialize GPU resources
# NOTE(review): this rebinds `index` to the result of converting itself, so the
# cell is not idempotent — running it again passes the already-converted index
# back into index_cpu_to_gpu.
index = faiss.index_cpu_to_gpu(res, 0, index)  # Move to GPU 0

In [47]:
# Weighted multi-query retrieval for one FiQA question: every (query, weight)
# pair in `tnw` scores the whole corpus, the weighted scores are summed per
# passage, and the five best passages are printed with their relevance label.
idx = 1
query_id = dataset[idx]['_id']  # named `query_id` so the builtin `id` is not shadowed
text = dataset[idx]['text']

print('[Original Question]',text)
# text = 'Taxation rules for online drop-shipping business in India and income tax requirements'
tnw = [(text,1.0)]
# tnw = [('definition of business expense',1.0),('business trip',0.7),('expense deduction',-0.3)]
for i, (query_text, weight) in enumerate(tnw):
    print(f'[Query {i}][Weight : {weight}]',query_text)
# `.get` keeps the cell usable for questions that have no judged passages.
for gold_id in qrels_dict.get(query_id, []):
    print('[GOLD]',corpus_dict[gold_id])
print()

rss = {}   # passage id -> [(score, text, is_relevant), ...], one entry per query that returned it
rss2 = {}  # passage id -> weighted sum of scores
# Inference only: no autograd graph is needed for the query encoder.
with torch.no_grad():
    for query_text, weight in tnw:
        inputs = tokenizer(query_text, return_tensors="pt", padding=True, truncation=True).to('cuda')
        outputs = model(**inputs)
        embeddings = mean_pooling(outputs.last_hidden_state, inputs['attention_mask'])
        rs = search(embeddings.detach().to('cpu'),query_id,index.ntotal)
        for doc_id, is_relevant, passage, score in rs:
            rss.setdefault(doc_id, []).append((score, passage, is_relevant))
            # Apply the weight of the query that produced this score. Matching
            # weights to scores by list position afterwards is only correct
            # when every query returns every passage.
            rss2[doc_id] = rss2.get(doc_id, 0) + score * weight

sorted_keys = sorted(rss2.keys(), key=lambda k: -rss2[k])

for t in sorted_keys[:5]:
    print(('[RELEVANT] ' if rss[t][0][2] else '[NON-RELEVANT] ')+rss[t][0][1])


[Original Question] Business Expense - Car Insurance Deductible For Accident That Occurred During a Business Trip
[Query 0][Weight : 1.0] Business Expense - Car Insurance Deductible For Accident That Occurred During a Business Trip
[GOLD] As a general rule, you must choose between a mileage deduction or an actual expenses deduction.  The idea is that the mileage deduction is supposed to cover all costs of using the car.  Exceptions include parking fees and tolls, which can be deducted separately under either method.  You explicitly cannot deduct insurance costs if you claim a mileage deduction.   Separately, you probably won't be able to deduct the deductible for your car as a casualty loss.  You first subtract $100 from the deductible and then divide it by your Adjusted Gross Income (AGI) from your tax return.  If your deductible is over 10% of your AGI, you can deduct it.   Note that even with a $1500 deductible, you won't be able to deduct anything if you made more than $14,000 for 

In [81]:
# List every query id that has at least one relevance judgment.
# NOTE(review): this prints the full key view; consider a bounded preview for large qrels.
print(qrels_dict.keys())

dict_keys([0, 4, 5, 6, 7, 9, 11, 12, 13, 14, 16, 19, 20, 21, 23, 25, 27, 28, 30, 31, 32, 33, 35, 36, 37, 41, 43, 45, 46, 47, 48, 49, 51, 52, 53, 54, 57, 59, 61, 62, 63, 64, 65, 66, 67, 69, 70, 71, 72, 74, 75, 76, 78, 80, 81, 82, 83, 85, 86, 87, 88, 91, 92, 93, 95, 96, 97, 99, 102, 105, 107, 108, 110, 111, 112, 439, 442, 443, 444, 445, 446, 447, 448, 449, 450, 452, 453, 454, 456, 457, 458, 460, 461, 462, 464, 465, 466, 467, 468, 470, 471, 472, 473, 474, 478, 479, 480, 481, 482, 485, 486, 487, 488, 489, 490, 491, 493, 494, 497, 498, 499, 500, 501, 502, 506, 507, 508, 509, 510, 511, 512, 513, 517, 518, 520, 521, 522, 523, 524, 525, 526, 527, 528, 531, 532, 533, 535, 536, 537, 538, 539, 540, 541, 542, 544, 545, 546, 550, 552, 553, 554, 555, 556, 557, 558, 562, 563, 564, 565, 567, 568, 569, 571, 572, 573, 574, 575, 576, 577, 578, 579, 581, 582, 583, 587, 589, 590, 591, 592, 593, 595, 596, 597, 598, 601, 602, 605, 606, 607, 608, 609, 610, 613, 614, 615, 617, 619, 621, 623, 624, 625, 626, 627

In [40]:
# Weighted multi-query retrieval with hand-written reformulations of one FiQA
# question: each (query, weight) pair scores the whole corpus, weighted scores
# are summed per passage, and the five best passages are printed.
idx = 70
query_id = dataset[idx]['_id']  # named `query_id` so the builtin `id` is not shadowed
text = dataset[idx]['text']

print('[Original Question]',text)
# text = 'Taxation rules for online drop-shipping business in India and income tax requirements'
# Baseline (original question only): tnw = [(text,1.0)]
tnw = [("Income tax for eCommerce and drop-shipping businesses in India", 1.0), ("Taxation rules for online drop-shipping in India", 0.9), ("How are eCommerce and drop-shipping profits taxed in India", 0.8)]
for i, (query_text, weight) in enumerate(tnw):
    print(f'[Query {i}][Weight : {weight}]',query_text)
print()

rss = {}   # passage id -> [(score, text, is_relevant), ...], one entry per query that returned it
rss2 = {}  # passage id -> weighted sum of scores
# Inference only: no autograd graph is needed for the query encoder.
with torch.no_grad():
    for query_text, weight in tnw:
        inputs = tokenizer(query_text, return_tensors="pt", padding=True, truncation=True).to('cuda')
        outputs = model(**inputs)
        embeddings = mean_pooling(outputs.last_hidden_state, inputs['attention_mask'])
        rs = search(embeddings.detach().to('cpu'),query_id,index.ntotal)
        for doc_id, is_relevant, passage, score in rs:
            rss.setdefault(doc_id, []).append((score, passage, is_relevant))
            # Apply the weight of the query that produced this score. Matching
            # weights to scores by list position afterwards is only correct
            # when every query returns every passage.
            rss2[doc_id] = rss2.get(doc_id, 0) + score * weight

sorted_keys = sorted(rss2.keys(), key=lambda k: -rss2[k])

print('[Retrieved Passages]')
for i, t in enumerate(sorted_keys[:5]):
    print(f'<Passage {i}> '+rss[t][0][1])

[Original Question] Income tax on my online drop-shipping business (India)
[Query 0][Weight : 1.0] Income tax for eCommerce and drop-shipping businesses in India
[Query 1][Weight : 0.9] Taxation rules for online drop-shipping in India
[Query 2][Weight : 0.8] How are eCommerce and drop-shipping profits taxed in India

[Retrieved Passages]
<Passage 0> There are no clear guidelines. If you are selling as individual, then what ever profit you make gets added to your overall income as you pay tax accordingly. This is true for sole proprietor or partnership kind of firms. If you are registered as a Company, the profits are taxed as business income. There may be VAT and other taxes. Please consult a CA who can guide you in specifics as for eCommerce, there is no defined law and one has to interpret various other tax laws.
<Passage 1> Tax Deducted at source is applicable to Employee / Employer [contract employee] relations ... it was also made applicable for cases where an Indian company pays 

In [103]:
# Single rewritten query: retrieve and print the top-5 passages for a manual
# reformulation of one FiQA question.
idx = 70
query_id = dataset[idx]['_id']  # named `query_id` so the builtin `id` is not shadowed
text = dataset[idx]['text']

print('[Original Question]',text)
text = 'Income tax implications for running a drop-shipping business in India'
print('[Query]',text)
print()
# Inference only: no autograd graph is needed for the query encoder.
with torch.no_grad():
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to('cuda')
    outputs = model(**inputs)
    embeddings = mean_pooling(outputs.last_hidden_state, inputs['attention_mask'])
rs = search(embeddings.detach().to('cpu'),query_id,5)
# search() returns (corpus_id, is_relevant, text, score): the passage text is
# element 2. Element 1 is the boolean relevance flag, and concatenating it to a
# str raises TypeError.
for i, (_, _, passage, _) in enumerate(rs):
    print(f'<Passage {i}> '+passage)


[Original Question] Income tax on my online drop-shipping business (India)
[Query] Income tax implications for running a drop-shipping business in India

<Passage 0> This is a complicated question that relies on the US-India Tax Treaty to determine whether the income is taxable to the US or to India.  The relevant provision is likely Article 15 on Personal Services. http://www.irs.gov/pub/irs-trty/india.pdf It seems plausible that your business is personal services, but that's a fact-driven question based on your business model.  If the online training is 'personal services' provided by you from India, then it is likely foreign source income under the treaty.  The 'fixed base' and '90 days' provisions in Article 15 would not apply to an India resident working solely outside the US. The question is whether your US LLC was a US taxpayer.  If the LLC was a taxpayer, then it has an obligation to pay US tax on any worldwide income and it also arguably disqualifies you from Article 15 (which

In [23]:
# Show the `ntotal` attribute of the loaded index (its stored-vector count).
print(index.ntotal)

57638


In [44]:
# Number of rows in the loaded corpus. search() looks up dataset_corpus[i] for
# each row i returned by the index, so this should equal index.ntotal.
print(len(dataset_corpus))

2681468


In [11]:
# Remove the module-level `index` name. search() reads that global, so it will
# raise NameError until the index is loaded again.
del index