# Environment Setting

In [None]:
! pip install sacrebleu sentencepiece
! pip install datasets transformers

# Zero/Few-Shot Prompting for Translation

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "ai4bharat/Airavata"

model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [2]:
model.cuda()

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(48065, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((4096,), eps=1e-05)
    (rotary_e

In [3]:
import datasets
train_data = datasets.load_dataset('ai4bharat/indic-instruct-data-v0.1', 'nmt-seed')
valid_en_data = datasets.load_dataset('facebook/flores', 'eng_Latn', trust_remote_code=True)
valid_hi_data = datasets.load_dataset('facebook/flores', 'hin_Deva', trust_remote_code=True)

In [4]:
from tqdm import tqdm

def gen_prompt(texts, shots=[]):
    shots = ''.join([
        f"English: {src}\nHindi: {trg}\n\n"
        for src, trg in shots
    ])
    prompts = []
    for text in texts:
        prompt = f"Translate the following sentence(s) from English into Hindi.\n\n"
        prompt += shots
        prompt += f"English: {text}"
        prompt += f"\nHindi:"
        prompts.append(prompt)
    return prompts

def generate(model, tokenizer, prompts, batch_size=10):
    prompts = gen_prompt(valid_en_data['devtest']['sentence'])
    outputs = []

    for i in tqdm(range(0, len(prompts), batch_size)):
        batch_prompts = prompts[i:min(i+batch_size, len(prompts))]
        input = tokenizer(
            batch_prompts,
            padding="longest",
            return_tensors="pt",
            add_special_tokens=True
        )
        input.to(model.device)

        batch_outputs = model.generate(**input)
        batch_outputs = tokenizer.batch_decode(batch_outputs, skip_special_tokens=True)
        outputs += [
            output[len(prompt)+1:]
            for prompt, output in zip(batch_prompts, batch_outputs)
        ]
    return outputs

In [5]:
zero_shots_prompts = gen_prompt(
    valid_en_data['devtest']['sentence'],
)

print(zero_shots_prompts[0])

zero_shots_outputs = generate(model, tokenizer, zero_shots_prompts)

Translate the following sentence(s) from English into Hindi.

English: "We now have 4-month-old mice that are non-diabetic that used to be diabetic," he added.
Hindi:


  0%|          | 0/102 [00:00<?, ?it/s]

100%|██████████| 102/102 [05:10<00:00,  3.05s/it]


In [8]:
five_shots_prompts = gen_prompt(
    valid_en_data['devtest']['sentence'],
    shots=list(zip(train_data['hi']['input_text'], train_data['hi']['output_text']))[:5]
)

print(five_shots_prompts[0])

five_shots_outputs = generate(model, tokenizer, five_shots_prompts)

Translate the following sentence(s) from English into Hindi.

English: The winner is announced at an event in Sydney in March.
Hindi: मार्च में सिडनी में एक कार्यक्रम में विजेता की घोषणा की जाती है।

English: It is in charge of music licenses and royalties.
Hindi: इस पर म्यूजिक लाइसेंस और रॉयल्टी की जिम्मेदारी है।

English: After his brother dies, Jake replaces him to control his avatar.
Hindi: उसके भाई के मरने के बाद, जेक उसके अवतार को नियंत्रित करने के लिए उसकी जगह लेता है।

English: They go to a nearby base to change into avatar mode.
Hindi: अवतार रूप में बदलने के लिए वे पास के संचालन केंद्र पर जाते हैं।

English: Avataro Sentai Donbrothers  Avataro Sentai Donbrothers is a Japanese television series.
Hindi: अवतार सेंटाई डॉनब्रदर्स: अवतार सेंटाई डॉनब्रदर्स एक जापानी टेलीविजन सीरीज है।

English: "We now have 4-month-old mice that are non-diabetic that used to be diabetic," he added.
Hindi:


100%|██████████| 102/102 [04:52<00:00,  2.87s/it]


In [10]:
import sacrebleu

def evaluate(outputs, refs):
    print(sacrebleu.corpus_bleu(outputs, refs))
    print(sacrebleu.corpus_chrf(outputs, refs))
    print(sacrebleu.corpus_ter(outputs, refs))

evaluate(zero_shots_outputs, [valid_hi_data['devtest']['sentence']])
evaluate(five_shots_outputs, [valid_hi_data['devtest']['sentence']])

BLEU = 33.23 62.5/40.2/27.6/19.2 (BP = 0.978 ratio = 0.978 hyp_len = 27144 ref_len = 27743)
chrF2 = 58.44
TER = 52.58
BLEU = 33.24 62.5/40.2/27.6/19.3 (BP = 0.978 ratio = 0.979 hyp_len = 27150 ref_len = 27743)
chrF2 = 58.44
TER = 52.60
