In [67]:
!pip install tiktoken oaib  google-generativeai

Collecting google-generativeai
  Downloading google_generativeai-0.5.4-py3-none-any.whl.metadata (3.9 kB)
Collecting google-ai-generativelanguage==0.6.4 (from google-generativeai)
  Downloading google_ai_generativelanguage-0.6.4-py3-none-any.whl.metadata (5.6 kB)
Collecting google-api-core (from google-generativeai)
  Downloading google_api_core-2.19.0-py3-none-any.whl.metadata (2.7 kB)
Collecting google-api-python-client (from google-generativeai)
  Downloading google_api_python_client-2.131.0-py2.py3-none-any.whl.metadata (6.7 kB)
Collecting google-auth>=2.15.0 (from google-generativeai)
  Downloading google_auth-2.29.0-py2.py3-none-any.whl.metadata (4.7 kB)
Collecting proto-plus<2.0.0dev,>=1.22.3 (from google-ai-generativelanguage==0.6.4->google-generativeai)
  Downloading proto_plus-1.23.0-py3-none-any.whl.metadata (2.2 kB)
Collecting pyasn1-modules>=0.2.1 (from google-auth>=2.15.0->google-generativeai)
  Downloading pyasn1_modules-0.4.0-py3-none-any.whl.metadata (3.4 kB)
Collectin

In [59]:
import os
from openai import OpenAI

from getpass import getpass

# Never store the key in the notebook: take it from the environment and only
# prompt (without echoing) when it has not been exported yet.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])


In [26]:
from openicl import DatasetReader, PromptTemplate, GenInferencer
from datasets import Dataset
import pandas as pd

In [157]:
# Display names for the language codes accepted by the prompt builder.
LANG_NAMES = dict(en="English", ban="Balinese")

# Translation prompt; filled with str.format using the keys
# `src_lang`, `tgt_lang` and `input_text`.
PROMPT_TEMPLATE = (
    "Translate this from {src_lang} to {tgt_lang}:\n"
    "{src_lang}: {input_text}\n"
    "{tgt_lang}:"
)

In [152]:
def gen_df(src_lang, tgt_lang, data_dir="flores-eval"):
    """Load a parallel test set and build one translation prompt per sentence.

    Reads ``{data_dir}/{src_lang}{tgt_lang}/test.{src_lang}-{tgt_lang}.{src_lang}``
    and the matching ``.{tgt_lang}`` file, pairing them line by line.

    Args:
        src_lang: Source language code; must be a key of ``LANG_NAMES``.
        tgt_lang: Target language code; must be a key of ``LANG_NAMES``.
        data_dir: Root directory of the evaluation files.

    Returns:
        A DataFrame with the columns ``src``, ``tgt``, ``src_lang``,
        ``tgt_lang`` and ``prompt`` (``PROMPT_TEMPLATE`` filled per row).

    Raises:
        KeyError: If a language code is missing from ``LANG_NAMES``.
        OSError: If either file cannot be opened.
        ValueError: If the two files do not have the same number of lines.
    """
    src_name = LANG_NAMES[src_lang]
    tgt_name = LANG_NAMES[tgt_lang]
    stem = f"{data_dir}/{src_lang}{tgt_lang}/test.{src_lang}-{tgt_lang}"

    # Decode explicitly as UTF-8 so the result does not depend on the platform locale.
    with open(f"{stem}.{src_lang}", encoding="utf-8") as src_file, \
            open(f"{stem}.{tgt_lang}", encoding="utf-8") as tgt_file:
        src_lines = [line.rstrip("\n") for line in src_file]
        tgt_lines = [line.rstrip("\n") for line in tgt_file]

    # A misaligned corpus would silently pair sentences with the wrong reference.
    if len(src_lines) != len(tgt_lines):
        raise ValueError(
            f"Source and target files differ in length: "
            f"{len(src_lines)} vs {len(tgt_lines)} lines"
        )

    pairs = pd.DataFrame({"src": src_lines, "tgt": tgt_lines})
    pairs["src_lang"] = src_name
    pairs["tgt_lang"] = tgt_name
    # Built from the list directly: cheaper than a row-wise apply and safe for empty input.
    pairs["prompt"] = [
        PROMPT_TEMPLATE.format(src_lang=src_name, tgt_lang=tgt_name, input_text=sentence)
        for sentence in src_lines
    ]

    return pairs

In [158]:
df = gen_df("en", "ban")

In [15]:
# Wrap the prompt DataFrame for openicl: `src`, `src_lang`, `tgt_lang` are inputs, `tgt` is the output.
hg_dataset = Dataset.from_pandas(df)
dataset = DatasetReader(hg_dataset, input_columns=['src', "src_lang", "tgt_lang"], output_column="tgt")
# Every placeholder in the template must be spelled exactly like the token declared in the
# column map. The trailing one used to be `<tgt_lang>` (missing slash), which does not match
# the declared `</tgt_lang>` token.
# NOTE(review): unlike PROMPT_TEMPLATE there is no space after "</src_lang>:" — confirm whether that is intended.
template = PromptTemplate(
    '</E>Translate this from </src_lang> to </tgt_lang>:\n</src_lang>:</src>\n</tgt_lang>:',
    {'src': '</src>', 'src_lang': '</src_lang>', 'tgt_lang': '</tgt_lang>'},
    ice_token='</E>',
)

In [51]:
# GPT 3 & 4 batch generation

import json

def create_openai_batch(filename, model_name, data=None,
                        system_prompt="You are a helpful translator.",
                        max_tokens=256):
    """Write one chat-completion batch request per row to a JSONL file.

    Args:
        filename: Path of the JSONL file to create (overwritten if present).
        model_name: Value written to the ``model`` field of every request.
        data: DataFrame with a ``prompt`` column. Defaults to the notebook's
            global ``df`` so existing two-argument calls keep working.
        system_prompt: Content of the system message of every request.
        max_tokens: ``max_tokens`` value of every request.

    Each request's ``custom_id`` is ``batch-clean-<row index>``.
    """
    if data is None:
        data = df  # backward-compatible fallback to the notebook-level frame

    with open(filename, "w", encoding="utf-8") as f:
        for i, row in data.iterrows():
            request = {
                "custom_id": f"batch-clean-{i}",
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    "model": model_name,
                    "messages": [
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": row["prompt"]},
                    ],
                    "max_tokens": max_tokens,
                },
            }
            f.write(json.dumps(request) + "\n")

In [53]:
def run_batch(filename, openai_client=None):
    """Upload a JSONL request file and start a chat-completions batch for it.

    Args:
        filename: Path of the JSONL request file to upload.
        openai_client: Client exposing ``files.create`` and ``batches.create``.
            Defaults to the notebook's global ``client``.

    Returns:
        Whatever ``batches.create`` returns, so the caller can keep the batch
        for later status checks instead of losing it.
    """
    if openai_client is None:
        openai_client = client  # backward-compatible fallback to the notebook-level client

    # Close the upload handle deterministically instead of leaking it.
    with open(filename, "rb") as batch_file:
        batch_input_file = openai_client.files.create(
            file=batch_file,
            purpose="batch"
        )

    return openai_client.batches.create(
        input_file_id=batch_input_file.id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata={
            "description": "batch clean job"
        }
    )

In [159]:
# Write one batch request file per model being evaluated.
# Disabled: create_openai_batch("openai/gpt4-flores.jsonl", "gpt-4-turbo")
for batch_path, batch_model in (
    ("openai/enban/gpt3.5-flores.jsonl", "gpt-3.5-turbo-0125"),
    ("openai/enban/gpt4o-flores.jsonl", "gpt-4o"),
):
    create_openai_batch(batch_path, batch_model)

In [160]:
# Submit each request file as its own batch job.
# Disabled: run_batch("openai/enban/gpt4-flores.jsonl")
for batch_path in (
    "openai/enban/gpt3.5-flores.jsonl",
    "openai/enban/gpt4o-flores.jsonl",
):
    run_batch(batch_path)

In [78]:
import google.generativeai as genai
import asyncio

import os
from getpass import getpass

# Never store the key in the notebook: take it from the environment and only
# prompt (without echoing) when it has not been exported.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY") or getpass("Google API key: ")
genai.configure(api_key=GOOGLE_API_KEY)

model = genai.GenerativeModel(model_name="gemini-1.5-flash-latest")

In [81]:
async def generate_gemini(text: str) -> str:
    """Send `text` to the notebook-level `model` and return its reply as one line.

    Newline characters in the reply are deleted (not replaced by spaces) and
    surrounding whitespace is stripped.
    """
    reply = await model.generate_content_async(text)
    single_line = "".join(reply.text.split("\n"))
    return single_line.strip()

In [84]:
jobs = asyncio.gather(*[generate_gemini(prompt) for prompt in df.prompt])
results = await jobs

ResourceExhausted: 429 Resource has been exhausted (e.g. check quota).

In [75]:
%%time

response = model.generate_content(df.iloc[0].prompt)

CPU times: user 9.01 ms, sys: 89 µs, total: 9.1 ms
Wall time: 1.24 s


In [77]:
# Show the reply on one line: newlines deleted, outer whitespace stripped.
"".join(response.text.split("\n")).strip()

'On Monday, scientists from Stanford University of Medicine announced a new diagnostic discovery tool that uses cell sorting: a small, easily printable chip that is made using a standard inkjet printer and costs less than a U.S. cent.'

In [169]:
# Fetch the contents of the two result files by their file IDs.
gpt3_file_id, gpt4o_file_id = "file-BeH0uYrloVtPLA93Rt3yW8ON", "file-Ec3aCe92f01U8m9poOT6RlB0"
gpt3_content = client.files.content(gpt3_file_id)
gpt4o_content = client.files.content(gpt4o_file_id)

In [122]:
from sacrebleu.metrics import BLEU

bleu = BLEU()
# corpus_score expects a list of hypotheses and a list of reference streams (each itself a
# list of strings), so the single hypothesis/reference pair has to be wrapped; passing bare
# strings is what made the original call fail.
# `text` and `i` must already be set by the cell that parses the batch responses.
bleu.corpus_score([text], [[df.loc[i].tgt]])

In [170]:
# Copy every batch reply into df["response"], matching rows through the index
# encoded in custom_id ("batch-clean-<row index>").
failed_ids = []
for line in gpt3_content.text.split("\n"):
    # Skip blank lines instead of stopping: a stray empty line must not drop later records.
    if not line.strip():
        continue
    obj = json.loads(line)
    i = int(obj["custom_id"].split("batch-clean-")[1])
    # A failed request has no usable response body; record it rather than crash mid-loop.
    choices = ((obj.get("response") or {}).get("body") or {}).get("choices")
    if not choices:
        failed_ids.append(obj["custom_id"])
        continue
    # NOTE(review): newlines are deleted, not replaced by a space, so multi-line replies get fused.
    text = choices[0]["message"]["content"].replace("\n", "").strip()
    df.at[i, "response"] = text

if failed_ids:
    print(f"{len(failed_ids)} request(s) returned no completion: {failed_ids}")

In [172]:
# Every row needs a hypothesis; otherwise line N of the file no longer lines up
# with reference N when the file is scored.
if df.response.isna().any():
    raise ValueError(f"{int(df.response.isna().sum())} row(s) have no response; cannot write hypotheses")

# Explicit UTF-8 keeps the output independent of the platform locale. Terminating
# every line (including the last) keeps an empty final hypothesis as its own line.
with open('openai/enban/gpt3-response.ban', 'w', encoding='utf-8') as f:
    f.writelines(f"{hypothesis}\n" for hypothesis in df.response)

In [173]:
# Score the GPT-3.5 hypotheses (read from stdin) against the reference file with the
# sacrebleu CLI and save the report to gpt3-score.bleu.
!sacrebleu -tok 13a -w 2 flores-eval/enban/test.en-ban.ban < openai/enban/gpt3-response.ban > openai/enban/gpt3-score.bleu
# !sacrebleu -tok 13a -w 2 flores-eval/enban/test.en-ban.ban < openai/enban/gpt4-response.ban > openai/enban/gpt4-score.bleu

In [174]:
# Print the saved BLEU report for the GPT-3.5 run.
!cat openai/enban/gpt3-score.bleu

{
 "name": "BLEU",
 "score": 7.2,
 "signature": "nrefs:1|case:mixed|eff:no|tok:13a|smooth:exp|version:2.4.2",
 "verbose_score": "30.9/10.1/4.3/2.0 (BP = 1.000 ratio = 1.029 hyp_len = 46507 ref_len = 45197)",
 "nrefs": "1",
 "case": "mixed",
 "eff": "no",
 "tok": "13a",
 "smooth": "exp",
 "version": "2.4.2"
}
