<a href="https://colab.research.google.com/github/webbigdata-jp/python_sample/blob/main/ALMA_7B_Ja_V2_GPTQ_Ja_En_batch_translation_sample.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# [ALMA-7B-Ja-V2-GPTQ-Ja-En](https://huggingface.co/webbigdata/ALMA-7B-Ja-V2-GPTQ-Ja-En) batch translation sample.

アップロードされたファイルを英語から日本語、または日本語から英語に一括翻訳し、ファイルとして出力します。  
Translates an uploaded file from English to Japanese, or from Japanese to English, in bulk and writes the result to a file.  


以下は既知の問題です。  
Below are the known issues:  

- 長い文章を入力するとエラーになります。
- 意味のない文章や日本語でない文章を入力すると、出力がおかしくなることがあります。

- If you input a long sentence, an error will occur. (This is a limitation of free Colab.)
- If you provide meaningless sentences or sentences that are not Japanese, the output may become strange.

## (1)Install required libraries

In [None]:
%%capture
%%shell
#@title Install auto-GPTQ
# Installs the auto-gptq package pinned to version 0.4.2 (the model-loading
# cell imports `auto_gptq`). The %%capture magic on the first line hides
# pip's output; remove it temporarily when debugging a failed install.
pip install auto-gptq==0.4.2

## (2)Setting Up

In [None]:
#@title Upload Text File(.txt only)
import os
from google.colab import files
import shutil

# Ask the user to upload file(s) through Colab's upload helper. The result is
# kept in `uploaded`; the translation cell iterates over its keys and opens
# each key as a file path, so the keys are treated as file names on disk.
uploaded = files.upload()

In [None]:
#@title Translation Setting
# Translation direction chosen through the Colab form field declared by the
# trailing `#@param` comment. Later cells compare this string verbatim against
# 'Japanese to English', so the option values must not be reworded.
Translation_direction = 'Japanese to English' #@param ["Japanese to English", "English to Japanese"]

In [None]:
%%capture
#@title Download Model (may take a few minutes)
import torch
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM
import re


# Compiled once so repeated calls reuse the same pattern object.
# Code point ranges covered:
#   U+3040-U+309F  hiragana
#   U+30A0-U+30FF  katakana (full-width)
#   U+3400-U+4DBF  CJK Unified Ideographs Extension A (rare kanji)
#   U+4E00-U+9FFF  CJK Unified Ideographs (kanji, the complete block)
#   U+FF66-U+FF9F  half-width katakana (frequent in Shift-JIS text files)
_JAPANESE_CHAR_PATTERN = re.compile(
    "[\u3040-\u309F\u30A0-\u30FF\u3400-\u4DBF\u4E00-\u9FFF\uFF66-\uFF9F]"
)


def contains_japanese(text):
    """Return True if *text* contains at least one Japanese character.

    A character counts as Japanese when it is hiragana, katakana (full- or
    half-width) or a kanji from the CJK Unified Ideographs blocks listed
    above. Kanji code points are shared with Chinese, so Chinese text is
    reported as Japanese as well.

    Args:
        text: The string to inspect.

    Returns:
        True when any matching character is found, False otherwise
        (including for the empty string).
    """
    return _JAPANESE_CHAR_PATTERN.search(text) is not None


def translate_text(prompt, model, tokenizer, Translation_direction):
    """Generate a translation for *prompt* and return only the generated text.

    The previous implementation decoded prompt + answer together and cut the
    result at the last "English:" / "Japanese:" marker, which truncated any
    translation that itself contained that marker. The answer is now isolated
    by dropping the prompt tokens from the generated sequence instead.

    Args:
        prompt: Complete prompt text; callers in this notebook end it with
            the target-language marker ("English:" or "Japanese:").
        model: Object exposing ``generate`` (here the quantized causal LM).
        tokenizer: Tokenizer matching *model*; must be callable on a string
            and provide ``decode``.
        Translation_direction: Retained so existing callers keep working;
            the result no longer depends on it because the answer is located
            by token position rather than by a text marker.

    Returns:
        The newly generated text with surrounding whitespace stripped.
    """
    # The prompt is truncated to at most 512 tokens and moved to the GPU.
    input_ids = tokenizer(prompt, return_tensors="pt", padding=True, max_length=512, truncation=True).input_ids.cuda()
    with torch.no_grad():
        generated_ids = model.generate(input_ids=input_ids, num_beams=3, max_new_tokens=512, do_sample=True, temperature=0.4, top_p=0.9)

    # For a decoder-only model, generate() returns the prompt tokens followed
    # by the new tokens, so everything past the prompt length is the answer.
    prompt_length = input_ids.shape[1]
    new_token_ids = generated_ids[0][prompt_length:]
    return tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()

# Model repository id; the notebook title links to
# https://huggingface.co/webbigdata/ALMA-7B-Ja-V2-GPTQ-Ja-En for the same name.
quantized_model_dir = "webbigdata/ALMA-7B-Ja-V2-GPTQ-Ja-En"
# Base name of the weight file handed to from_quantized.
# NOTE(review): the name suggests 4-bit weights with group size 128 — confirm.
model_basename = "gptq_model-4bit-128g"

# Tokenizer and model are both loaded from the same repository id.
tokenizer = AutoTokenizer.from_pretrained(quantized_model_dir)
# Load the quantized weights from a safetensors file onto the first CUDA
# device; this cell therefore needs a GPU runtime.
model = AutoGPTQForCausalLM.from_quantized(
        quantized_model_dir,
        model_basename=model_basename,
        use_safetensors=True,
        device="cuda:0")


## (3)Do translation

In [None]:


#@title Translate line by line and write to file
import os
import re

import chardet  # Required for character encoding detection

# Sentence terminators used to cut the text into translation units.
_JAPANESE_TERMINATOR = re.compile("。")
# Split on "." only when it ends a sentence (followed by whitespace or the end
# of the text) so decimals ("3.14"), URLs and file names stay in one piece.
_ENGLISH_TERMINATOR = re.compile(r"\.(?=\s|$)")


def _decode_uploaded_bytes(raw_bytes):
    """Decode file bytes with the encoding chardet detects (Shift-JIS if none)."""
    detected_encoding = chardet.detect(raw_bytes)["encoding"] or "sjis"
    return raw_bytes.decode(detected_encoding)


def _split_into_sentences(text, terminator):
    """Split *text* at *terminator*, then at newlines (empty lines are kept)."""
    chunks = [chunk for chunk in terminator.split(text) if chunk]
    return [piece for chunk in chunks for piece in chunk.split("\n")]


def _build_output_filename(source_name, direction):
    """Return '<root>_Ja_to_En.txt' or '<root>_En_to_Ja.txt' for *source_name*.

    The extension is replaced via os.path.splitext, so the result always
    differs from the input name (no accidental overwrite of the source file
    when it does not end in a literal ".txt").
    """
    root, _ = os.path.splitext(source_name)
    suffix = "_Ja_to_En" if direction == 'Japanese to English' else "_En_to_Ja"
    return f"{root}{suffix}.txt"


for filename in uploaded:
    with open(filename, "rb") as file:  # Read file in binary mode
        content = _decode_uploaded_bytes(file.read())

    japanese_to_english = Translation_direction == 'Japanese to English'
    # Evaluated once per file; the answer does not change between sentences.
    source_has_japanese = contains_japanese(content)

    if not japanese_to_english:
        sentences = _split_into_sentences(content, _ENGLISH_TERMINATOR)
    elif source_has_japanese:
        sentences = _split_into_sentences(content, _JAPANESE_TERMINATOR)
    else:
        # Nothing to translate: show the text and pass it through unchanged.
        print(f"No Japanese text detected in {filename}; copying it untranslated:")
        print(content)
        sentences = [content]

    translated_sentences = []
    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            # Keep empty entries so blank lines survive in the output.
            translated_sentences.append("")
        elif not japanese_to_english:
            en_prompt = f"Translate this from English to Japanese:\nEnglish: {sentence}.\nJapanese:"
            translated_sentences.append(translate_text(en_prompt, model, tokenizer, Translation_direction))
        elif source_has_japanese:
            ja_prompt = f"Translate this from Japanese to English:\nJapanese: {sentence}。\nEnglish:"
            translated_sentences.append(translate_text(ja_prompt, model, tokenizer, Translation_direction))
        else:
            translated_sentences.append(sentence)

    output_filename = _build_output_filename(filename, Translation_direction)
    with open(output_filename, 'w', encoding='utf-8') as f:
        f.write('\n'.join(translated_sentences))

    print(f"Translation compleated. please download files.: {output_filename}")
    files.download(output_filename)
