In [29]:
import os

import pandas as pd
from transformers import MarianMTModel, MarianTokenizer

# Directory of the locally cloned Helsinki-NLP checkpoint. Set the
# MARIAN_MODEL_DIR environment variable to point at a different clone; the
# fallback keeps the original hard-coded location working unchanged.
model_name = os.environ.get("MARIAN_MODEL_DIR", "/Users/tlxy/Research/model/Helsinki-NLP/")

# Tokenizer and model are both loaded from the directory chosen above.
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)


Model used: [`Helsinki-NLP/opus-mt-tc-bible-big-mul-mul`](https://huggingface.co/Helsinki-NLP/opus-mt-tc-bible-big-mul-mul)

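1. Clone the model repository with `GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Helsinki-NLP/opus-mt-tc-bible-big-mul-mul /Users/tlxy/Research/model/Helsinki-NLP`. Because `GIT_LFS_SKIP_SMUDGE=1` skips the large LFS files, fetch the model weights separately (e.g. run `git lfs pull` inside the clone) before loading.
2. Load the model from that local directory with `from_pretrained`.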

This post may help: https://mp.weixin.qq.com/s/Egm4fmpENY2FTDCzJQqy3w?token=21718324&lang=zh_CN

In [30]:
def translate_column(df, column_name, target_column_name, batch_size=16, target_lang="eng"):
    """
    Translate a text column of a DataFrame and return the result as a new column.

    Each sentence is prefixed with the ``>>{target_lang}<<`` target-language
    token, tokenized with the module-level ``tokenizer`` and translated with
    the module-level ``model``. Sentences are sent to the model in chunks of
    ``batch_size`` so that a long column is not padded and generated as one
    single batch.

    Args:
        df (pd.DataFrame): The input DataFrame. It is not modified.
        column_name (str): The name of the column to translate.
        target_column_name (str): The name of the new column for translations.
        batch_size (int): Maximum number of sentences passed to one
            ``model.generate`` call. Defaults to 16.
        target_lang (str): Language code placed inside the ``>>...<<`` prefix.
            Defaults to ``"eng"`` (English).

    Returns:
        pd.DataFrame: A copy of ``df`` with ``target_column_name`` added.
        Rows whose source value is missing (``None``/``NaN``) receive ``None``
        instead of a translation.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    sentences = df[column_name]
    # Positional mask of rows without text; formatting those into the prompt
    # would send the literal string "nan"/"None" to the model.
    missing = sentences.isna().tolist()

    # Add language prefix for each sentence that actually has text
    prefixed_texts = [
        f">>{target_lang}<< {sentence}"
        for sentence, is_missing in zip(sentences, missing)
        if not is_missing
    ]

    # Tokenize, generate and decode one chunk at a time
    translations = []
    for start in range(0, len(prefixed_texts), batch_size):
        batch = prefixed_texts[start:start + batch_size]
        encoded_input = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
        translated_tokens = model.generate(**encoded_input)
        translations.extend(tokenizer.batch_decode(translated_tokens, skip_special_tokens=True))

    # Re-align translations with the original row order, leaving gaps as None
    remaining = iter(translations)
    column = [None if is_missing else next(remaining) for is_missing in missing]

    # Work on a copy so the caller's frame (possibly already displayed in an
    # earlier cell) is left untouched.
    result = df.copy()
    result[target_column_name] = column
    return result


In [38]:
# One sample sentence per language, kept as (language, sentence) pairs so the
# two columns cannot drift out of alignment.
language_samples = [
    ('German', 'Jedes Mädchen, das ich sehe, gefällt mir.'),
    ('Russian', 'Каждая девушка, которую я вижу, мне нравится.'),
    ('Japanese', '見る女の子はみんな好きです。'),
    ('Spanish', 'Cada chica que veo me gusta.'),
    ('French', 'Chaque fille que je vois me plaît.'),
    ('Italian', 'Ogni ragazza che vedo mi piace.'),
    ('Chinese', '我看到的每个女孩我都喜欢。'),
    ('Korean', '내가 보는 모든 소녀가 마음에 든다.'),
    ('Portuguese', 'Cada menina que vejo, eu gosto.'),
    ('Arabic', 'كل فتاة أراها تعجبني.'),
    ('Hindi', 'मैं जो भी लड़की देखता हूं, मुझे पसंद है।'),
    ('Dutch', 'Elk meisje dat ik zie, vind ik leuk.'),
    ('Swedish', 'Varje flicka jag ser gillar jag.'),
    ('Turkish', 'Gördüğüm her kızı beğeniyorum.'),
    ('Greek', 'Κάθε κορίτσι που βλέπω μου αρέσει.'),
    ('Polish', 'Każda dziewczyna, którą widzę, podoba mi się.'),
    ('Czech', 'Každá dívka, kterou vidím, se mi líbí.'),
    ('Finnish', 'Jokainen tyttö, jonka näen, miellyttää minua.'),
    ('Danish', 'Hver pige, jeg ser, kan jeg lide.'),
    ('Norwegian', 'Hver jente jeg ser, liker jeg.'),
]

# Split the pairs into the column-oriented mapping the frame is built from
data = {
    'lg': [language for language, _ in language_samples],
    'title': [sentence for _, sentence in language_samples],
}

# Build the 20-row sample frame; the bare name renders it as the cell output
sample_df = pd.DataFrame(data)
sample_df

Unnamed: 0,lg,title
0,German,"Jedes Mädchen, das ich sehe, gefällt mir."
1,Russian,"Каждая девушка, которую я вижу, мне нравится."
2,Japanese,見る女の子はみんな好きです。
3,Spanish,Cada chica que veo me gusta.
4,French,Chaque fille que je vois me plaît.
5,Italian,Ogni ragazza che vedo mi piace.
6,Chinese,我看到的每个女孩我都喜欢。
7,Korean,내가 보는 모든 소녀가 마음에 든다.
8,Portuguese,"Cada menina que vejo, eu gosto."
9,Arabic,كل فتاة أراها تعجبني.


In [39]:
# Run the translation on the 'title' column; the results go into 'en_title'
df_translated = translate_column(
    df=sample_df,
    column_name='title',
    target_column_name='en_title',
)

# Last expression of the cell, so the translated frame is rendered
df_translated

Unnamed: 0,lg,title,en_title
0,German,"Jedes Mädchen, das ich sehe, gefällt mir.",I like every girl I see.
1,Russian,"Каждая девушка, которую я вижу, мне нравится.",I like every girl I see.
2,Japanese,見る女の子はみんな好きです。,They like all the girls to see.
3,Spanish,Cada chica que veo me gusta.,Every girl I see likes.
4,French,Chaque fille que je vois me plaît.,Every girl I see likes.
5,Italian,Ogni ragazza che vedo mi piace.,Every girl I see likes.
6,Chinese,我看到的每个女孩我都喜欢。,I'm happy with the girls I saw.
7,Korean,내가 보는 모든 소녀가 마음에 든다.,"And I saw the star, and it rejoiced with great..."
8,Portuguese,"Cada menina que vejo, eu gosto.","Every girl I see, I like."
9,Arabic,كل فتاة أراها تعجبني.,Every girl I see likes me.


In [36]:
import pandas as pd

# Twenty distinct German sentences used as translation input
german_titles = [
    'Jedes Mädchen, das ich sehe, gefällt mir.',
    'Wie geht es dir?',
    'Das ist ein schönes Buch.',
    'Ich liebe den Sommer.',
    'Der Himmel ist heute blau.',
    'Ich habe Hunger.',
    'Das Auto ist sehr schnell.',
    'Ich mag Schokolade.',
    'Das Wetter ist wunderbar.',
    'Ich gehe gerne spazieren.',
    'Die Blumen sind schön.',
    'Ich trinke gerne Kaffee.',
    'Das Haus ist groß.',
    'Ich höre gerne Musik.',
    'Der Hund ist freundlich.',
    'Ich lese ein interessantes Buch.',
    'Die Kinder spielen im Park.',
    'Ich koche gerne Pasta.',
    'Der Zug ist pünktlich.',
    'Ich lerne Deutsch.',
]

# Every row carries the same language label, one per sentence
data = {
    'lg': ['German' for _ in german_titles],
    'title': german_titles,
}

# Assemble the sample frame from the two columns
sample_df = pd.DataFrame(data)

# Run the translation on the 'title' column; the results go into 'en_title'
df_translated = translate_column(
    df=sample_df,
    column_name='title',
    target_column_name='en_title',
)

# Last expression of the cell, so the translated frame is rendered
df_translated

Unnamed: 0,lg,title,en_title
0,German,"Jedes Mädchen, das ich sehe, gefällt mir.",I like every girl I see.
1,German,Wie geht es dir?,How are you doing?
2,German,Das ist ein schönes Buch.,This is a beautiful book.
3,German,Ich liebe den Sommer.,I love the summer.
4,German,Der Himmel ist heute blau.,The sky is blue today.
5,German,Ich habe Hunger.,I'm hungry.
6,German,Das Auto ist sehr schnell.,The car is very fast.
7,German,Ich mag Schokolade.,I like chocolate.
8,German,Das Wetter ist wunderbar.,The weather is wonderful.
9,German,Ich gehe gerne spazieren.,I like to walk.
