In [1]:
import fasttext
from huggingface_hub import hf_hub_download
# Fetch the fastText language-identification weights ("model.bin") from the
# Hugging Face Hub repo and load them with fastText.
model_path = hf_hub_download(repo_id="facebook/fasttext-language-identification", filename="model.bin")
language_detector = fasttext.load_model(model_path)
# Smoke test: predict() returns a (labels, probabilities) pair, displayed below.
language_detector.predict("Hello, world!")

model.bin:   0%|          | 0.00/1.18G [00:00<?, ?B/s]



(('__label__eng_Latn',), array([0.61224753]))

In [16]:
# Sanity-check the detector on one German and one Italian sentence; the cell
# displays a tuple holding both (labels, probabilities) results.
language_detector.predict("ich suchen ein restaurant mit moderaten preisen ."), language_detector.predict("ciao , sto cercare un ristorante con prezzo onesto .")

((('__label__deu_Latn',), array([0.99636233])),
 (('__label__ita_Latn',), array([0.99764943])))

In [2]:
from datasets import load_dataset
# Load the ShareGPT-GPT4 conversation dataset from the Hugging Face Hub.
sharegpt_gpt4_dataset = load_dataset('shibing624/sharegpt_gpt4')

In [12]:
# Detect the language of the first conversation (stringified list of turns).
# Index the row first and the column second: column-first indexing
# (['conversations'][0]) reads the entire column just to get one record.
lang, prob = language_detector.predict(str(sharegpt_gpt4_dataset['train'][0]['conversations']))

In [13]:
# Display the predicted label tuple and its probability array.
lang, prob

(('__label__eng_Latn',), array([0.99958223]))

In [38]:
from tqdm.notebook import tqdm
def extract_conversation_list_given_language(data_list=None, lang_list=None, threshold=0.99):
    """Group conversations by their detected language.

    Each item of ``data_list`` is converted with ``str()`` and passed to
    ``language_detector.predict``; the item is kept when the top predicted
    label is one of ``lang_list`` and its probability is at least ``threshold``.

    Args:
        data_list: Iterable of conversations. When ``None`` (default), the
            ``conversations`` column of ``sharegpt_gpt4_dataset['train']`` is
            used, looked up at call time.
        lang_list: fastText labels to keep. When ``None`` (default), English,
            German and Italian (``__label__{eng,deu,ita}_Latn``) are used.
        threshold: Minimum probability of the top label for an item to be kept.

    Returns:
        dict mapping every label in ``lang_list`` to the list of conversations
        assigned to it (possibly empty).
    """
    # Resolve defaults lazily: a default expression in the signature is
    # evaluated once at definition time, which would pin a stale copy of the
    # dataset and share one mutable list across calls.
    if data_list is None:
        data_list = sharegpt_gpt4_dataset['train']['conversations']
    if lang_list is None:
        lang_list = ['__label__eng_Latn', '__label__deu_Latn', '__label__ita_Latn']

    data = {k: [] for k in lang_list}
    print(len(data_list))
    for c in tqdm(data_list):
        # predict() returns e.g. (('__label__eng_Latn',), array([0.99958223]))
        labels, probs = language_detector.predict(str(c))
        lang, prob = labels[0], probs[0]
        # `data` has exactly the requested labels as keys, so this is the
        # same membership test as `lang in lang_list`, but O(1).
        if lang in data and prob >= threshold:
            data[lang].append(c)
    return data

In [39]:
# Bucket the conversations by detected language using the function's defaults.
data = extract_conversation_list_given_language()

103415


  0%|          | 0/103415 [00:00<?, ?it/s]

In [40]:
# Print, per language label, how many conversations were kept and one example.
for k, v in data.items():
    # A label can end up with no conversations above the threshold; guard the
    # example lookup so an empty bucket does not raise IndexError.
    print(k, len(v), '\nExample', v[0] if v else None)

__label__eng_Latn 46006 
Example [{'from': 'human', 'value': "Summarize the main ideas of Jeff Walker's Product Launch Formula into bullet points as it pertains to a growth marketing agency implementing these strategies and tactics for their clients..."}, {'from': 'gpt', 'value': "Here are the main ideas of Jeff Walker's Product Launch Formula that can be applied by a growth marketing agency for their clients:\n\n1. Identify the target audience and their needs: Understand the ideal customer for the product or service, and create a messaging that resonates with them.\n2. Pre-launch: Build anticipation and excitement for the launch by creating buzz, gathering testimonials and case studies, and using social media to create awareness.\n3. Launch: Use a well-crafted launch sequence to maximize sales and conversions. This can include offering bonuses, creating scarcity, and using a deadline to create urgency.\n4. Post-launch: Follow up with customers, gather feedback, and continue to provide

In [2]:
from datasets import load_dataset
# Load the 'woz_dialogue' dataset once per language configuration.
woz_dialogue_en = load_dataset('woz_dialogue', 'en')
woz_dialogue_de = load_dataset('woz_dialogue', 'de')
woz_dialogue_it = load_dataset('woz_dialogue', 'it')

In [3]:
# Inspect the list of turns of the first English training dialogue.
d = woz_dialogue_en['train'][0]['dialogue']
d

[{'turn_label': [['food', 'eritrean']],
  'asr': [['Are there any eritrean restaurants in town?']],
  'system_transcript': '',
  'turn_idx': 0,
  'belief_state': [{'slots': [['food', 'eritrean']], 'act': 'inform'}],
  'transcript': 'Are there any eritrean restaurants in town?',
  'system_acts': []},
 {'turn_label': [['food', 'chinese']],
  'asr': [['How about Chinese food?']],
  'system_transcript': 'No, there are no eritrean restaurants in town. Would you like a different restaurant? ',
  'turn_idx': 1,
  'belief_state': [{'slots': [['food', 'chinese']], 'act': 'inform'}],
  'transcript': 'How about Chinese food?',
  'system_acts': []},
 {'turn_label': [['area', 'east']],
  'asr': [['I would like the East part of town.']],
  'system_transcript': 'There is a wide variety of Chinese restaurants, do you have an area preference or a price preference to narrow it down?',
  'turn_idx': 2,
  'belief_state': [{'slots': [['food', 'chinese']], 'act': 'inform'},
   {'slots': [['area', 'east']], 

In [4]:
# Task instruction for response generation, keyed by language code.
nlg = dict(
    en="You are a helpful AI assistant tasked with generating a response given the current user query and dialogue history.",
    de="Sie sind ein hilfreicher KI-Assistent, der die Aufgabe hat, anhand der aktuellen Benutzeranfrage und des Dialogverlaufs eine Antwort zu generieren.",
    it="Sei un utile assistente AI incaricato di generare una risposta in base alla query corrente dell'utente e alla cronologia dei dialoghi.",
)

In [7]:
def generate_ft_llm_data(woz_dialogue, task_instruction, mode="train"):
    """Turn WOZ dialogues into instruction-tuning samples with history.

    Within each dialogue, the user ``transcript`` of one turn is paired with
    the ``system_transcript`` of the following turn. Pairs in which either
    side is empty are skipped.

    Args:
        woz_dialogue: Mapping from split name to an iterable of dialogues,
            each a dict whose ``"dialogue"`` entry is the list of turn dicts.
        task_instruction: Text stored in every sample's ``"instruction"``.
        mode: Split name to read from ``woz_dialogue``.

    Returns:
        Flat list of dicts with keys ``instruction``, ``input``, ``output``
        and ``history``. ``history`` holds the ``[input, output]`` pairs of
        the samples emitted earlier for the same dialogue.
    """
    all_data_list = []
    for dialogue in woz_dialogue[mode]:
        history = []
        prev_turn = None
        for turn in dialogue["dialogue"]:
            # The first turn has no preceding user utterance. Looking it up
            # with index -1 would wrap around to the dialogue's last turn and
            # create a bogus pair, so pairing only starts at the second turn.
            if prev_turn is not None:
                user_utterance = prev_turn["transcript"]
                system_reply = turn["system_transcript"]
                if system_reply and user_utterance:
                    all_data_list.append({
                        "instruction": task_instruction,
                        "input": user_utterance,
                        "output": system_reply,
                        # Copy the pairs so samples never share list objects.
                        "history": [list(pair) for pair in history],
                    })
                    history.append([user_utterance, system_reply])
            prev_turn = turn
    return all_data_list

In [8]:
# Preview the first ten samples built from the English data (default split: "train").
generate_ft_llm_data(woz_dialogue_en, nlg['en'])[:10]

[{'instruction': 'You are a helpful AI assistant tasked with generating a response given the current user query and dialogue history.',
  'input': 'Are there any eritrean restaurants in town?',
  'output': 'No, there are no eritrean restaurants in town. Would you like a different restaurant? ',
  'history': []},
 {'instruction': 'You are a helpful AI assistant tasked with generating a response given the current user query and dialogue history.',
  'input': 'How about Chinese food?',
  'output': 'There is a wide variety of Chinese restaurants, do you have an area preference or a price preference to narrow it down?',
  'history': [['Are there any eritrean restaurants in town?',
    'No, there are no eritrean restaurants in town. Would you like a different restaurant? ']]},
 {'instruction': 'You are a helpful AI assistant tasked with generating a response given the current user query and dialogue history.',
  'input': 'I would like the East part of town.',
  'output': 'Yu Garden is a chin

In [9]:
def upload_to_hub(data, dataset_identifier='Jiahuan/nlg_en'):
    """Build a ``DatasetDict`` from ``data`` and push it to the Hugging Face Hub.

    Args:
        data: Mapping with the keys ``'train'``, ``'val'`` and ``'test'``,
            each holding a list of row dicts (passed to ``Dataset.from_list``).
        dataset_identifier: Hub repository id the dataset is pushed to.

    Authentication:
        The access token is read from the ``HF_TOKEN`` environment variable.
        If it is unset, ``login`` is called without a token and falls back to
        its own prompt.
    """
    import os

    from datasets import DatasetDict, Dataset
    from huggingface_hub import login

    # SECURITY: never hard-code an access token in the notebook. Any token
    # that was ever committed in source must be treated as leaked and revoked.
    api_token = os.environ.get('HF_TOKEN')

    # Log in to the Hugging Face Hub
    login(token=api_token)

    dataset = DatasetDict({
        'train': Dataset.from_list(data['train']),
        'val': Dataset.from_list(data['val']),
        'test': Dataset.from_list(data['test']),
    })
    dataset.push_to_hub(dataset_identifier)

    # Print some information about the dataset
    print(dataset)

In [10]:
# Route prefixes keyed by route id. Every prefix has the form
# '<rN> <x> <y> ' (with a trailing space), where N equals the key.
routes = {
    route_id: f'<r{route_id}> <{first}> <{second}> '
    for route_id, (first, second) in enumerate([
        ('en', 'en'),
        ('de', 'de'),
        ('it', 'it'),
        ('en', 'de'),  # 3 <en> <en> <en_input>; 3 <de> <en> <en_input>; 3 <de> <de> <de_input> ; 3 <en> <de> <de_input> ;
        ('en', 'it'),  # 4 <en> <en> <en_input>; 4 <it> <en> <en_input>; 4 <it> <it> <it_input> ; 4 <en> <it> <it_input> ;
        ('de', 'en'),  # 5 <de> <de> <de_input>; 5 <en> <de> <de_input>; 5 <en> <en> <en_input> ; 5 <de> <en> <en_input> ;
        ('de', 'it'),  # 6 <de> <de> <de_input>; 6 <it> <de> <de_input>; 6 <it> <it> <it_input> ; 6 <de> <it> <it_input> ;
        ('it', 'en'),  # 7 <it> <it> <it_input>; 7 <en> <it> <it_input>; 7 <en> <en> <en_input> ; 7 <it> <en> <en_input> ;
        ('it', 'de'),  # 8 <it> <it> <it_input>; 8 <de> <it> <it_input>; 8 <de> <de> <de_input> ; 8 <it> <de> <de_input> ;
    ])
}

In [11]:
# Route prefixes that are prepended directly to the task instruction
# (e.g. r[0] + nlg['en']), so every entry must end with a space.
# NOTE(review): the '<r7>'/'<r8>' ids differ from the ids the `routes` dict
# gives the same language pairs ('<r4> <en> <it> ', '<r6> <de> <it> ').
# Left unchanged here because already-published datasets may depend on them;
# confirm which numbering is intended.
r = [
    '<r0> <en> <en> ',
    '<r1> <de> <de> ',
    '<r2> <it> <it> ',
    '<r3> <en> <de> ',
    '<r7> <en> <it> ',  # trailing space was missing, fusing the tag onto the instruction
    '<r8> <de> <it> '
]

In [12]:
# English samples with the '<r0> <en> <en> ' route prefix. The source split
# 'validation' is stored under the output key 'val'.
woz_dialogue_nlg_en = {
    out_key: generate_ft_llm_data(woz_dialogue_en, r[0]+nlg['en'], mode=source_split)
    for out_key, source_split in (("train", "train"), ("val", "validation"), ("test", "test"))
}
upload_to_hub(woz_dialogue_nlg_en, dataset_identifier='Jiahuan/nlg_en')

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/jpei/.cache/huggingface/token
Login successful


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/595 [00:00<?, ?B/s]

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'history'],
        num_rows: 1936
    })
    val: Dataset({
        features: ['instruction', 'input', 'output', 'history'],
        num_rows: 630
    })
    test: Dataset({
        features: ['instruction', 'input', 'output', 'history'],
        num_rows: 1246
    })
})


In [13]:
# German samples with the '<r1> <de> <de> ' route prefix. The source split
# 'validation' is stored under the output key 'val'.
woz_dialogue_nlg_de = {
    out_key: generate_ft_llm_data(woz_dialogue_de, r[1]+nlg['de'], mode=source_split)
    for out_key, source_split in (("train", "train"), ("val", "validation"), ("test", "test"))
}
upload_to_hub(woz_dialogue_nlg_de, dataset_identifier='Jiahuan/nlg_de')

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/jpei/.cache/huggingface/token
Login successful


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/595 [00:00<?, ?B/s]

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'history'],
        num_rows: 1925
    })
    val: Dataset({
        features: ['instruction', 'input', 'output', 'history'],
        num_rows: 630
    })
    test: Dataset({
        features: ['instruction', 'input', 'output', 'history'],
        num_rows: 1246
    })
})


In [14]:
# Italian samples with the '<r2> <it> <it> ' route prefix. The source split
# 'validation' is stored under the output key 'val'.
woz_dialogue_nlg_it = {
    out_key: generate_ft_llm_data(woz_dialogue_it, r[2]+nlg['it'], mode=source_split)
    for out_key, source_split in (("train", "train"), ("val", "validation"), ("test", "test"))
}
upload_to_hub(woz_dialogue_nlg_it, dataset_identifier='Jiahuan/nlg_it')

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/jpei/.cache/huggingface/token
Login successful


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/595 [00:00<?, ?B/s]

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'history'],
        num_rows: 1936
    })
    val: Dataset({
        features: ['instruction', 'input', 'output', 'history'],
        num_rows: 630
    })
    test: Dataset({
        features: ['instruction', 'input', 'output', 'history'],
        num_rows: 1246
    })
})


In [18]:
# Concatenate the English, German and Italian samples split by split
# (in that order) and publish the combined dataset.
nlg_all = {
    split: woz_dialogue_nlg_en[split] + woz_dialogue_nlg_de[split] + woz_dialogue_nlg_it[split]
    for split in ('train', 'val', 'test')
}
upload_to_hub(nlg_all, dataset_identifier='Jiahuan/nlg_all_en_de_it')

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/jpei/.cache/huggingface/token
Login successful


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/599 [00:00<?, ?B/s]

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'history'],
        num_rows: 5797
    })
    val: Dataset({
        features: ['instruction', 'input', 'output', 'history'],
        num_rows: 1890
    })
    test: Dataset({
        features: ['instruction', 'input', 'output', 'history'],
        num_rows: 3738
    })
})


In [24]:
# 3: '<r3> <en> <de> ', 
# 4: '<r7> <en> <it>',
# 5: '<r8> <de> <it> '
def generate_split_data(splits=('train', 'val', 'test')):
    """Build the mixed-route NLG dataset for the requested splits.

    For every split, samples are generated for nine (dataset, route prefix,
    instruction language) combinations and concatenated in a fixed order.

    Args:
        splits: Split names to build. ``'val'`` and ``'validation'`` are
            treated as the same split: data is read from the source split
            ``'validation'`` and stored under the output key ``'val'``.

    Returns:
        dict mapping output split name (``'train'``, ``'val'``, ``'test'``,
        or any other name passed in) to the concatenated list of samples.
    """
    # (source dataset, route prefix, language of the task instruction)
    recipe = [
        (woz_dialogue_en, r[0], 'en'),
        (woz_dialogue_de, r[1], 'de'),
        (woz_dialogue_it, r[2], 'it'),
        (woz_dialogue_en, r[3], 'en'),
        (woz_dialogue_de, r[3], 'de'),
        (woz_dialogue_en, r[4], 'en'),
        (woz_dialogue_it, r[4], 'it'),
        (woz_dialogue_de, r[5], 'de'),
        (woz_dialogue_it, r[5], 'it'),
    ]
    nlg_dataset = {}
    for split in splits:
        # The default split list uses 'val', but the source split is named
        # 'validation'; passing 'val' straight through would fail the lookup.
        source_split = 'validation' if split == 'val' else split
        output_key = 'val' if split == 'validation' else split
        samples = []
        for woz_dialogue, route_prefix, lang in recipe:
            samples += generate_ft_llm_data(woz_dialogue, route_prefix + nlg[lang], mode=source_split)
        nlg_dataset[output_key] = samples
    return nlg_dataset

In [27]:
# Build the mixed-route dataset; generate_split_data stores the 'validation'
# split under the key 'val', which is the key upload_to_hub reads.
nlg_mix = generate_split_data(splits=['train', 'validation', 'test'])
upload_to_hub(nlg_mix, dataset_identifier='Jiahuan/nlg_mix_en_de_it')

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/jpei/.cache/huggingface/token
Login successful


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/18 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'history'],
        num_rows: 17391
    })
    val: Dataset({
        features: ['instruction', 'input', 'output', 'history'],
        num_rows: 5670
    })
    test: Dataset({
        features: ['instruction', 'input', 'output', 'history'],
        num_rows: 11214
    })
})
