### Download data

In [None]:
import os

from datasets import load_dataset

# Pull the train and test splits of the "re_dial" dataset from its
# `refs/convert/parquet` revision.
ds_train = load_dataset("re_dial", revision="refs/convert/parquet", split="train")
ds_test = load_dataset("re_dial", revision="refs/convert/parquet", split="test")

# DataFrame.to_parquet does not create missing directories, so make sure the
# target folder exists before writing the local copies.
os.makedirs("data/raw", exist_ok=True)

ds_train.to_pandas().to_parquet("data/raw/train.parquet")
ds_test.to_pandas().to_parquet("data/raw/test.parquet")

### Verify Interim

In [None]:
import pandas as pd

# Interim parquet file to sanity-check.
interim_path = "data/processed/translated_ds_011.parquet"

# Load it and report how many rows it holds.
df = pd.read_parquet(interim_path)
row_count = df.shape[0]
print(f'total de linhas: {row_count}')

# non_translated_count = df['text_translated'].isna().sum()
# translated_count = len(df) - non_translated_count

# print(f'Total de mensagens traduzidas: {translated_count}')
# print(f'Total de mensagens não traduzidas: {non_translated_count}')


### Split Dataset

In [None]:
import pandas as pd
import numpy as np

# Load the full dataset.

PATH = 'data/raw/ds.parquet'
N_SPLITS = 10  # number of near-equal parts to write

df = pd.read_parquet(PATH)

# Split the rows into N_SPLITS contiguous parts. Row *positions* are
# partitioned instead of handing the DataFrame to np.array_split directly:
# that route relies on DataFrame.swapaxes, which recent pandas releases
# deprecate. The chunk boundaries are the same as before.
position_chunks = np.array_split(np.arange(len(df)), N_SPLITS)
splits = [df.iloc[positions] for positions in position_chunks]

# Save each part as its own parquet file (ds_001.parquet, ds_002.parquet, ...).
for idx, split in enumerate(splits, 1):
    filename = f'data/raw/ds_{idx:03}.parquet'
    split.to_parquet(filename, index=False)
    print(f"Saved {filename}")


### Example: generating batch samples

In [7]:
import os
import re
import pandas as pd

PATH = 'data/processed'
df_train = []
all_movies = []

# Gather every parquet shard in PATH except 'translated_ds_011.parquet',
# which a later cell loads separately as the test set.
# os.listdir returns names in arbitrary order, so sort them to make the row
# order of the concatenated frame reproducible between runs.
# NOTE(review): any other *.parquet file placed directly in PATH is also
# picked up here — confirm the folder only contains translated shards.
frames = []

for filename in sorted(os.listdir(PATH)):

    if filename.endswith('.parquet') and filename != 'translated_ds_011.parquet':
        frames.append(pd.read_parquet(os.path.join(PATH, filename)))

df = pd.concat(frames)

In [9]:
def id2text(text, movies):
    """Replace every ``@<digits>`` movie mention in ``text`` with the movie's name.

    Args:
        text: Message text that may contain mentions such as ``@12345``.
        movies: DataFrame with ``movieId`` and ``movieName`` columns. Ids are
            compared against the digit string taken from the mention, so
            ``movieId`` is presumably stored as strings — TODO confirm.

    Returns:
        The text with each mention swapped for the first matching
        ``movieName``, or ``'<unk>'`` when the id is not present in ``movies``.
    """
    def _name_for(match):
        # One lookup per mention; keep the first matching row.
        names = movies.loc[movies['movieId'] == match.group(1), 'movieName']
        return '<unk>' if names.empty else names.iloc[0]

    # Substitute in a single regex pass so each mention is replaced as a whole
    # token. Chained str.replace calls would let a short id (e.g. '@1') also
    # rewrite the beginning of a longer one (e.g. '@12').
    return re.sub(r'@(\d+)', _name_for, text)

In [10]:
# Build the movie lookup table used by id2text (which reads its 'movieId' and
# 'movieName' columns): flatten the per-row mention lists into one entry per
# mention, drop repeated and missing entries, then expand them into a frame.
mentions = df['movieMentions'].explode()
unique_mentions = mentions.drop_duplicates().dropna().reset_index(drop=True)
movies = pd.DataFrame(unique_mentions.tolist())

### V1 Dataset

In [11]:
#from string import punctuation
#from tqdm import tqdm

#dict_punctuation = {i: j for j, i in enumerate(punctuation)}

#df_train = []
# df = pd.read_parquet('data/processed/translated_ds_011.parquet')

# for _, row in tqdm(df.iterrows(), total=df.shape[0]):
#     df_explode = pd.DataFrame(row[['messages_translated']].explode().tolist())
#     # print(df_explode)
#     df_explode['text'] = df_explode.apply(lambda x: id2text(x['text'], movies), axis=1)
#     #print(df_explode.iloc[0])
#     worker_id = df_explode.iloc[0]['senderWorkerId']
#     instruction = ''
#     response = ''

#     changed = False

#     for index, message in df_explode.iterrows():
        
#         if changed == False:
#             if message['senderWorkerId'] == worker_id:
#                 instruction += message['text']
#                 if instruction[-1] not in dict_punctuation:
#                     instruction+='.'
#             else:
#                 changed = True
#                 response += message['text']
#                 if response[-1] not in dict_punctuation:
#                     response+='.'
#         else:
#             if message['senderWorkerId'] != worker_id:
#                 response += message['text']
#                 if response[-1] not in dict_punctuation:
#                     response+='.'
#             else:
#                 changed = False
#                 df_train.append({'initiator': instruction, 'respondant': response})
#                 response = ''
#                 instruction = message['text']
#                 if instruction[-1] not in dict_punctuation:
#                     instruction+='.'
# df_train = pd.DataFrame(df_train)

# def generate_sample(conversation):
#         return "<|system|>\n Você é um chatbot de recomendação de filmes, converse com o usuário para indicar filmes apropriados.</s>\n<|user|>\n" + conversation['initiator'] + "</s>\n<|assistant|>\n" + conversation['respondant'] + "</s>\n"

# df_train['sample'] = df_train.apply(lambda x: generate_sample(x), axis=1)
# df_train.drop(['initiator', 'respondant'], axis=1, inplace=True)

# df_train.to_parquet('data/processed/test.parquet', index=False)

100%|██████████| 9005/9005 [01:10<00:00, 126.91it/s]


### V2 Dataset

In [46]:
# Load the tokenizer inside this cell so it runs on a fresh kernel; no earlier
# cell in notebook order defines `tokenizer`.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-alpha")

# Show the chat template the tokenizer uses to render conversations.
tokenizer.chat_template

"{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"

In [53]:
# Import here as well: in notebook order, AutoTokenizer is otherwise only
# imported by a later cell, so this cell would fail on a fresh kernel.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-alpha")

In [54]:
# Small example conversation, written as (role, content) pairs.
example_turns = [
    ("system", "Você é um chatbot para indicação de filmes. Responda de maneira educada sugestões de filmes para os usuários."),
    ("user", "Hi there!"),
    ("assistant", "Nice to meet you!"),
    ("user", "Can I ask a question?"),
]
messages = [{"role": role, "content": content} for role, content in example_turns]

# Render the conversation with the tokenizer's chat template as plain text
# (tokenize=False) to inspect the resulting prompt format.
tokenizer.apply_chat_template(messages, tokenize=False)

'<|system|>\nVocê é um chatbot para indicação de filmes. Responda de maneira educada sugestões de filmes para os usuários.</s>\n<|user|>\nHi there!</s>\n<|assistant|>\nNice to meet you!</s>\n<|user|>\nCan I ask a question?</s>\n'

In [39]:
from transformers import AutoTokenizer
from string import punctuation
from tqdm import tqdm

# Checkpoint whose tokenizer provides the chat template applied by process_dataset.
ZEPHYR_CHECKPOINT = "HuggingFaceH4/zephyr-7b-alpha"
tokenizer = AutoTokenizer.from_pretrained(ZEPHYR_CHECKPOINT)

def process_dataset(df, movies):
    """Render each conversation in ``df`` as one chat-template string.

    For every row, the entries of ``messages_translated`` are expanded into a
    frame, movie mentions in each ``text`` are resolved through ``id2text``,
    and the turns are labelled by sender: whoever sent the first message is
    the "user", every other sender is the "assistant". A fixed system prompt
    is placed in front and the module-level ``tokenizer`` renders the
    conversation with ``tokenize=False``.

    Args:
        df: DataFrame with a ``messages_translated`` column whose entries
            provide ``text`` and ``senderWorkerId`` for each message.
        movies: Movie lookup table forwarded to ``id2text``.

    Returns:
        A list with one rendered conversation per row of ``df``.
    """
    system_prompt = "Você é um chatbot para indicação de filmes. Responda de maneira educada sugestões de filmes para os usuários."
    samples = []

    for _, conversation in tqdm(df.iterrows(), total=df.shape[0]):
        # One row per message of this conversation.
        turns = pd.DataFrame(conversation[['messages_translated']].explode().tolist())
        turns['text'] = turns.apply(lambda turn: id2text(turn['text'], movies), axis=1)

        # The sender of the first message plays the "user" role.
        initiator = turns.iloc[0]['senderWorkerId']

        chat = [{"role": "system", "content": system_prompt}]
        chat.extend(
            {"role": "user" if sender == initiator else "assistant", "content": content}
            for sender, content in zip(turns['senderWorkerId'], turns['text'])
        )

        samples.append(tokenizer.apply_chat_template(chat, tokenize=False))

    return samples

# Train: `df` holds the concatenated training shards and `movies` their
# mention table, both built in earlier cells.
df_train = process_dataset(df, movies)

# Test: read the held-out shard once and reuse the same frame for both the
# mention table and the rendered samples (it was previously read twice).
TEST_PATH = 'data/processed/translated_ds_011.parquet'
test_frame = pd.read_parquet(TEST_PATH)
movies_test = pd.DataFrame(test_frame['movieMentions'].explode().drop_duplicates().dropna().reset_index(drop=True).tolist())
df_test = process_dataset(test_frame, movies_test)

100%|██████████| 1342/1342 [00:08<00:00, 166.81it/s]


In [44]:
# to_parquet does not create missing directories, so make sure the output
# folder exists before writing.
OUTPUT_DIR = 'data/processed/colab/v2'
os.makedirs(OUTPUT_DIR, exist_ok=True)

# df_train / df_test are lists of rendered conversations; store each as a
# single-column ('sample') parquet file.
pd.DataFrame(df_train, columns=['sample']).to_parquet(os.path.join(OUTPUT_DIR, 'train.parquet'), index=False)
pd.DataFrame(df_test, columns=['sample']).to_parquet(os.path.join(OUTPUT_DIR, 'test.parquet'), index=False)