### Download data

In [None]:
from datasets import load_dataset

ds_train = load_dataset("re_dial", revision="refs/convert/parquet", split="train")
ds_test = load_dataset("re_dial", revision="refs/convert/parquet", split="test")

ds_train.to_pandas().to_parquet("data/raw/train.parquet")
ds_test.to_pandas().to_parquet("data/raw/test.parquet")

### Verify Interim

In [None]:
import pandas as pd

interim_path = 'data/processed/translated_ds_011.parquet'

df = pd.read_parquet(interim_path)
print(f'total de linhas: {len(df)}')

# non_translated_count = df['text_translated'].isna().sum()
# translated_count = len(df) - non_translated_count

# print(f'Total de mensagens traduzidas: {translated_count}')
# print(f'Total de mensagens não traduzidas: {non_translated_count}')


In [None]:
def reaggregate_messages_and_translation(group):

    messages = group.apply(
        lambda x: {
            "timeOffset": x['messages']["timeOffset"],
            "text": x['messages']["text"],
            "senderWorkerId": x['messages']["senderWorkerId"],
            "messageId": x['messages']["messageID"],
        },
        axis=1,
    ).tolist()

    messages_translated = group.apply(
        lambda x: {
            "timeOffset": x["timeOffset"],
            "text": x["text_translated"],
            "senderWorkerId": x["senderWorkerId"],
            "messageId": x["messageID"],
        },
        axis=1,
    ).tolist()

    # Colunas constantes dentro de cada grupo
    constant_values = {
        "movieMentions": group["movieMentions"].iloc[0],
        "respondentQuestions": group["respondentQuestions"].iloc[0],
        "respondentWorkerId": group["respondentWorkerId"].iloc[0],
        "initiatorWorkerId": group["initiatorWorkerId"].iloc[0],
        "initiatorQuestions": group["initiatorQuestions"].iloc[0],
    }

    return pd.Series(
        {
            "messages": messages,
            "messages_translated": messages_translated,
            **constant_values,
        }
    )


def reconstruct_dataset(df_messages: pd.DataFrame, number: int) -> pd.DataFrame:
    reconstructed_df = (
        df_messages.groupby("conversationId")
        .apply(reaggregate_messages_and_translation)
        .reset_index()
    )
    reconstructed_df.to_parquet(
        f"data/processed/translated_ds_{number:03}.parquet", index=False
    )

### Split Dataset

In [None]:
import pandas as pd
import numpy as np

# Carregando o dataset

PATH = 'data/raw/ds.parquet'

df = pd.read_parquet(PATH)

# Dividindo o dataframe em 10 partes
splits = np.array_split(df, 10)

# Salvando cada parte como um arquivo parquet separado
for idx, split in enumerate(splits, 1):
    filename = f'data/raw/ds_{idx:03}.parquet'
    split.to_parquet(filename, index=False)
    print(f"Saved {filename}")


### Example generating batchs samples

In [157]:
# split question x response explode

# iterate data/processed with .parquet end
import os
import re
import pandas as pd

PATH = 'data/processed'
df_train = []
all_movies = []

df = []

for filename in os.listdir(PATH):

    if filename.endswith('.parquet') and filename != 'translated_ds_011.parquet':
        df.append(pd.read_parquet(os.path.join(PATH, filename)))

df = pd.concat(df)

In [158]:
def id2text(text, movies):
    for movie_id in re.findall(r'@\d+', text):
        if movies[movies['movieId'] == movie_id[1:]].empty:
            movie_name = '<unk>'
        else:
            movie_name = movies[movies['movieId'] == movie_id[1:]]['movieName'].iloc[0]
        text = text.replace(movie_id, movie_name)
    return text

In [159]:
movies = pd.DataFrame(df['movieMentions'].explode().drop_duplicates().dropna().reset_index(drop=True).tolist())

In [170]:
from string import punctuation
from tqdm import tqdm

dict_punctuation = {i: j for j, i in enumerate(punctuation)}

df_train = []
df = pd.read_parquet('data/processed/translated_ds_011.parquet')

for _, row in tqdm(df.iterrows(), total=df.shape[0]):
    df_explode = pd.DataFrame(row[['messages_translated']].explode().tolist())
    #print(df_explode)
    df_explode['text'] = df_explode.apply(lambda x: id2text(x['text'], movies), axis=1)
    #print(df_explode.iloc[0])
    worker_id = df_explode.iloc[0]['senderWorkerId']
    instruction = ''
    response = ''

    changed = False

    # iterate over messages in conversation
    for index, message in df_explode.iterrows():
        
        if changed == False:
            if message['senderWorkerId'] == worker_id:
                instruction += message['text']
                if instruction[-1] not in dict_punctuation:
                    instruction+='.'
            else:
                changed = True
                response += message['text']
                if response[-1] not in dict_punctuation:
                    response+='.'
        else:
            if message['senderWorkerId'] != worker_id:
                response += message['text']
                if response[-1] not in dict_punctuation:
                    response+='.'
            else:
                changed = False
                df_train.append({'initiator': instruction, 'respondant': response})
                response = ''
                instruction = message['text']
                if instruction[-1] not in dict_punctuation:
                    instruction+='.'

            # df_train.append({'question': instruction, 'response': message['text']})
            # instruction = ''

  0%|          | 0/1342 [00:00<?, ?it/s]

100%|██████████| 1342/1342 [00:11<00:00, 115.19it/s]


In [171]:
df_train = pd.DataFrame(df_train)

In [172]:
df_train.head()

Unnamed: 0,initiator,respondant
0,"Olá, estou procurando por um filme como o Super Troopers (2001).",Você deveria assistir Police Academy (1984).
1,É um grande? Eu nunca vi isso. Eu já vi American Pie .Eu quero dizer American Pie (1999),"Sim, Police Academy (1984) é muito engraçado e Police Academy 2: Their First Assignment (1985) também."
2,Parece que eu preciso dar uma olhada neles.,"Sim, você vai gostar deles."
3,Eu agradeço seu tempo. Eu precisarei dar uma olhada nisso. Existem outros que você recomendaria?,Sim Lethal Weapon (1987)
4,"Obrigado, eu também vou assistir isso.",E também Beverly Hills Cop (1984)


In [173]:
def generate_sample(conversation):
        return "<|system|>\n Você é um chatbot de recomendação de filmes, converse com o usuário para indicar filmes apropriados.</s>\n<|user|>\n" + conversation['initiator'] + "</s>\n<|assistant|>\n" + conversation['respondant'] + "</s>\n"


In [174]:
df_train['sample'] = df_train.apply(lambda x: generate_sample(x), axis=1)
df_train.drop(['initiator', 'respondant'], axis=1, inplace=True)

In [175]:
df_train.to_parquet('data/processed/test.parquet', index=False)

In [176]:
df_train.shape

(6857, 1)

In [178]:
df_train['sample'].tolist()[0]

'<|system|>\n Você é um chatbot de recomendação de filmes, converse com o usuário para indicar filmes apropriados.</s>\n<|user|>\nOlá, estou procurando por um filme como o Super Troopers (2001).</s>\n<|assistant|>\nVocê deveria assistir Police Academy  (1984).</s>\n'