# Evaluation

In [1]:
import pandas as pd
import os
import re

# Load the translated conversations. Later cells read the "conversationId",
# "movieMentions" and "messages_translated" columns from this frame.
df = pd.read_parquet("data/processed/colab/v2/translated_ds_011.parquet")

### Example: generating batch samples

In [2]:
def id2text(text, movies):
    """Replace every ``@<digits>`` movie mention in ``text`` with the movie's name.

    Parameters
    ----------
    text : str
        Message text that may contain mentions such as ``@151656``.
    movies : pandas.DataFrame
        Lookup table with ``movieId`` and ``movieName`` columns.

    Returns
    -------
    str
        ``text`` with each mention replaced by the first matching
        ``movieName``, or by ``"<unk>"`` when no row matches.
    """

    def _name_for(match):
        # The digits are compared as a *string*; this assumes ``movieId`` holds
        # strings (a numeric column would never match) -- TODO confirm.
        rows = movies[movies["movieId"] == match.group(0)[1:]]
        return "<unk>" if rows.empty else rows["movieName"].iloc[0]

    # One regex pass replaces each mention exactly where it was matched.
    # A plain ``str.replace("@111", ...)`` would also rewrite the prefix of a
    # longer id such as "@1112" and corrupt it.
    return re.sub(r"@\d+", _name_for, text)

In [3]:
# Movie lookup table: flatten the per-conversation mention lists, keep each
# distinct non-null mention once, and expand the records into columns.
# Presumably each mention is a record with "movieId" and "movieName" fields
# (those are the columns `id2text` / `get_movie` read) -- verify against the data.
unique_mentions = df["movieMentions"].explode().drop_duplicates().dropna()
unique_mentions = unique_mentions.reset_index(drop=True)
movies = pd.DataFrame(unique_mentions.tolist())

In [4]:
# One row per translated message. `explode` keeps the conversation's original
# index label on every message row, so index values repeat.
df_messages = df.explode("messages_translated")

# Extract only the message fields that are kept. The original cell also pulled
# out "timeOffset" and "messageId", then dropped them again in the same cell.
df_messages["text"] = df_messages["messages_translated"].map(lambda msg: msg["text"])
df_messages["senderWorkerId"] = df_messages["messages_translated"].map(
    lambda msg: msg["senderWorkerId"]
)

# Drop the nested / conversation-level columns that are not needed per message.
# Non-mutating drop, so re-running the cell is idempotent.
df_messages = df_messages.drop(
    columns=[
        "messages",
        "messages_translated",
        "movieMentions",
        "respondentQuestions",
        "initiatorWorkerId",
        "initiatorQuestions",
    ]
)

df_messages

Unnamed: 0,conversationId,respondentWorkerId,text,senderWorkerId
0,20001,957,"Olá, estou procurando por um filme como o @111...",956
0,20001,957,Você deveria assistir @151656.,957
0,20001,957,É um grande? Eu nunca vi isso. Eu já vi @192131.,956
0,20001,957,Eu quero dizer @134643,956
0,20001,957,"Sim, @151656 é muito engraçado e @94688 também.",957
...,...,...,...,...
1341,23322,1082,@177387,1082
1341,23322,1082,"Obrigado pelas suas recomendações, tenha um bo...",1084
1341,23322,1082,Foi um prazer compartilhar filmes com você. Te...,1082
1341,23322,1082,Adeus,1082


In [25]:
# List the "@<digits>" movie mentions in every message. The regex is applied
# through the vectorised string accessor, so this cell does not depend on
# `has_movie`, which is only defined in a later cell (a fresh
# Restart & Run All would otherwise fail here with a NameError).
_df = df_messages.assign(has_movie=df_messages["text"].str.findall(r"@\d+"))

# Keep only the messages that mention more than three movies.
_df = _df[_df["has_movie"].str.len() > 3]

# Semi-join (not a left join): keep every message of any conversation that
# contains at least one such mention-heavy message.
final_df = df_messages[df_messages["conversationId"].isin(_df["conversationId"].unique())]
final_df.drop_duplicates().reset_index(drop=True)

Unnamed: 0,conversationId,respondentWorkerId,text,senderWorkerId
0,20041,958,Olá!,958
1,20041,958,Olá!,959
2,20041,958,Qual tipo de filmes você gosta?,958
3,20041,958,Procuro uma recomendação de filme. Quando eu e...,959
4,20041,958,"Oh, você gosta de filmes de terror?",958
...,...,...,...,...
423,23191,1059,Eu gosto do Christian Bale.,1093
424,23191,1059,"Bem, obrigado pelas sugestões.",1093
425,23191,1059,Você é bem-vindo,1059
426,23191,1059,Adeus,1059


### Filtering data

In [None]:
from string import punctuation
from tqdm import tqdm

# ASCII punctuation characters (string.punctuation), used only for membership
# tests. Non-ASCII marks are not included, so text ending in one still gets
# a "." appended.
dict_punctuation = {i: j for j, i in enumerate(punctuation)}


def _end_with_punctuation(text):
    """Return ``text`` with "." appended unless it is empty or already ends in punctuation."""
    if text and text[-1] not in dict_punctuation:
        return text + "."
    return text


# Accumulates one {"initiator": ..., "respondant": ...} record per exchange;
# converted to a DataFrame once after the loop.
df_test = []

for _, row in tqdm(df.iterrows(), total=df.shape[0]):
    df_explode = pd.DataFrame(row[["messages_translated"]].explode().tolist())
    # df_explode['text'] = df_explode.apply(lambda x: id2text(x['text'], movies), axis=1)

    # A conversation without message records yields no "senderWorkerId"
    # column; skip it instead of failing on the lookup below.
    if "senderWorkerId" not in df_explode.columns:
        continue

    # The sender of the first message is treated as the initiator.
    worker_id = df_explode.iloc[0]["senderWorkerId"]
    instruction = ""
    response = ""

    # True while the other participant's reply is being collected.
    changed = False

    for _, message in df_explode.iterrows():
        text = _end_with_punctuation(message["text"])
        if message["senderWorkerId"] != worker_id:
            # Consecutive reply messages are concatenated into one response.
            changed = True
            response += text
        elif changed:
            # The initiator speaks again: the previous exchange is complete.
            df_test.append({"initiator": instruction, "respondant": response})
            changed = False
            response = ""
            instruction = text
        else:
            # Consecutive initiator messages are concatenated into one turn.
            instruction += text

    # Flush the last exchange. Previously a pair was only stored when the
    # initiator spoke again, so the final reply of every conversation was lost.
    if changed:
        df_test.append({"initiator": instruction, "respondant": response})

df_test = pd.DataFrame(df_test)


# df_test.drop(['initiator', 'respondant'], axis=1, inplace=True)

In [7]:
import re


def has_movie(text):
    """Return the list of ``@<digits>`` movie mentions found in ``text``.

    Despite the name, this returns the matches themselves (an empty list when
    there are none), not a boolean.
    """
    mention_pattern = re.compile(r"@\d+")
    return mention_pattern.findall(text)


def get_movie(list_movies, movies):
    """Translate a list of ``@<digits>`` mentions into movie names.

    Parameters
    ----------
    list_movies : iterable
        Mentions such as ``"@151656"``; ``None`` entries are allowed.
    movies : pandas.DataFrame
        Lookup table with ``movieId`` and ``movieName`` columns.

    Returns
    -------
    list
        One entry per input, in order: ``""`` for ``None``, ``"<unk>"`` when
        no row matches, otherwise the first matching ``movieName``.
    """
    movies_text = []
    for movie_id in list_movies:
        if movie_id is None:
            movies_text.append("")
            continue
        # Strip the leading "@" and compare as a string. This assumes
        # ``movieId`` holds strings -- TODO confirm.
        rows = movies[movies["movieId"] == movie_id[1:]]
        movies_text.append("<unk>" if rows.empty else rows["movieName"].iloc[0])
    return movies_text

In [None]:
# Order matters: mentions must be extracted from the reply while it still
# contains the raw "@<id>" tokens, i.e. before `id2text` rewrites them.
df_test["has_movie"] = df_test["respondant"].map(has_movie)

# Replace the "@<id>" tokens with movie names in both sides of each exchange.
df_test["initiator"] = df_test["initiator"].map(lambda turn: id2text(turn, movies))
df_test["respondant"] = df_test["respondant"].map(lambda turn: id2text(turn, movies))

# Finally turn the extracted mention ids into a list of movie names.
df_test["has_movie"] = df_test["has_movie"].map(
    lambda mentions: get_movie(mentions, movies)
)

In [None]:
# Display the assembled exchanges ("initiator", "respondant", "has_movie").
df_test

In [None]:
# Exchanges whose reply mentions at least one movie.
# Note: `_df` is also assigned in an earlier cell; this overwrites it.
_df = df_test.loc[df_test["has_movie"].str.len().ne(0)]

In [None]:
# Show the exchanges whose reply mentions more than three movies.
_df.loc[_df["has_movie"].str.len().gt(3)]

In [None]:
# Write the dataset to the train and test parquet files.
# NOTE(review): `columns=["sample"]` selects a column named "sample", but the
# cells above only give `df_test` the columns "initiator", "respondant" and
# "has_movie". Unless a "sample" column is created elsewhere, the written
# frames will contain only missing values -- confirm the intended schema.
# NOTE(review): train and test are written from the same frame, so the two
# files are identical; no split is performed here -- confirm this is intended.
pd.DataFrame(df_test, columns=["sample"]).to_parquet(
    "data/processed/colab/v2/train.parquet", index=False
)
pd.DataFrame(df_test, columns=["sample"]).to_parquet(
    "data/processed/colab/v2/test.parquet", index=False
)

In [None]:
# Sanity check: read the test file back and show its (rows, columns) shape.
pd.read_parquet("data/processed/colab/v2/test.parquet").shape