In [1]:
from processing import (
    extract_domain_features,
    add_processed_essay_columns,
    process_essay_with_conclusion,
)
import pandas as pd
pd.set_option("display.max_columns", None)
import os
import transformers
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the pretrained sentence-embedding checkpoint; this `model` object is what
# the embedding cells further down pass to `generate_embeddings`.
# NOTE(review): the essay text shown in this notebook's output appears to be
# Portuguese — confirm this checkpoint is appropriate for non-English input, or
# evaluate a multilingual checkpoint instead.
model = SentenceTransformer("all-mpnet-base-v2")

In [3]:
# Read the train / validation / test splits from the 02_output directory.
# Tuple unpacking consumes the map in order, so each path lands in the
# variable at the same position.
df_train, df_valid, df_test = map(
    pd.read_parquet,
    (
        "../02_output/train.parquet",
        "../02_output/valid.parquet",
        "../02_output/test.parquet",
    ),
)

In [4]:
from tqdm.auto import tqdm

# Keep the three splits in one mapping so they all go through the same steps.
dataframes = dict(train=df_train, valid=df_valid, test=df_test)

# For every split: derive the processed essay columns from "essay", then run
# the domain feature extraction on that result. Only existing keys are
# reassigned, so the dict can safely be updated while iterating over it.
for name, df in tqdm(dataframes.items(), desc="Processing dataframes"):
    dataframes[name] = extract_domain_features(
        add_processed_essay_columns(df, "essay")
    )

# Rebind the per-split variables to the processed frames.
df_train, df_valid, df_test = (
    dataframes[key] for key in ("train", "valid", "test")
)

Processing dataframes: 100%|██████████| 3/3 [00:01<00:00,  1.89it/s]


In [7]:
def generate_embeddings(df, model, columns_to_encode):
    """
    Add one embedding column per requested text column.

    Every value of each listed column is passed, one at a time, to
    ``model.encode`` (progress bar disabled, ``convert_to_tensor=False``) and
    the result is stored in a new column named ``"<column>_embedding"``.
    The input frame is left untouched; all work happens on a copy.

    Args:
        df: pandas DataFrame holding the columns to encode.
        model: object exposing ``encode(value, show_progress_bar=...,
            convert_to_tensor=...)`` — in this notebook a SentenceTransformer.
        columns_to_encode: iterable of column names present in ``df``.

    Returns:
        A copy of ``df`` with the additional ``*_embedding`` columns.
    """

    def _encode_value(value):
        # One encode call per cell value, with the same flags for every column.
        return model.encode(value, show_progress_bar=False, convert_to_tensor=False)

    enriched = df.copy()
    for source_column in columns_to_encode:
        enriched[f"{source_column}_embedding"] = enriched[source_column].apply(_encode_value)

    return enriched

In [8]:
# Text columns that each get a matching "<column>_embedding" column.
columns_to_encode = ["essay_full", "essay", "essay_conclusion", "prompt"]

# Encode the splits one after another, in train / valid / test order.
for split_name in ("train", "valid", "test"):
    dataframes[split_name] = generate_embeddings(
        dataframes[split_name], model, columns_to_encode
    )

# Update the previously defined per-split dataframes.
df_train = dataframes["train"]
df_valid = dataframes["valid"]
df_test = dataframes["test"]

In [9]:
# Save each split, including its embedding columns, as CSV (no index column).
# NOTE(review): the *_embedding columns hold array-like objects, which CSV can
# only store as text — confirm downstream readers parse them back correctly.
for csv_path, frame in {
    "../02_output/train_embeddings.csv": df_train,
    "../02_output/valid_embeddings.csv": df_valid,
    "../02_output/test_embeddings.csv": df_test,
}.items():
    frame.to_csv(csv_path, index=False)

In [10]:
df_train.head(2)

Unnamed: 0,prompt_id,title,essay,score,prompt,comp1,comp2,comp3,comp4,comp5,essay_full,essay_conclusion,first_person_total,enclisis_count,demonstrative_pronouns,tokens_count,first_person_per_token,enclisis_per_token,demonstrative_per_token,essay_full_embedding,essay_embedding,essay_conclusion_embedding,prompt_embedding
0,60,Reforma da previdência,[É notório que a reforma da previdência no Bra...,440,"Reforma da Previdência Social, ou simplesmente...",120,80,80,120,40,É notório que a reforma da previdência no Bras...,Colocando-se na balança os prós e os contras p...,0,0,1,149,0.0,0.0,0.006711,"[-0.056441665, 0.029744646, 0.017729225, -0.00...","[[-0.04907515, 0.00093634916, -0.005272345, -0...","[-0.041077167, -0.018350447, -0.008325448, -0....","[-0.06447542, -0.013312494, 0.022696443, -0.00..."
1,51,ANALFABETISMO E SOCIEDADE,"[Pode -se afirmar que a presença dos Jesuítas,...",480,São chamados de analfabetos funcionais os indi...,120,120,80,80,80,"Pode -se afirmar que a presença dos Jesuítas, ...",Diante a suma o Ministério da Educação deve im...,0,0,1,191,0.0,0.0,0.005236,"[-0.012735645, -0.0073190755, 0.021515086, -0....","[[0.0025596255, 0.020569874, 0.011128441, 0.00...","[-0.0050364053, -0.015260002, -0.012094379, -0...","[-0.024492957, -0.053801607, -0.030142495, 0.0..."
