# Text Encoding (reddit post titles)

### *Advice: run this code in a gpu-enabled environment*

## Data Loading

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
import os
import pandas as pd

In [10]:
df_titles = pd.read_csv('/content/gdrive/MyDrive/ColabNotebooks/reddit-morality/Titles__(Sub_AITA)__(Redditor_active).tsv', sep='\t')

df_titles.dropna(inplace=True)
df_titles.drop_duplicates(inplace=True)

deleted_post_indices = [idx for idx, elem in zip(df_titles.index.tolist(), df_titles['title'].tolist()) if '[deleted by user]' in elem]
df_titles.drop(deleted_post_indices, inplace=True)

# df_titles: ['id', 'title', 'selftext']

# sanity check
assert len(df_titles) == len(list(set(df_titles['id'].tolist())))

In [11]:
def strip_prompt(text, prompt):    
    return text[len(prompt):] if text.lower().startswith(prompt.lower()) else text


def preproc_title(text):
    prompt_list = ['AITA for', 'AITA if', 'AITA when', 'AITA -', 'AITA-', 'AITA :', 'AITA:',
                   'WIBTA for', 'WIBTA if', 'WIBTA when', 'WIBTA -', 'WIBTA-', 'WIBTA :', 'WIBTA:',
                   'AITA', 'WIBTA']

    for _prompt in prompt_list:
        text = strip_prompt(text, _prompt)

    text = text.replace('AITA','').replace('WIBTA','').replace('?','.').strip()
    return text

In [12]:
all_ids = df_titles['id'].tolist()
all_situations = [preproc_title(elem) for elem in df_titles['title'].tolist()]

assert len(all_ids) == len(all_situations)
print('number of text to encode: %d'%len(all_situations))

number of text to encode: 59224


## Encode with sentence-transformers

In [None]:
!pip install -U sentence-transformers

In [15]:
import torch
import numpy as np
from collections import Counter
from tqdm import tqdm

from sentence_transformers import SentenceTransformer

In [16]:
# Setting up the device for GPU usage
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Current device:",device)

Current device: cuda


In [None]:
smodel = SentenceTransformer('all-MiniLM-L12-v2', device=device)

In [18]:
# Encode situations

batch_size = 32
num_steps = int(len(all_situations) / batch_size)

situ_reps = []
for idx in tqdm(range(num_steps)):
    curr_sents = all_situations[idx*batch_size:(idx+1)*batch_size] if idx < num_steps-1 else all_situations[idx*batch_size:]
    embeddings = smodel.encode(curr_sents)
    if len(embeddings.shape) == 1:
        embeddings = np.reshape(embeddings, (1, embeddings.shape[0]))
    situ_reps.append(embeddings)

situ_reps = np.concatenate(situ_reps)

100%|██████████| 1850/1850 [01:01<00:00, 30.07it/s]


In [19]:
import pickle
pickle.dump(all_ids, open('/content/gdrive/MyDrive/ColabNotebooks/reddit-morality/SubIDs__(Sub_AITA)__(Redditor_active).pkl', 'wb'))
pickle.dump(all_situations, open('/content/gdrive/MyDrive/ColabNotebooks/reddit-morality/Situations__(Sub_AITA)__(Redditor_active).pkl', 'wb'))

with open('/content/gdrive/MyDrive/ColabNotebooks/reddit-morality/SituationEncoding__(Sub_AITA)__(Redditor_active)__sbert.npy', 'wb') as f:
    np.save(f, situ_reps)