# Extract solved TOMT questions and their gold answers

## Read the Reddit API dataset of the TOMT subreddit into a pandas DataFrame

In [None]:
import pandas as pd
import json
# Input file: gzip-compressed JSON Lines dump (one JSON object per line) that is read below.
path = '/mnt/ceph/storage/data-in-progress/data-research/web-search/false-memories/reddit-tomt/tomt-dataset-03-12-2022/reddit-tomt-submissions-with-comments.jsonl.gz'


In [None]:
import gzip
# Parse the dump line by line; each parsed JSON object becomes one DataFrame row.
# The encoding is pinned to UTF-8 so decoding does not depend on the locale's
# default, and blank lines are skipped instead of crashing json.loads.
with gzip.open(path, 'rt', encoding='utf-8') as f:
    d_all = [json.loads(line) for line in f if line.strip()]
df_orig = pd.DataFrame(d_all)

In [None]:
# Total number of TOMT question threads that were loaded
df_orig.shape[0]

## Rename columns to match the Yahoo! Answers data structure

In [None]:
# Rename in place: 'selftext' becomes 'content' and 'title' becomes 'subject'.
df_orig.rename(columns={'selftext': 'content', 'title': 'subject'}, inplace=True)

In [None]:
# Add the columns solved_utc, chosen_answer and links_on_answer_path, each
# initialised with empty strings. The inserts run in this order because every
# position refers to the frame as it looks after the previous insert.
for position, column in ((4, "solved_utc"), (8, "chosen_answer"), (9, "links_on_answer_path")):
    df_orig.insert(position, column, "")

## Gold Answer Extraction code

In [None]:
import re
# Module-level accumulators: get_links() appends URLs to `links`, and
# is_gold_answer() appends timestamps to `solved_dates`.
links, solved_dates = [], []

In [None]:
def get_links(comment):
    """Collect every http(s) URL that occurs in a comment body.

    ``comment`` is a mapping with a ``'body'`` entry; the body is converted
    with ``str()`` before matching. The matches are appended to the
    module-level ``links`` list and nothing is returned.
    """
    # NOTE(review): the character class `[$-_@.&+]` is a range from `$` to `_`,
    # so punctuation such as `)` or `]` directly after a URL is captured too.
    url_pattern = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*(),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    body = str(comment['body'])
    links.extend(match.group(0) for match in re.finditer(url_pattern, body))

In [None]:
# returns true if comment meets our Gold Answer criteria
def is_gold_answer(comment, author_q):
    """Return True if the question author confirmed ``comment`` in a direct reply.

    A direct reply (an entry of ``comment['comments']``) counts as a
    confirmation when its author equals ``author_q`` (compared as strings) and
    its lower-cased body contains one of "yes", "thank", "solved" or "amazing".
    This is a plain substring test, so e.g. "yesterday" also matches "yes".

    Side effect: for the first confirming reply, ``comment['created_utc']`` is
    appended to the module-level ``solved_dates`` list.

    Returns False when no reply confirms the comment.
    """
    confirmation_markers = ("yes", "thank", "solved", "amazing")
    asker = str(author_q)
    for reply in comment['comments']:
        # Cheap author check first; only the asker's replies need text matching.
        if str(reply['author']) != asker:
            continue
        reply_text = str(reply['body']).lower()
        if any(marker in reply_text for marker in confirmation_markers):
            solved_dates.append(comment['created_utc'])
            return True
    return False

In [None]:
def find_gold_answer_rek(qa, author):
    """Search the comment tree below ``qa`` depth-first for the gold answer.

    Every entry of ``qa['comments']`` is first tested with ``is_gold_answer``;
    if that fails, its own replies are searched recursively. The body of the
    first match is returned, or None if the tree holds no gold answer.

    Side effect: ``get_links`` is called on the matching comment and on every
    ancestor comment on the path down to it.
    """
    for reply in qa['comments']:
        if is_gold_answer(reply, author):
            found = reply['body']
        else:
            found = find_gold_answer_rek(reply, author)
            if found is None:
                # nothing below this comment, try the next sibling
                continue
        # this comment is the gold answer or lies on the path leading to it
        get_links(reply)
        return found
    return None

In [None]:
def find_gold_answer(qa):
    """Return the body of the gold answer of question ``qa``, or None.

    ``qa`` is one submission record (a mapping or DataFrame row) with the
    entries ``author``, ``link_flair_text``, ``num_comments`` and ``comments``.

    None is returned when the question is not flagged 'Solved' / 'Solved!',
    when its author is '[deleted]', when it has no comments, or when no
    comment in the tree meets the gold-answer criteria.

    The comment tree is searched by ``find_gold_answer_rek``, which also
    collects the links on the answer path and the answer timestamp through
    ``get_links`` and ``is_gold_answer``.
    """
    # neglect not-"Solved"-labeled questions
    # alternative: only neglect "Open" questions (link_flair_text == 'Open')
    if qa['link_flair_text'] not in ('Solved', 'Solved!'):
        return None

    author = qa['author']
    if author == '[deleted]':
        return None

    if qa['num_comments'] > 0:
        # The top-level comments are searched exactly like any nested level,
        # so the recursive helper is reused instead of duplicating its loop.
        return find_gold_answer_rek(qa, author)
    return None

In [None]:
# Collapse the two flair spellings: every exact 'Solved!' value becomes 'Solved'.
df_orig['link_flair_text'] = df_orig['link_flair_text'].replace({'Solved!': 'Solved'})

In [None]:
# Sanity check: this selection should be empty, i.e. no row still carries 'Solved!'.
df_orig[df_orig['link_flair_text'].eq('Solved!')]

In [None]:
# iterate through TOMT subreddit questions and extract Gold Answers

no_solved = 0
no_unsolved = 0

res = []  # not used in this cell; kept in case another cell reads it
for pos, idx in enumerate(df_orig.index):
    # get_links() and is_gold_answer() append to these module-level lists, so
    # they are rebound to fresh lists for every question. Rebinding (not
    # clearing) matters: the previous `links` list is stored in the frame.
    links = []
    solved_dates = []
    # Read by position and write by the matching index label, so the loop
    # stays correct even if the frame does not have a default 0..n-1 index.
    gold_answer = find_gold_answer(df_orig.iloc[pos])
    if gold_answer is None:
        no_unsolved += 1
        df_orig.at[idx, 'chosen_answer'] = ''
        df_orig.at[idx, 'links_on_answer_path'] = []
        df_orig.at[idx, 'solved_utc'] = ''
    else:
        no_solved += 1
        df_orig.at[idx, 'chosen_answer'] = gold_answer
        df_orig.at[idx, 'links_on_answer_path'] = links
        # The search stops at the answer it returns, so the most recently
        # recorded timestamp is the one that belongs to that answer.
        df_orig.at[idx, 'solved_utc'] = solved_dates[-1]
print(no_solved) #513 484
print(no_unsolved) #765 941

In [None]:
# Preview the extracted fields. The 'title' column was renamed to 'subject'
# earlier in this notebook, so selecting 'title' would raise a KeyError.
df_orig[['created_utc', 'link_flair_text', 'solved_utc', 'subject', 'content', 'chosen_answer', 'links_on_answer_path']]

## Write the new Reddit TOMT dataset with chosen_answer, links_on_answer_path and solved_utc to file

In [None]:
# Output file for the extended dataset (gzip-compressed JSON Lines, written below).
# Alternative target name, currently disabled:
#pathOut = '/mnt/ceph/storage/data-in-progress/data-research/web-search/false-memories/reddit-tomt/tomt-dataset-26-01-2023/reddit-tomt-submissions-with-comments.jsonl.gz'
pathOut = '/mnt/ceph/storage/data-in-progress/data-research/web-search/false-memories/reddit-tomt/tomt-dataset-26-01-2023/reddit-tomt-submissions.jsonl.gz'

In [None]:
import os
import gzip

# Delete the output of an earlier run. Attempting the removal directly avoids
# the gap between checking for the file and deleting it.
try:
    os.remove(pathOut)
except FileNotFoundError:
    pass  # nothing to clean up

### Write dataset

In [None]:
# `tqdm` is not imported anywhere in this notebook, which made this cell fail
# with a NameError. Use it for a progress bar when it is installed and fall
# back to plain iteration otherwise.
try:
    from tqdm import tqdm
except ImportError:
    def tqdm(iterable, **_kwargs):
        return iterable

# Write one JSON object per line, UTF-8 encoded, into the gzip file.
with gzip.open(pathOut, 'wb') as f_out:
    for _, row in tqdm(df_orig.iterrows(), total=len(df_orig)):
        f_out.write((json.dumps(row.to_dict()) + '\n').encode('utf8'))