In [1]:
import os
import pickle
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity

# Read Data

In [2]:
# Labeled post table: load it and report its dimensions right away.
all_posts_df = pd.read_csv('../data/merged_q/all_posts_max_len_40_labeled.csv')
print('all_posts_df shape = {}'.format(all_posts_df.shape))

# Labeled reply table, handled the same way.
all_replies_df = pd.read_csv('../data/merged_q/all_replies_max_len_40_labeled.csv')
print('all_replies_df shape = {}'.format(all_replies_df.shape))

all_posts_df shape = (152680, 6)
all_replies_df shape = (838785, 6)


In [3]:
# Precomputed embedding matrix for the posts (one row per row of all_posts_df, per the shapes printed).
all_posts_embed = np.load('../data/merged_q/all_posts_max_len_40_embed.npy')
print('all_posts_embed shape = {}'.format(all_posts_embed.shape))

# Precomputed embedding matrix for the replies.
all_replies_embed = np.load('../data/merged_q/all_replies_max_len_40_embed.npy')
print('all_replies_embed shape = {}'.format(all_replies_embed.shape))

all_posts_embed shape = (152680, 768)
all_replies_embed shape = (838785, 768)


In [4]:
# Load the precomputed post clusters. The unpickled object is indexed by a similarity
# threshold; the 0.85 entry is the collection of clusters used throughout this notebook.
# NOTE(review): pickle.load can execute arbitrary code -- only open cluster files produced by this project.
with open('../data/merged_q/all_posts_max_len_40_clusters/all_posts_clusters.pickle', 'rb') as f:
    all_posts_clusters = pickle.load(f)
print('Num of post clusters with threshold 0.85 = {}'.format(len(all_posts_clusters[0.85])))

Num of post clusters with threshold 0.85 = 6902


In [5]:
# Load the precomputed reply clusters. The unpickled object is indexed by a similarity
# threshold; the 0.8 entry is the one used later to derive reply "degrees".
# NOTE(review): pickle.load can execute arbitrary code -- only open cluster files produced by this project.
with open('../data/merged_q/all_replies_max_len_40_clusters/all_replies_clusters_combine_1_centroid.pickle', 'rb') as f:
    all_replies_clusters = pickle.load(f)
print('Num of reply clusters with threshold 0.80 = {}'.format(len(all_replies_clusters[0.8])))

Num of reply clusters with threshold 0.80 = 35542


In [6]:
# Held-out test posts: the labeled table, plus an index array that is used later to
# select the matching rows of the full post embedding matrix.
test_posts_df = pd.read_csv('../data/test/posts_labeled_41.csv')
test_posts_indices = np.load('../data/test/posts_indices.npy')

# Both files must describe the same number of posts.
assert len(test_posts_indices) == len(test_posts_df)
test_posts_df

Unnamed: 0,id,summarized,text,root,length,emotion
0,r2w43k,0,Do you ever feel like everyone else has it tog...,feel,13,questioning
1,r2qqyz,0,I've finally figured out the answer to the que...,figure,21,questioning
2,r2doqc,0,I cooked the entree for my Friendsgiving for t...,cook,14,proud
3,r2cedp,1,"At my job where I work with kids 5-13, I start...",start,23,prepared
4,r1zcsu,1,Having enough food and water.,have,6,prepared
...,...,...,...,...,...,...
15207,hit:12318_conv:24637_uttr:1,0,"I don't like scammy sales people, nothing good...",come,13,trusting
15208,hit:12384_conv:24768_uttr:1,0,"I almost got caught stealing, I felt pretty bad",feel,10,guilty
15209,hit:12390_conv:24781_uttr:1,0,Some random dude walked into my room while I w...,walk,19,surprised
15210,hit:12398_conv:24797_uttr:1,1,I saw my favourite candy at the store today!,see,10,excited


In [7]:
# Held-out test replies: the labeled table and its companion index array.
test_replies_df = pd.read_csv('../data/test/replies_labeled_41.csv')
test_replies_indices = np.load('../data/test/replies_indices.npy')

# Both files must describe the same number of replies.
assert len(test_replies_indices) == len(test_replies_df)
test_replies_df

Unnamed: 0,id,parent_id,summarized,text,length,emotion
0,hm61y7k,r2qqyz,0,"Man, I wouldn't know where to get started.",11,acknowledging
1,hm63dh1,r2qqyz,1,"Having said that, The Nice Guys would always b...",14,agreeing
2,hm70wwb,r2w43k,1,I hope this kinda helps.,6,encouraging
3,hm7608z,r2w43k,1,One of the gifts of middle age is that I have ...,18,jealous
4,hm49qhc,r2doqc,1,Happy Thanksgiving :),3,wishing
...,...,...,...,...,...,...
82279,hit:12318_conv:24637_uttr:2,hit:12318_conv:24637_uttr:1,0,"I agree, were you recently screwed by one?",10,questioning
82280,hit:12384_conv:24768_uttr:2,hit:12384_conv:24768_uttr:1,1,Ouch!,2,disgusted
82281,hit:12390_conv:24781_uttr:2,hit:12390_conv:24781_uttr:1,1,Oh wow that is weird.,6,surprised
82282,hit:12398_conv:24797_uttr:2,hit:12398_conv:24797_uttr:1,1,Oh nice.,3,acknowledging


# Preprocess

In [8]:
# Pick the embedding rows that belong to the test posts (all columns kept).
test_posts_embed = all_posts_embed[test_posts_indices, :]
print(test_posts_embed.shape)

(15212, 768)


In [9]:
# Every row position of all_posts_df that is NOT a test post.
other_posts_indices = set(range(len(all_posts_df))).difference(test_posts_indices)
print(len(other_posts_indices))

137468


In [10]:
# Collect every non-head member ("minion") of the 0.85-threshold post clusters.
# The first element of each cluster, c[0], is skipped here and is the key under
# which the cluster is registered later in the notebook.
minion_posts_indices = set()
for c in all_posts_clusters[0.85]:
    # update() grows the set in place; the previous union() call rebuilt a full
    # copy of the accumulated set on every iteration.
    minion_posts_indices.update(c[1:])
print(len(minion_posts_indices))

18619


In [11]:
# Candidate pool for retrieval: non-test posts with the cluster minions removed,
# stored as an ascending array of row positions.
other_posts_indices_c = np.array(sorted(other_posts_indices.difference(minion_posts_indices)))
print(other_posts_indices_c.shape)

(120675,)


In [12]:
# Embedding rows of the candidate pool, in the same order as other_posts_indices_c.
other_posts_embed = all_posts_embed[other_posts_indices_c, :]
print(other_posts_embed.shape)

(120675, 768)


In [13]:
# Index each 0.85-threshold post cluster by its first element so that a retrieved
# post can be expanded to its whole cluster with one dictionary lookup.
all_posts_clusters_dict = {
    cluster[0]: cluster
    for cluster in all_posts_clusters[0.85]
}

In [14]:
# "Degree" of a reply = size of the reply cluster it belongs to (1 when unclustered).
# Cluster members are row positions into all_replies_df.
reply_ids = all_replies_df['id'].tolist()
reply_degrees = {reply_id: 1 for reply_id in reply_ids}
for c in tqdm(all_replies_clusters[0.8]):
    # Singleton clusters are not expected in the cluster file.
    assert len(c) >= 2
    cluster_size = len(c)
    for i in c:
        # Look the id up in the plain list built above; the previous
        # all_replies_df.iloc[i]['id'] materialised a whole row Series per member.
        reply_degrees[reply_ids[i]] = cluster_size

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 35542/35542 [00:06<00:00, 5110.14it/s]


# Calculate Similarity

In [15]:
# cos_sim = cosine_similarity(test_posts_embed, other_posts_embed)

In [16]:
# cos_sim.shape

In [17]:
# Similarity matrix between test posts and candidate posts. The computation and the
# save call are kept commented out (presumably run once); the cached array is loaded instead.
# np.save('../data/test/result/posts_cos_sim.npy', cos_sim)
cos_sim = np.load('../data/test/result/posts_cos_sim.npy')

In [18]:
# For every row of cos_sim, the column with the highest similarity (last entry of a
# row-wise argsort). The commented lines are the recipe for the cached file loaded below.
# cos_sim_top_idx = np.argsort(cos_sim, axis = 1)[:,-1]
# np.save('../data/test/result/posts_cos_sim_top_idx.npy', cos_sim_top_idx)
cos_sim_top_idx = np.load('../data/test/result/posts_cos_sim_top_idx.npy')

In [19]:
# Translate the best-match columns of cos_sim back to row positions in all_posts_df
# via other_posts_indices_c. The commented lines are the recipe for the cached file.
# NOTE(review): the cache is only valid while other_posts_indices_c is unchanged -- confirm before reuse.
# cos_sim_top_idx_orig = other_posts_indices_c[cos_sim_top_idx]
# np.save('../data/test/result/posts_cos_sim_top_idx_orig.npy', cos_sim_top_idx_orig)
cos_sim_top_idx_orig = np.load('../data/test/result/posts_cos_sim_top_idx_orig.npy')

# Generate All Responses

In [25]:
# Spot check: the fourth test post, followed by the post retrieved as its nearest neighbour.
print(test_posts_df['text'].iloc[3])
print(all_posts_df['text'].iloc[cos_sim_top_idx_orig[3]])

At my job where I work with kids 5-13, I started one day a week to paint their nails.
Last friday I found out my job don't mind men painting their nails and now I really want to paint it.


In [26]:
# Output columns of the full retrieval run, grouped as: query post, retrieved post,
# highest-degree / random replies, intent-based reply, emotion-following reply.
result_cols = [
    'post_id', 'post',
    'most_sim_post_id', 'most_sim_post_emo', 'most_sim_post', 'cos_sim',
    'highest_degree_id', 'highest_degree',
    'random_id', 'random',
    'intent_id', 'intent_emo', 'intent',
    'follow_id', 'follow_emo', 'follow',
]

# One independent, initially empty list per output column.
result_dict = {col: [] for col in result_cols}

In [27]:
# Labels counted as empathetic response intents; replies carrying one of these
# labels are preferred by the intent-based selection rule.
emp_intents = [
    'agreeing',
    'acknowledging',
    'encouraging',
    'consoling',
    'sympathizing',
    'suggesting',
    'questioning',
    'wishing',
]

In [28]:
# Groups of labels treated as similar to one another (one group per line).
emo_groups = [
    ['prepared', 'confident', 'proud'],
    ['content', 'hopeful', 'anticipating'],
    ['joyful', 'excited'],
    ['caring'],
    ['faithful', 'trusting', 'grateful'],
    ['jealous', 'annoyed', 'angry', 'furious'],
    ['terrified', 'afraid', 'anxious', 'apprehensive'],
    ['disgusted'],
    ['ashamed', 'guilty', 'embarrassed'],
    ['devastated', 'sad', 'disappointed', 'nostalgic', 'lonely'],
    ['surprised'],
    ['impressed'],
    ['sentimental'],
    ['neutral'],
    ['agreeing', 'acknowledging'],
    ['encouraging'],
    ['consoling', 'sympathizing'],
    ['suggesting'],
    ['questioning'],
    ['wishing'],
]

# Map every label to the OTHER labels of its group (empty list for one-label groups).
emo_group_dict = {
    emo: [other for other in group if other != emo]
    for group in emo_groups
    for emo in group
}
emo_group_dict

{'prepared': ['confident', 'proud'],
 'confident': ['prepared', 'proud'],
 'proud': ['prepared', 'confident'],
 'content': ['hopeful', 'anticipating'],
 'hopeful': ['content', 'anticipating'],
 'anticipating': ['content', 'hopeful'],
 'joyful': ['excited'],
 'excited': ['joyful'],
 'caring': [],
 'faithful': ['trusting', 'grateful'],
 'trusting': ['faithful', 'grateful'],
 'grateful': ['faithful', 'trusting'],
 'jealous': ['annoyed', 'angry', 'furious'],
 'annoyed': ['jealous', 'angry', 'furious'],
 'angry': ['jealous', 'annoyed', 'furious'],
 'furious': ['jealous', 'annoyed', 'angry'],
 'terrified': ['afraid', 'anxious', 'apprehensive'],
 'afraid': ['terrified', 'anxious', 'apprehensive'],
 'anxious': ['terrified', 'afraid', 'apprehensive'],
 'apprehensive': ['terrified', 'afraid', 'anxious'],
 'disgusted': [],
 'ashamed': ['guilty', 'embarrassed'],
 'guilty': ['ashamed', 'embarrassed'],
 'embarrassed': ['ashamed', 'guilty'],
 'devastated': ['sad', 'disappointed', 'nostalgic', 'lonely

In [29]:
# Retrieval over the whole test set.
#
# For every test post:
#   1. take its most similar candidate post (precomputed in cos_sim_top_idx_orig),
#   2. expand that post to its 0.85-threshold cluster, if it heads one,
#   3. gather all replies to those posts as candidates, and
#   4. pick four replies: highest degree (ties broken at random), uniformly random,
#      empathetic intent (random fallback), and emotion following the retrieved post
#      (same label -> similar label -> random fallback).
#
# Changes compared with the previous version of this cell:
#   * Test posts are dropped when a cluster is expanded. Clusters were built over ALL
#     posts, so a cluster can contain held-out posts (possibly the query itself), and
#     their replies must not be retrievable.
#   * Each row is assembled completely before being stored and result_dict is
#     overwritten at the end, so the columns always have equal length and re-running
#     the cell does not append duplicate rows.
#   * A query whose candidate set is empty keeps None in its reply columns instead of
#     raising IndexError; an emotion label missing from emo_group_dict is treated as
#     having no similar labels instead of raising KeyError.
#   * Degrees and the tie count are computed with vectorised pandas operations.
#
# NOTE(review): np.random is not seeded in this notebook, so the picks differ between runs.


def _pick_random_row(frame):
    """Return one uniformly drawn row of the non-empty DataFrame ``frame`` as a Series."""
    return frame.iloc[np.random.choice(frame.shape[0], 1, replace = False)[0]]


def _first_non_empty(*frames):
    """Return the first DataFrame with at least one row; the last one if all are empty."""
    for frame in frames[:-1]:
        if frame.shape[0] != 0:
            return frame
    return frames[-1]


# Row positions (in all_posts_df) of the held-out test posts.
held_out_posts = set(test_posts_indices.tolist())

result_rows = []
for i in tqdm(range(test_posts_df.shape[0])):
    query = test_posts_df.iloc[i]
    most_sim_post_idx = cos_sim_top_idx_orig[i]
    neighbour = all_posts_df.iloc[most_sim_post_idx]
    most_sim_post_emo = neighbour['emotion']

    # Start from an all-None row so every column is present even if no reply is found.
    row = dict.fromkeys(result_dict)
    row.update({
        'post_id': query['id'],
        'post': query['text'],
        'most_sim_post_id': neighbour['id'],
        'most_sim_post_emo': most_sim_post_emo,
        'most_sim_post': neighbour['text'],
        'cos_sim': cos_sim[i, cos_sim_top_idx[i]],
    })
    result_rows.append(row)

    # Expand to the cluster headed by the retrieved post, without held-out posts.
    cluster = all_posts_clusters_dict.get(most_sim_post_idx, [most_sim_post_idx])
    posts_idx = [p for p in cluster if p == most_sim_post_idx or p not in held_out_posts]

    posts_ids = all_posts_df.iloc[posts_idx]['id'].tolist()
    cand_replies_df = all_replies_df[all_replies_df['parent_id'].isin(posts_ids)].copy()
    if cand_replies_df.shape[0] == 0:
        # Nothing to retrieve for this query; its reply columns stay None.
        continue

    cand_replies_df['degree'] = cand_replies_df['id'].map(reply_degrees)

    # Highest degree and random
    cand_replies_df = cand_replies_df.sort_values(by = 'degree', ascending = False)
    top_degree_df = cand_replies_df[cand_replies_df['degree'] == cand_replies_df['degree'].iloc[0]]
    hd_reply = _pick_random_row(top_degree_df)
    rand_reply = _pick_random_row(cand_replies_df)

    # Empathetic intents and follow emotions
    emotions = cand_replies_df['emotion']
    intent_reply = _pick_random_row(_first_non_empty(
        cand_replies_df[emotions.isin(emp_intents)],
        cand_replies_df,
    ))
    follow_reply = _pick_random_row(_first_non_empty(
        cand_replies_df[emotions == most_sim_post_emo],
        cand_replies_df[emotions.isin(emo_group_dict.get(most_sim_post_emo, []))],
        cand_replies_df,
    ))

    row.update({
        'highest_degree_id': hd_reply['id'],
        'highest_degree': hd_reply['text'],
        'random_id': rand_reply['id'],
        'random': rand_reply['text'],
        'intent_id': intent_reply['id'],
        'intent_emo': intent_reply['emotion'],
        'intent': intent_reply['text'],
        'follow_id': follow_reply['id'],
        'follow_emo': follow_reply['emotion'],
        'follow': follow_reply['text'],
    })

# Publish the finished rows column-wise; overwriting keeps the cell idempotent.
for col in list(result_dict):
    result_dict[col] = [r[col] for r in result_rows]

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15212/15212 [03:13<00:00, 78.44it/s]


In [30]:
# Turn the column-wise result lists into a table (one row per test post) and display it.
result_df = pd.DataFrame.from_dict(result_dict)
result_df

Unnamed: 0,post_id,post,most_sim_post_id,most_sim_post_emo,most_sim_post,cos_sim,highest_degree_id,highest_degree,random_id,random,intent_id,intent_emo,intent,follow_id,follow_emo,follow
0,r2w43k,Do you ever feel like everyone else has it tog...,79pxlq,jealous,I feel like everyone else has their life more ...,0.761634,dp3wsyf,Good luck my dude,dp3vg2a,I feel like you have a more together life than...,dp3wsyf,wishing,Good luck my dude,dp3vowt,jealous,This lack of self acceptance I think also feed...
1,r2qqyz,I've finally figured out the answer to the que...,dce0wm,questioning,"So, what’re your favorite movies?",0.681865,f27kx8t,Hot Rod.,f27oy5w,My favorite is Tangled,f27l0mh,acknowledging,"Hard to pick one or rank them, because movies ...",f27kx8t,questioning,Hot Rod.
2,r2doqc,I cooked the entree for my Friendsgiving for t...,cfmhp5,proud,I cooked a real meal last night for the first ...,0.770251,eubc9si,Congratulations!,eub2smt,Homemade food always tastes better as well.,eubm8x1,agreeing,"Bamboo is also a good substitute for meat, the...",eucjcz4,proud,"Professional chef here,."
3,r2cedp,"At my job where I work with kids 5-13, I start...",omtovj,jealous,Last friday I found out my job don't mind men ...,0.583775,h5nqfyp,Good luck!,h5o2hge,It makes the nail polish easier to scrape off.,h5nuvm6,acknowledging,Enjoy your new fancy nails!,h5rkkub,jealous,Also it's a great way to bond with a female fr...
4,r1zcsu,Having enough food and water.,msfr6b,confident,Working out eating right.,0.541680,gusf87s,"I'm not perfect, but I think it's important to...",gusb66v,Well i personally believe that true friends un...,gusf87s,trusting,"I'm not perfect, but I think it's important to...",gusb66v,confident,Well i personally believe that true friends un...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15207,hit:12318_conv:24637_uttr:1,"I don't like scammy sales people, nothing good...",87s0d7,trusting,Beware of crooked salesman that come to your door,0.621139,dwf2g1h,Aint no one home.,dwf39we,I really worry about elderly people in these k...,dwfm3te,angry,Filter Queen.,dwf2g1h,lonely,Aint no one home.
15208,hit:12384_conv:24768_uttr:1,"I almost got caught stealing, I felt pretty bad",hit:12314_conv:24629_uttr:1,guilty,I stole from my friend and i feel pretty bad a...,0.776447,hit:12314_conv:24629_uttr:2,Why did you steal stuff?,hit:12314_conv:24629_uttr:2,Why did you steal stuff?,hit:12314_conv:24629_uttr:2,questioning,Why did you steal stuff?,hit:12314_conv:24629_uttr:2,questioning,Why did you steal stuff?
15209,hit:12390_conv:24781_uttr:1,Some random dude walked into my room while I w...,hp5jdk,surprised,I woke up this morning with a random guy in my...,0.738668,fxmhlg4,That sounds so scary!,fxmhlg4,That sounds so scary!,fxmhlg4,afraid,That sounds so scary!,fxmhlg4,afraid,That sounds so scary!
15210,hit:12398_conv:24797_uttr:1,I saw my favourite candy at the store today!,odqprf,grateful,I went to the store and I was given sweets for...,0.652065,h42oe43,Receiving something for free is always a nice ...,h423vd3,Walking in to the store?,h42oe43,surprised,Receiving something for free is always a nice ...,h42oe43,surprised,Receiving something for free is always a nice ...


In [31]:
# Persist the full retrieval table; the DataFrame index is not written to the file.
result_df.to_csv('../data/test/result/all/retrieval_result.csv', index = False)

# Generate Sample Responses

In [None]:
# N = 100
# sample_indices = np.random.choice(len(test_posts_indices), N, replace = False)

In [None]:
# The sample of test-post positions is loaded from disk rather than redrawn, so the
# sample stays the same between runs; the save call is kept commented out for provenance.
# np.save('../data/test/posts_indices_sample_idx.npy', sample_indices)
sample_indices = np.load('../data/test/posts_indices_sample_idx.npy')

In [None]:
# Spot check: the fourth sampled test post, followed by its retrieved nearest neighbour.
print(test_posts_df['text'].iloc[sample_indices[3]])
print(all_posts_df['text'].iloc[cos_sim_top_idx_orig[sample_indices[3]]])

In [None]:
# Output columns for the sampled run of rules 1 & 2 (highest degree / random),
# including the length of each chosen reply.
sample_result_cols = [
    'post_id', 'post',
    'most_sim_post_id', 'most_sim_post', 'cos_sim',
    'highest_degree', 'hd_len',
    'random', 'r_len',
]

# One independent, initially empty list per output column.
sample_result_dict = {col: [] for col in sample_result_cols}

## Rule 1&2: Pick Replies with Highest Degree & Pick Randomly

In [30]:
# Rules 1 & 2 on the sampled test posts: for each sampled post, retrieve its most
# similar candidate post, expand it to its cluster, and pick
#   * a reply with the highest degree (ties broken uniformly at random), and
#   * a uniformly random reply,
# recording the text and length of each.
#
# Changes compared with the previous version of this cell:
#   * Held-out test posts are dropped when a cluster is expanded, so replies to test
#     posts (possibly the query's own replies) cannot be retrieved.
#   * Rows are assembled completely and sample_result_dict is overwritten at the end:
#     columns always have equal length and re-running does not append duplicates.
#   * A query with no candidate replies keeps None in its reply columns instead of
#     raising IndexError.
#   * Degrees and the tie count are computed with vectorised pandas operations.
#
# NOTE(review): np.random is not seeded in this notebook, so the picks differ between runs.

# Row positions (in all_posts_df) of the held-out test posts.
held_out_posts = set(test_posts_indices.tolist())

sample_rows = []
for i in tqdm(sample_indices):
    query = test_posts_df.iloc[i]
    most_sim_post_idx = cos_sim_top_idx_orig[i]
    neighbour = all_posts_df.iloc[most_sim_post_idx]

    # Start from an all-None row so every column is present even if no reply is found.
    row = dict.fromkeys(sample_result_dict)
    row.update({
        'post_id': query['id'],
        'post': query['text'],
        'most_sim_post_id': neighbour['id'],
        'most_sim_post': neighbour['text'],
        'cos_sim': cos_sim[i, cos_sim_top_idx[i]],
    })
    sample_rows.append(row)

    # Expand to the cluster headed by the retrieved post, without held-out posts.
    cluster = all_posts_clusters_dict.get(most_sim_post_idx, [most_sim_post_idx])
    posts_idx = [p for p in cluster if p == most_sim_post_idx or p not in held_out_posts]

    posts_ids = all_posts_df.iloc[posts_idx]['id'].tolist()
    cand_replies_df = all_replies_df[all_replies_df['parent_id'].isin(posts_ids)].copy()
    if cand_replies_df.shape[0] == 0:
        # Nothing to retrieve for this query; its reply columns stay None.
        continue

    cand_replies_df['degree'] = cand_replies_df['id'].map(reply_degrees)
    cand_replies_df = cand_replies_df.sort_values(by = 'degree', ascending = False)

    # After the descending sort the top-degree ties are the leading rows.
    n_top = int((cand_replies_df['degree'] == cand_replies_df['degree'].iloc[0]).sum())
    hd_reply = cand_replies_df.iloc[np.random.choice(n_top, 1, replace = False)[0]]
    rand_reply = cand_replies_df.iloc[np.random.choice(cand_replies_df.shape[0], 1, replace = False)[0]]

    row.update({
        'highest_degree': hd_reply['text'],
        'hd_len': hd_reply['length'],
        'random': rand_reply['text'],
        'r_len': rand_reply['length'],
    })

# Publish the finished rows column-wise; overwriting keeps the cell idempotent.
for col in list(sample_result_dict):
    sample_result_dict[col] = [r[col] for r in sample_rows]

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 65.02it/s]


In [31]:
# Tabulate the sampled rule 1 & 2 results (one row per sampled post) and display them.
sample_result_df = pd.DataFrame.from_dict(sample_result_dict)
sample_result_df

Unnamed: 0,post_id,post,most_sim_post_id,most_sim_post,cos_sim,highest_degree,hd_len,random,r_len
0,ckl2l0,Just went running with Grieg,nfp4eq,Started running yesterday,0.511821,Great job!,3,"As someone who used to run, when you hit your ...",21
1,dks03b,Unhappily married,f87qut,"Not married, no kids, first thing you'd assume...",0.587652,Cos they want to?,5,Cos they want to?,5
2,8vu116,Tell me something interesting about yourself!,fxqt3e,Tell me something interesting about you!,0.966009,I have a pretty flexible thumb,6,I have a pretty flexible thumb,6
3,mszzsi,I’m leaving my toxic workplace for good,lt9nam,I've just resigned from perhaps the most toxic...,0.841367,Congrats!,2,I too quit a toxic work environment last week.,10
4,6pm6vr,If you pay bills or do anything responsible wi...,8nwedb,Any leftover money disappears.,0.494538,"Pay off my credit card debt, and whatever I ca...",16,Cobble together the best PC I can buy with 500...,12
...,...,...,...,...,...,...,...,...,...
95,f8azbk,Today I tried to learn snowboarding,ek97h2,Snowboarding beginner,0.659287,Good luck.,3,"Have fun with it, and remember when you fall: ...",15
96,61lyx9,can anyone make me feel better about myself?,97p21e,Can anyone make me feel better?,0.744640,I hope you feel better soon!,7,"And, its totally okay to be sad.",9
97,7nooyq,So my ex just put a picture that I took of him...,7nrcgw,Do you purposely post things on social media t...,0.599413,Got better things to do.,6,Is the ex in question on social media apparent...,13
98,ba9bs9,I just felt like I wanted to share.,fe9mmm,I just thought it would be fun to share,0.767613,It just makes me happy when I smell like the o...,17,Not sure what your ancestry or height has to d...,13


In [32]:
# Persist the sampled rule 1 & 2 table; the DataFrame index is not written to the file.
sample_result_df.to_csv('../data/test/result/retrieval_result_sample.csv', index = False)

## Rule 3&4: Based on Emotion/Intent Labels

In [21]:
# Labels counted as empathetic response intents (same list as in the
# "Generate All Responses" section; redefined so this section can run on its own).
emp_intents = [
    'agreeing',
    'acknowledging',
    'encouraging',
    'consoling',
    'sympathizing',
    'suggesting',
    'questioning',
    'wishing',
]

In [28]:
# Groups of labels treated as similar to one another (one group per line).
emo_groups = [
    ['prepared', 'confident', 'proud'],
    ['content', 'hopeful', 'anticipating'],
    ['joyful', 'excited'],
    ['caring'],
    ['faithful', 'trusting', 'grateful'],
    ['jealous', 'annoyed', 'angry', 'furious'],
    ['terrified', 'afraid', 'anxious', 'apprehensive'],
    ['disgusted'],
    ['ashamed', 'guilty', 'embarrassed'],
    ['devastated', 'sad', 'disappointed', 'nostalgic', 'lonely'],
    ['surprised'],
    ['impressed'],
    ['sentimental'],
    ['neutral'],
    ['agreeing', 'acknowledging'],
    ['encouraging'],
    ['consoling', 'sympathizing'],
    ['suggesting'],
    ['questioning'],
    ['wishing'],
]

# Map every label to the OTHER labels of its group (empty list for one-label groups).
emo_group_dict = {
    emo: [other for other in group if other != emo]
    for group in emo_groups
    for emo in group
}
emo_group_dict

{'prepared': ['confident', 'proud'],
 'confident': ['prepared', 'proud'],
 'proud': ['prepared', 'confident'],
 'content': ['hopeful', 'anticipating'],
 'hopeful': ['content', 'anticipating'],
 'anticipating': ['content', 'hopeful'],
 'joyful': ['excited'],
 'excited': ['joyful'],
 'caring': [],
 'faithful': ['trusting', 'grateful'],
 'trusting': ['faithful', 'grateful'],
 'grateful': ['faithful', 'trusting'],
 'jealous': ['annoyed', 'angry', 'furious'],
 'annoyed': ['jealous', 'angry', 'furious'],
 'angry': ['jealous', 'annoyed', 'furious'],
 'furious': ['jealous', 'annoyed', 'angry'],
 'terrified': ['afraid', 'anxious', 'apprehensive'],
 'afraid': ['terrified', 'anxious', 'apprehensive'],
 'anxious': ['terrified', 'afraid', 'apprehensive'],
 'apprehensive': ['terrified', 'afraid', 'anxious'],
 'disgusted': [],
 'ashamed': ['guilty', 'embarrassed'],
 'guilty': ['ashamed', 'embarrassed'],
 'embarrassed': ['ashamed', 'guilty'],
 'devastated': ['sad', 'disappointed', 'nostalgic', 'lonely

In [29]:
# Output columns for the sampled run of rules 3 & 4 (intent-based / emotion-following).
sample_result_cols = [
    'post_id', 'post',
    'most_sim_post_id', 'most_sim_post', 'most_sim_post_emo', 'cos_sim',
    'intent_emo', 'intent',
    'follow_emo', 'follow',
]

# One independent, initially empty list per output column.
sample_result_dict = {col: [] for col in sample_result_cols}

In [30]:
# Rules 3 & 4 on the sampled test posts: for each sampled post, retrieve its most
# similar candidate post, expand it to its cluster, and pick
#   * an "intent" reply: random among replies labeled with an empathetic intent,
#     falling back to any candidate reply, and
#   * a "follow" reply: random among replies with the retrieved post's label, then
#     among replies with a similar label, then among all candidates.
#
# Changes compared with the previous version of this cell:
#   * Held-out test posts are dropped when a cluster is expanded, so replies to test
#     posts (possibly the query's own replies) cannot be retrieved.
#   * Rows are assembled completely and sample_result_dict is overwritten at the end:
#     columns always have equal length and re-running does not append duplicates.
#   * A query with no candidate replies keeps None in its reply columns instead of
#     raising; a label missing from emo_group_dict is treated as having no similar
#     labels instead of raising KeyError.
#
# NOTE(review): np.random is not seeded in this notebook, so the picks differ between runs.

# Row positions (in all_posts_df) of the held-out test posts.
held_out_posts = set(test_posts_indices.tolist())

sample_rows = []
for i in tqdm(sample_indices):
    query = test_posts_df.iloc[i]
    most_sim_post_idx = cos_sim_top_idx_orig[i]
    neighbour = all_posts_df.iloc[most_sim_post_idx]
    most_sim_post_emo = neighbour['emotion']

    # Start from an all-None row so every column is present even if no reply is found.
    row = dict.fromkeys(sample_result_dict)
    row.update({
        'post_id': query['id'],
        'post': query['text'],
        'most_sim_post_id': neighbour['id'],
        'most_sim_post': neighbour['text'],
        'most_sim_post_emo': most_sim_post_emo,
        'cos_sim': cos_sim[i, cos_sim_top_idx[i]],
    })
    sample_rows.append(row)

    # Find other posts in the same cluster as the most similar post (held-out posts excluded)
    cluster = all_posts_clusters_dict.get(most_sim_post_idx, [most_sim_post_idx])
    posts_idx = [p for p in cluster if p == most_sim_post_idx or p not in held_out_posts]

    posts_ids = all_posts_df.iloc[posts_idx]['id'].tolist()
    cand_replies_df = all_replies_df[all_replies_df['parent_id'].isin(posts_ids)]
    if cand_replies_df.shape[0] == 0:
        # Nothing to retrieve for this query; its reply columns stay None.
        continue

    emotions = cand_replies_df['emotion']

    # Rule 3: prefer replies labeled with an empathetic intent.
    intent_pool = cand_replies_df[emotions.isin(emp_intents)]
    if intent_pool.shape[0] == 0:
        intent_pool = cand_replies_df
    intent_reply = intent_pool.iloc[np.random.choice(intent_pool.shape[0], 1, replace = False)[0]]

    # Rule 4: same label first, then a similar label, then anything.
    follow_pool = cand_replies_df[emotions == most_sim_post_emo]
    if follow_pool.shape[0] == 0:
        follow_pool = cand_replies_df[emotions.isin(emo_group_dict.get(most_sim_post_emo, []))]
    if follow_pool.shape[0] == 0:
        follow_pool = cand_replies_df
    follow_reply = follow_pool.iloc[np.random.choice(follow_pool.shape[0], 1, replace = False)[0]]

    row.update({
        'intent_emo': intent_reply['emotion'],
        'intent': intent_reply['text'],
        'follow_emo': follow_reply['emotion'],
        'follow': follow_reply['text'],
    })

# Publish the finished rows column-wise; overwriting keeps the cell idempotent.
for col in list(sample_result_dict):
    sample_result_dict[col] = [r[col] for r in sample_rows]

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 86.68it/s]


In [31]:
# Tabulate the sampled rule 3 & 4 results (one row per sampled post) and display them.
sample_result_df = pd.DataFrame.from_dict(sample_result_dict)
sample_result_df

Unnamed: 0,post_id,post,most_sim_post_id,most_sim_post,most_sim_post_emo,cos_sim,intent_emo,intent,follow_emo,follow
0,ckl2l0,Just went running with Grieg,nfp4eq,Started running yesterday,joyful,0.511821,consoling,Keep going.,nostalgic,"As someone who used to run, when you hit your ..."
1,dks03b,Unhappily married,f87qut,"Not married, no kids, first thing you'd assume...",neutral,0.587652,questioning,Cos they want to?,suggesting,Well the beds thing might be to get a good nig...
2,8vu116,Tell me something interesting about yourself!,fxqt3e,Tell me something interesting about you!,questioning,0.966009,confident,I have a pretty flexible thumb,confident,I have a pretty flexible thumb
3,mszzsi,I’m leaving my toxic workplace for good,lt9nam,I've just resigned from perhaps the most toxic...,disgusted,0.841367,wishing,I wish you luck in your future !,disgusted,"Toxic people, toxic work, toxic evironment, etc."
4,6pm6vr,If you pay bills or do anything responsible wi...,8nwedb,Any leftover money disappears.,neutral,0.494538,suggesting,Given the time constraint I'd probably head to...,content,And the remainder goes in my gas tank
...,...,...,...,...,...,...,...,...,...,...
95,f8azbk,Today I tried to learn snowboarding,ek97h2,Snowboarding beginner,confident,0.659287,wishing,Good luck.,wishing,Good luck.
96,61lyx9,can anyone make me feel better about myself?,97p21e,Can anyone make me feel better?,questioning,0.744640,acknowledging,Aww.,questioning,"Damn, a bad cold managed to get you borderline..."
97,7nooyq,So my ex just put a picture that I took of him...,7nrcgw,Do you purposely post things on social media t...,jealous,0.599413,suggesting,Got better things to do.,confident,I do post stuff on social media so everyone *e...
98,ba9bs9,I just felt like I wanted to share.,fe9mmm,I just thought it would be fun to share,encouraging,0.767613,questioning,Not sure what your ancestry or height has to d...,questioning,Not sure what your ancestry or height has to d...


In [32]:
# Persist the sampled rule 3 & 4 table; the DataFrame index is not written to the file.
sample_result_df.to_csv('../data/test/result/sample/retrieval_result_2.csv', index = False)

# Miscellaneous

In [23]:
# Back-of-the-envelope size, in GiB, of a 15212 x 120675 matrix (test posts x candidate
# posts, matching the shapes printed earlier) at 32 bits per entry: bits -> bytes -> GiB.
15212 * 120675 * 32 / 8 / 1024 ** 3

6.838545575737953

In [24]:
# Tiny scratch frame (built row by row) for checking pandas filtering behaviour.
a = pd.DataFrame([[1, 4], [2, 5], [3, 6]], columns = ['a', 'b'])
a

Unnamed: 0,a,b
0,1,4
1,2,5
2,3,6


In [25]:
# Sanity check: filtering with isin() on an empty list selects no rows (and does not raise).
a.loc[a['a'].isin([])]

Unnamed: 0,a,b
