In [1]:
import os
import pickle
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
all_posts_df = pd.read_csv('../data/merged_q/all_posts_max_len_40.csv')
all_replies_df = pd.read_csv('../data/merged_q/all_replies_max_len_40.csv')
print('all_posts_df shape = {}'.format(all_posts_df.shape))
print('all_replies_df shape = {}'.format(all_replies_df.shape))

all_posts_df shape = (152680, 5)
all_replies_df shape = (838785, 5)


In [3]:
all_posts_embed = np.load('../data/merged_q/all_posts_max_len_40_embed.npy')
all_replies_embed = np.load('../data/merged_q/all_replies_max_len_40_embed.npy')
print('all_posts_embed shape = {}'.format(all_posts_embed.shape))
print('all_replies_embed shape = {}'.format(all_replies_embed.shape))

all_posts_embed shape = (152680, 768)
all_replies_embed shape = (838785, 768)


## Select out ED Testing Data

In [4]:
all_posts_df

Unnamed: 0,id,summarized,text,root,length
0,r2wyka,0,Can you learn to be less afraid?,learn,8
1,r2wc2q,0,"Why doesn't Activia use the word ""yogurt"" on t...",use,14
2,r2w43k,0,Do you ever feel like everyone else has it tog...,feel,13
3,r2vbm3,0,Does pressing keyboard keys reflect your perso...,reflect,8
4,r2v7x0,0,My boss told me that during the time my hair w...,make,21
...,...,...,...,...,...
152675,hit:12318_conv:24637_uttr:1,0,"I don't like scammy sales people, nothing good...",come,13
152676,hit:12384_conv:24768_uttr:1,0,"I almost got caught stealing, I felt pretty bad",feel,10
152677,hit:12390_conv:24781_uttr:1,0,Some random dude walked into my room while I w...,walk,19
152678,hit:12398_conv:24797_uttr:1,1,I saw my favourite candy at the store today!,see,10


In [5]:
ed_test_df = pd.read_csv('../data/ed/raw/test.csv')
ed_test_df

Unnamed: 0,conv_id,utterance_idx,context,prompt,speaker_idx,utterance,selfeval,tags
0,hit:0_conv:0,1,guilty,I felt guilty when I was driving home one nigh...,0,Yeah about 10 years ago I had a horrifying exp...,2|2|5_5|5|5,
1,hit:0_conv:0,2,guilty,I felt guilty when I was driving home one nigh...,1,Did you suffer any injuries?,2|2|5_5|5|5,
2,hit:0_conv:0,3,guilty,I felt guilty when I was driving home one nigh...,0,No I wasn't hit. It turned out they were drunk...,2|2|5_5|5|5,
3,hit:0_conv:0,4,guilty,I felt guilty when I was driving home one nigh...,1,Why did you feel guilty? People really shouldn...,2|2|5_5|5|5,
4,hit:0_conv:0,5,guilty,I felt guilty when I was driving home one nigh...,0,I don't know I was new to driving and hadn't e...,2|2|5_5|5|5,
...,...,...,...,...,...,...,...,...
10864,hit:12416_conv:24832,4,disgusted,I saw a huge cockroach outside my house today....,46,I live in Texas to so i know those feels,5|5|5_4|3|4,
10865,hit:12423_conv:24847,1,anxious,I have a big test on Monday. I am so nervous_c...,481,I have a big test on Monday_comma_ I am so ner...,5|5|5_5|5|5,
10866,hit:12423_conv:24847,2,anxious,I have a big test on Monday. I am so nervous_c...,375,What is the test on?,5|5|5_5|5|5,
10867,hit:12423_conv:24847,3,anxious,I have a big test on Monday. I am so nervous_c...,481,It's for my Chemistry class. I haven't slept m...,5|5|5_5|5|5,


In [6]:
ed_test_ids = [x + '_uttr:1' for x in ed_test_df['conv_id'].tolist()]
ed_test_posts_df = all_posts_df[all_posts_df['id'].isin(ed_test_ids)]
ed_test_posts_df

Unnamed: 0,id,summarized,text,root,length
151405,hit:40_conv:81_uttr:1,0,I couldn't wait to go to the concert.,wait,10
151406,hit:104_conv:208_uttr:1,0,i just moved to this neighborhood and some dum...,move,21
151407,hit:136_conv:273_uttr:1,1,So yeah i drove 2 hours to get to wallmart.,drive,11
151408,hit:142_conv:285_uttr:1,1,My friend's boyfriend recently made a pass at me.,make,11
151409,hit:143_conv:286_uttr:1,1,my baby is sleeping.,sleep,5
...,...,...,...,...,...
152675,hit:12318_conv:24637_uttr:1,0,"I don't like scammy sales people, nothing good...",come,13
152676,hit:12384_conv:24768_uttr:1,0,"I almost got caught stealing, I felt pretty bad",feel,10
152677,hit:12390_conv:24781_uttr:1,0,Some random dude walked into my room while I w...,walk,19
152678,hit:12398_conv:24797_uttr:1,1,I saw my favourite candy at the store today!,see,10


In [7]:
ed_test_posts_ids = ed_test_posts_df['id'].tolist()
print(ed_test_posts_ids[:10])
print('len(ed_test_posts_ids) = {}'.format(len(ed_test_posts_ids)))

['hit:40_conv:81_uttr:1', 'hit:104_conv:208_uttr:1', 'hit:136_conv:273_uttr:1', 'hit:142_conv:285_uttr:1', 'hit:143_conv:286_uttr:1', 'hit:147_conv:294_uttr:1', 'hit:171_conv:342_uttr:1', 'hit:173_conv:346_uttr:1', 'hit:175_conv:351_uttr:1', 'hit:207_conv:414_uttr:1']
len(ed_test_posts_ids) = 1275


## Select out 10% of Reddit

In [8]:
all_reddit_posts_df = all_posts_df[~all_posts_df['id'].str.startswith('hit:')]
all_reddit_posts_df

Unnamed: 0,id,summarized,text,root,length
0,r2wyka,0,Can you learn to be less afraid?,learn,8
1,r2wc2q,0,"Why doesn't Activia use the word ""yogurt"" on t...",use,14
2,r2w43k,0,Do you ever feel like everyone else has it tog...,feel,13
3,r2vbm3,0,Does pressing keyboard keys reflect your perso...,reflect,8
4,r2v7x0,0,My boss told me that during the time my hair w...,make,21
...,...,...,...,...,...
139373,50mj2d,0,I kind of hate today's rap music,hate,8
139374,4ze20k,1,"I can't watch shows like ""The Office"" because ...",watch,19
139375,4ypvyx,1,"For me, I usually get a small thing of regular...",get,33
139376,4vkgnz,0,Tell me what to draw!,tell,6


In [9]:
N = all_reddit_posts_df.shape[0]
N_test = N // 10
test_indices = np.random.choice(N, N_test, replace = False)
print('len(test_indices) = {}'.format(len(test_indices)))

len(test_indices) = 13937


In [10]:
reddit_test_posts_df = all_reddit_posts_df.iloc[test_indices]
reddit_test_posts_df

Unnamed: 0,id,summarized,text,root,length
76609,aq3i5j,1,I was just told that I apparently give really ...,tell,13
22839,igtkhr,0,Does anyone want to argue about nonsense for fun?,want,10
47602,evisbc,1,Just wanted to see if anyone would wanna play ...,want,17
116385,7d547p,0,How do you overcome stage fright ?,overcome,7
115195,7g2yvn,0,cracking phone screens,crack,3
...,...,...,...,...,...
139248,5ln2kk,0,Today I was told by a boy in my class that my ...,tell,39
95651,8wxlqx,0,I got kicked out of a club today,kick,8
122856,6zv1bf,0,Gaming pc recommendations?,game,4
58973,d65640,0,I just missed an opportunity to kiss a girl.,miss,10


In [11]:
reddit_test_posts_ids = reddit_test_posts_df['id'].tolist()
print(reddit_test_posts_ids[:10])
print('len(reddit_test_posts_ids) = {}'.format(len(reddit_test_posts_ids)))

['aq3i5j', 'igtkhr', 'evisbc', '7d547p', '7g2yvn', 'outioh', '6vl72e', 'a7sg8w', 'a4kskn', 'cnmomt']
len(reddit_test_posts_ids) = 13937


## Combine

In [12]:
rows = test_indices.tolist() + list(range(151405, 152680))
print('total number of testing points: {}'.format(len(rows)))

total number of testing points: 15212


In [13]:
np.save('../data/test/posts_indices.npy', np.array(rows))

## Create Test DF

In [14]:
test_posts_df = all_posts_df.iloc[rows]
test_posts_df

Unnamed: 0,id,summarized,text,root,length
76609,aq3i5j,1,I was just told that I apparently give really ...,tell,13
22839,igtkhr,0,Does anyone want to argue about nonsense for fun?,want,10
47602,evisbc,1,Just wanted to see if anyone would wanna play ...,want,17
116385,7d547p,0,How do you overcome stage fright ?,overcome,7
115195,7g2yvn,0,cracking phone screens,crack,3
...,...,...,...,...,...
152675,hit:12318_conv:24637_uttr:1,0,"I don't like scammy sales people, nothing good...",come,13
152676,hit:12384_conv:24768_uttr:1,0,"I almost got caught stealing, I felt pretty bad",feel,10
152677,hit:12390_conv:24781_uttr:1,0,Some random dude walked into my room while I w...,walk,19
152678,hit:12398_conv:24797_uttr:1,1,I saw my favourite candy at the store today!,see,10


In [15]:
test_posts_df.to_csv('../data/test/posts.csv', index = False)

In [17]:
test_replies_df = pd.DataFrame()

In [18]:
for post_id in tqdm(test_posts_df['id'].tolist()):
    df = all_replies_df[all_replies_df['parent_id'] == post_id]
    assert df.shape[0] > 0
    test_replies_df = pd.concat([test_replies_df, df])

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15212/15212 [09:32<00:00, 26.57it/s]


In [19]:
test_replies_df

Unnamed: 0,id,parent_id,summarized,text,length
439033,egd6ti7,aq3i5j,1,Thanks.,2
439034,egd78yr,aq3i5j,1,Boundaries advice i guess?,5
439043,egda9u6,aq3i5j,1,Practically never start.,4
439098,ege6pa6,aq3i5j,1,How to not feel inferior?,6
141011,g2vwz9z,igtkhr,1,Why not paint them white or a bright yellow?,10
...,...,...,...,...,...
838780,hit:12318_conv:24637_uttr:2,hit:12318_conv:24637_uttr:1,0,"I agree, were you recently screwed by one?",10
838781,hit:12384_conv:24768_uttr:2,hit:12384_conv:24768_uttr:1,1,Ouch!,2
838782,hit:12390_conv:24781_uttr:2,hit:12390_conv:24781_uttr:1,1,Oh wow that is weird.,6
838783,hit:12398_conv:24797_uttr:2,hit:12398_conv:24797_uttr:1,1,Oh nice.,3


In [20]:
test_replies_df.to_csv('../data/test/replies.csv', index = False)

In [21]:
all_replies_ids = all_replies_df['id'].tolist()
test_replies_ids = []
for reply_id in tqdm(test_replies_df['id'].tolist()):
    test_replies_ids.append(all_replies_ids.index(reply_id))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 82284/82284 [06:09<00:00, 222.82it/s]


In [22]:
print(len(test_replies_ids), len(set(test_replies_ids)))

82284 82284


In [23]:
np.save('../data/test/replies_indices.npy', np.array(test_replies_ids))

## Sort

In [4]:
post_indices = np.load('../data/test/post_indices.npy')
post_indices = np.sort(post_indices)
print(post_indices.shape)

(15212,)


In [5]:
test_posts_df = all_posts_df.iloc[post_indices]
test_posts_df

Unnamed: 0,id,summarized,text,root,length
2,r2w43k,0,Do you ever feel like everyone else has it tog...,feel,13
16,r2qqyz,0,I've finally figured out the answer to the que...,figure,21
28,r2doqc,0,I cooked the entree for my Friendsgiving for t...,cook,14
30,r2cedp,1,"At my job where I work with kids 5-13, I start...",start,23
42,r1zcsu,1,Having enough food and water.,have,6
...,...,...,...,...,...
152675,hit:12318_conv:24637_uttr:1,0,"I don't like scammy sales people, nothing good...",come,13
152676,hit:12384_conv:24768_uttr:1,0,"I almost got caught stealing, I felt pretty bad",feel,10
152677,hit:12390_conv:24781_uttr:1,0,Some random dude walked into my room while I w...,walk,19
152678,hit:12398_conv:24797_uttr:1,1,I saw my favourite candy at the store today!,see,10


In [6]:
np.save('../data/test/posts_indices.npy', post_indices)
test_posts_df.to_csv('../data/test/posts.csv', index = False)

-----

In [7]:
reply_indices = np.load('../data/test/reply_indices.npy')
reply_indices = np.sort(reply_indices)
print(reply_indices.shape)

(82284,)


In [8]:
test_replies_df = all_replies_df.iloc[reply_indices]
test_replies_df

Unnamed: 0,id,parent_id,summarized,text,length
16,hm61y7k,r2qqyz,0,"Man, I wouldn't know where to get started.",11
17,hm63dh1,r2qqyz,1,"Having said that, The Nice Guys would always b...",14
61,hm70wwb,r2w43k,1,I hope this kinda helps.,6
68,hm7608z,r2w43k,1,One of the gifts of middle age is that I have ...,18
120,hm49qhc,r2doqc,1,Happy Thanksgiving :),3
...,...,...,...,...,...
838780,hit:12318_conv:24637_uttr:2,hit:12318_conv:24637_uttr:1,0,"I agree, were you recently screwed by one?",10
838781,hit:12384_conv:24768_uttr:2,hit:12384_conv:24768_uttr:1,1,Ouch!,2
838782,hit:12390_conv:24781_uttr:2,hit:12390_conv:24781_uttr:1,1,Oh wow that is weird.,6
838783,hit:12398_conv:24797_uttr:2,hit:12398_conv:24797_uttr:1,1,Oh nice.,3


In [9]:
np.save('../data/test/replies_indices.npy', reply_indices)
test_replies_df.to_csv('../data/test/replies.csv', index = False)

# Generate Test CSV with 41 Labels

In [2]:
all_posts_df = pd.read_csv('../data/merged_q/all_posts_max_len_40_labeled.csv')
all_replies_df = pd.read_csv('../data/merged_q/all_replies_max_len_40_labeled.csv')
print('all_posts_df shape = {}'.format(all_posts_df.shape))
print('all_replies_df shape = {}'.format(all_replies_df.shape))

all_posts_df shape = (152680, 6)
all_replies_df shape = (838785, 6)


In [4]:
post_indices = np.load('../data/test/posts_indices.npy')
print(post_indices.shape)

(15212,)


In [5]:
test_posts_df = all_posts_df.iloc[post_indices]
test_posts_df

Unnamed: 0,id,summarized,text,root,length,emotion
2,r2w43k,0,Do you ever feel like everyone else has it tog...,feel,13,questioning
16,r2qqyz,0,I've finally figured out the answer to the que...,figure,21,questioning
28,r2doqc,0,I cooked the entree for my Friendsgiving for t...,cook,14,proud
30,r2cedp,1,"At my job where I work with kids 5-13, I start...",start,23,prepared
42,r1zcsu,1,Having enough food and water.,have,6,prepared
...,...,...,...,...,...,...
152675,hit:12318_conv:24637_uttr:1,0,"I don't like scammy sales people, nothing good...",come,13,trusting
152676,hit:12384_conv:24768_uttr:1,0,"I almost got caught stealing, I felt pretty bad",feel,10,guilty
152677,hit:12390_conv:24781_uttr:1,0,Some random dude walked into my room while I w...,walk,19,surprised
152678,hit:12398_conv:24797_uttr:1,1,I saw my favourite candy at the store today!,see,10,excited


In [6]:
test_posts_df.to_csv('../data/test/posts_labeled_41.csv', index = False)

---

In [7]:
reply_indices = np.load('../data/test/replies_indices.npy')
print(reply_indices.shape)

(82284,)


In [8]:
test_replies_df = all_replies_df.iloc[reply_indices]
test_replies_df

Unnamed: 0,id,parent_id,summarized,text,length,emotion
16,hm61y7k,r2qqyz,0,"Man, I wouldn't know where to get started.",11,acknowledging
17,hm63dh1,r2qqyz,1,"Having said that, The Nice Guys would always b...",14,agreeing
61,hm70wwb,r2w43k,1,I hope this kinda helps.,6,encouraging
68,hm7608z,r2w43k,1,One of the gifts of middle age is that I have ...,18,jealous
120,hm49qhc,r2doqc,1,Happy Thanksgiving :),3,wishing
...,...,...,...,...,...,...
838780,hit:12318_conv:24637_uttr:2,hit:12318_conv:24637_uttr:1,0,"I agree, were you recently screwed by one?",10,questioning
838781,hit:12384_conv:24768_uttr:2,hit:12384_conv:24768_uttr:1,1,Ouch!,2,disgusted
838782,hit:12390_conv:24781_uttr:2,hit:12390_conv:24781_uttr:1,1,Oh wow that is weird.,6,surprised
838783,hit:12398_conv:24797_uttr:2,hit:12398_conv:24797_uttr:1,1,Oh nice.,3,acknowledging


In [9]:
test_replies_df.to_csv('../data/test/replies_labeled_41.csv', index = False)