In [1]:
import os
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
from collections import Counter
import pickle

In [2]:
all_posts_df = pd.read_csv('../data/merged_q/all_posts_max_len_40.csv')
all_replies_df = pd.read_csv('../data/merged_q/all_replies_max_len_40.csv')

In [3]:
print('Avg speaker len:', np.mean(all_posts_df['length'].values))

Avg speaker len: 11.687843856431753


In [4]:
print('Avg listener len:', np.mean(all_replies_df['length'].values))

Avg listener len: 11.202826707678367


In [4]:
all_posts_df.head()

Unnamed: 0,id,summarized,text,root,length
0,r2wyka,0,Can you learn to be less afraid?,learn,8
1,r2wc2q,0,"Why doesn't Activia use the word ""yogurt"" on t...",use,14
2,r2w43k,0,Do you ever feel like everyone else has it tog...,feel,13
3,r2vbm3,0,Does pressing keyboard keys reflect your perso...,reflect,8
4,r2v7x0,0,My boss told me that during the time my hair w...,make,21


In [5]:
all_replies_df.head()

Unnamed: 0,id,parent_id,summarized,text,length
0,hm5iqwj,r2nmvy,1,Conscious Club is campy in a good way,8
1,hm5jftd,r2npa3,0,I take one to two hours a day where I listen t...,24
2,hm5kaqv,r2nmvy,0,I like Scary Pockets.,5
3,hm5kddu,r2nylj,0,what keeps me calm is my prescription medication,8
4,hm5kig9,r2nylj,1,Wow you're amazingly talented.,6


In [6]:
print(all_posts_df.shape, all_replies_df.shape)

(152680, 5) (838785, 5)


In [7]:
with open('../data/merged_q/all_posts_max_len_40_clusters/all_posts_clusters.pickle', 'rb') as f:
    all_posts_clusters = pickle.load(f)
print(len(all_posts_clusters[0.85]))

6902


In [8]:
with open('../data/merged_q/all_replies_max_len_40_clusters/all_replies_clusters_combine_1_centroid.pickle', 'rb') as f:
    all_replies_clusters = pickle.load(f)
print(len(all_replies_clusters[0.8]))

35542


In [9]:
num = 0
a = []
for c in all_replies_clusters[0.8]:
    num += len(c)
    a += c
print(num, len(set(a)))
print(all_replies_df.shape[0] - num + len(all_replies_clusters[0.8]))

207740 207740
666587


In [10]:
num = 0
a = []
for c in all_posts_clusters[0.85]:
    num += len(c)
    a += c
print(num, len(set(a)))
print(all_posts_df.shape[0] - num + len(all_posts_clusters[0.85]))

25521 25521
134061


In [None]:
print('Number of post nodes: 134061')
print('Number of reply nodes: 666587')

In [14]:
post_idx_to_cluster_id = {}
for i, c in enumerate(all_posts_clusters[0.85]):
    for idx in c:
        post_idx_to_cluster_id[idx] = i
print(len(post_idx_to_cluster_id))

cluster_id = len(all_posts_clusters[0.85])
for idx in range(all_posts_df.shape[0]):
    if idx not in post_idx_to_cluster_id:
        post_idx_to_cluster_id[idx] = cluster_id
        cluster_id += 1
print(len(post_idx_to_cluster_id), cluster_id)

25521
152680 134061


In [15]:
reply_idx_to_cluster_id = {}
for i, c in enumerate(all_replies_clusters[0.8]):
    for idx in c:
        reply_idx_to_cluster_id[idx] = i
print(len(reply_idx_to_cluster_id))

cluster_id = len(all_replies_clusters[0.8])
for idx in range(all_replies_df.shape[0]):
    if idx not in reply_idx_to_cluster_id:
        reply_idx_to_cluster_id[idx] = cluster_id
        cluster_id += 1
print(len(reply_idx_to_cluster_id), cluster_id)

207740
838785 666587


In [16]:
post_id_to_post_idx = {}
for idx in range(all_posts_df.shape[0]):
    post_id = all_posts_df.iloc[idx]['id']
    post_id_to_post_idx[post_id] = idx

In [17]:
adj = np.zeros((134061, 666587), dtype=bool)

In [18]:
for reply_idx in tqdm(range(all_replies_df.shape[0])):
    post_id = all_replies_df.iloc[reply_idx]['parent_id']
    post_idx = post_id_to_post_idx[post_id]
    adj[post_idx_to_cluster_id[post_idx], reply_idx_to_cluster_id[reply_idx]] = 1

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 838785/838785 [00:51<00:00, 16299.44it/s]


In [19]:
print(np.sum(adj))

804700


In [21]:
print('Avg degree of speaker nodes:', np.mean(np.sum(adj, axis = 1)))

Avg degree of speaker nodes: 6.002491403167215


In [22]:
print('Avg degree of listener nodes:', np.mean(np.sum(adj, axis = 0)))

Avg degree of listener nodes: 1.207194259714036


# Profanity Check

In [1]:
import os
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
import pickle
from profanity_check import predict, predict_prob



In [2]:
all_posts_df = pd.read_csv('../data/merged_q/all_posts_max_len_40.csv')
all_replies_df = pd.read_csv('../data/merged_q/all_replies_max_len_40.csv')

In [3]:
profanity_idx_speaker = []
for i in tqdm(range(all_posts_df.shape[0])):
    profanity = predict([all_posts_df.iloc[i]['text']])
    if profanity == 1:
        profanity_idx_speaker.append(i)
print(len(profanity_idx_speaker), all_posts_df.shape[0])

100%|██████████| 152680/152680 [02:15<00:00, 1127.82it/s]

2526 152680





In [4]:
np.save('../data/merged_q/profanity_check/profanity_idx_speaker.npy', np.array(profanity_idx_speaker))

In [5]:
all_posts_df.iloc[profanity_idx_speaker]

Unnamed: 0,id,summarized,text,root,length
37,r20yh9,0,I suck at talking,suck,4
51,r1tw9i,0,Seasonal depression is kicking my ass,kick,6
62,r1ocla,0,"Holy shit, I’m going to therapy for the first ...",go,15
180,qz47mf,1,Now I feel stupid,feel,4
208,qync94,1,I’m a full ass grown adult and moved back home...,be,24
...,...,...,...,...,...
151332,hit:11928_conv:23856_uttr:1,0,"... starving once again, this feeling sucks, i...",get,25
151382,hit:12165_conv:24330_uttr:1,0,My neighbor sits out on his balcony butt naked...,sit,11
151592,hit:2150_conv:4301_uttr:1,1,I always say i hate when other moms shame each...,say,15
151604,hit:2542_conv:5084_uttr:1,0,My aggressive pet rat attacked the baby rat an...,attack,18


In [6]:
profanity_idx_listener = []
for i in tqdm(range(all_replies_df.shape[0])):
    profanity = predict([all_replies_df.iloc[i]['text']])
    if profanity == 1:
        profanity_idx_listener.append(i)
print(len(profanity_idx_listener), all_replies_df.shape[0])

100%|██████████| 838785/838785 [12:17<00:00, 1136.88it/s]

22024 838785





In [7]:
np.save('../data/merged_q/profanity_check/profanity_idx_listener.npy', np.array(profanity_idx_listener))

In [7]:
22024/838785

0.026257026532424878

In [8]:
2526/152680

0.01654440660204349

## Calculate New Nodes and Edges

In [1]:
import os
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
import pickle

In [2]:
all_posts_df = pd.read_csv('../data/merged_q/all_posts_max_len_40.csv')
all_replies_df = pd.read_csv('../data/merged_q/all_replies_max_len_40.csv')

In [3]:
profanity_idx_speaker = np.load('../data/merged_q/profanity_check/profanity_idx_speaker.npy')
profanity_idx_listener = np.load('../data/merged_q/profanity_check/profanity_idx_listener.npy')

In [None]:
# cache = set(profanity_idx_speaker)
# post_idx_filtered = [i for i in range(all_posts_df.shape[0]) if i not in cache]
# print(len(post_idx_filtered))

In [4]:
cache = set(profanity_idx_listener)
profane_speaker_id = set([all_posts_df.iloc[i]['id'] for i in profanity_idx_speaker])
reply_idx_filtered = []
for i in tqdm(range(all_replies_df.shape[0])):
    if i in cache or all_replies_df.iloc[i]['parent_id'] in profane_speaker_id:
        continue
    reply_idx_filtered.append(i)
print(len(reply_idx_filtered))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 838785/838785 [00:49<00:00, 16851.54it/s]

803320





In [5]:
cache = set(profanity_idx_speaker)
good_speaker_id = set(all_replies_df.iloc[reply_idx_filtered]['parent_id'].tolist())
post_idx_filtered = []
for i in tqdm(range(all_posts_df.shape[0])):
    if i in cache:
        continue
    if all_posts_df.iloc[i]['id'] not in good_speaker_id:
        continue
    post_idx_filtered.append(i)
print(len(post_idx_filtered))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 152680/152680 [00:09<00:00, 16550.87it/s]

149332





In [None]:
# np.save('../data/merged_q/profanity_check/post_idx_filtered.npy', np.array(post_idx_filtered))
# np.save('../data/merged_q/profanity_check/reply_idx_filtered.npy', np.array(reply_idx_filtered))

In [None]:
# post_df_filtered = all_posts_df.iloc[post_idx_filtered]
# post_df_filtered

In [None]:
# reply_df_filtered = all_replies_df.iloc[reply_idx_filtered]
# reply_df_filtered

In [6]:
with open('../data/merged_q/all_posts_max_len_40_clusters/all_posts_clusters.pickle', 'rb') as f:
    all_posts_clusters = pickle.load(f)
print(len(all_posts_clusters[0.85]))

6902


In [7]:
with open('../data/merged_q/all_replies_max_len_40_clusters/all_replies_clusters_combine_1_centroid.pickle', 'rb') as f:
    all_replies_clusters = pickle.load(f)
print(len(all_replies_clusters[0.8]))

35542


In [8]:
post_cache = set(post_idx_filtered)
post_clusters = []
for c in tqdm(all_posts_clusters[0.85]):
    cc = [idx for idx in c if idx in post_cache]
    if len(cc) > 0:
        post_clusters.append(cc)
print(len(post_clusters))

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6902/6902 [00:00<00:00, 988698.30it/s]

6842





In [9]:
reply_cache = set(reply_idx_filtered)
reply_clusters = []
for c in tqdm(all_replies_clusters[0.8]):
    cc = [idx for idx in c if idx in reply_cache]
    if len(cc) > 0:
        reply_clusters.append(cc)
print(len(reply_clusters))

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 35542/35542 [00:00<00:00, 548459.20it/s]

35067





In [10]:
post_idx_to_cluster_id = {}
for i, c in enumerate(post_clusters):
    for idx in c:
        post_idx_to_cluster_id[idx] = i

cluster_id = len(post_clusters)
for idx in range(all_posts_df.shape[0]):
    if idx not in post_cache:
        continue
    if idx not in post_idx_to_cluster_id:
        post_idx_to_cluster_id[idx] = cluster_id
        cluster_id += 1
print(cluster_id, '<-', len(post_idx_to_cluster_id), len(post_cache))

131038 <- 149332 149332


In [11]:
print(149332 - sum(len(c) for c in post_clusters) + len(post_clusters))

131038


In [12]:
reply_idx_to_cluster_id = {}
for i, c in enumerate(reply_clusters):
    for idx in c:
        reply_idx_to_cluster_id[idx] = i

cluster_id = len(reply_clusters)
for idx in range(all_replies_df.shape[0]):
    if idx not in reply_cache:
        continue
    if idx not in reply_idx_to_cluster_id:
        reply_idx_to_cluster_id[idx] = cluster_id
        cluster_id += 1
print(cluster_id, '<-', len(reply_idx_to_cluster_id), len(reply_cache))

637628 <- 803320 803320


In [13]:
print(803320 - sum(len(c) for c in reply_clusters) + len(reply_clusters))

637628


In [14]:
post_id_to_post_idx = {}
for idx in tqdm(range(all_posts_df.shape[0])):
    if idx not in post_cache:
        continue
    post_id = all_posts_df.iloc[idx]['id']
    post_id_to_post_idx[post_id] = idx

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 152680/152680 [00:09<00:00, 16714.67it/s]


In [15]:
adj = np.zeros((131038, 637628), dtype=bool)

In [16]:
for reply_idx in tqdm(range(all_replies_df.shape[0])):
    if reply_idx not in reply_cache:
        continue
    post_id = all_replies_df.iloc[reply_idx]['parent_id']
    post_idx = post_id_to_post_idx[post_id]
    adj[post_idx_to_cluster_id[post_idx], reply_idx_to_cluster_id[reply_idx]] = 1

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 838785/838785 [00:50<00:00, 16620.62it/s]


In [17]:
print('Number of edges:', np.sum(adj))

Number of edges: 770192


In [18]:
print('Avg degree of speaker nodes:', np.mean(np.sum(adj, axis = 1)))

Avg degree of speaker nodes: 5.87762328484867


In [19]:
print('Avg degree of listener nodes:', np.mean(np.sum(adj, axis = 0)))

Avg degree of listener nodes: 1.2079017859943415


In [None]:
print('Number of speaker nodes: 131038')
print('Number of listener nodes: 637628')