In [1]:
import os
import csv
import time
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util
# import networkx as nx
# from pyvis.network import Network

In [2]:
# model = SentenceTransformer('all-MiniLM-L6-v2')
model = SentenceTransformer('all-mpnet-base-v2')

# Read Data and Encode

In [3]:
all_posts_df = pd.read_csv('../data/merged_q/all_posts_max_len_40.csv')
all_replies_df = pd.read_csv('../data/merged_q/all_replies_max_len_40.csv')
print('# Posts: {}, # Replies: {}'.format(all_posts_df.shape[0], all_replies_df.shape[0]))

# Posts: 152680, # Replies: 838785


In [4]:
all_posts_text = all_posts_df['text'].tolist()
all_replies_text = all_replies_df['text'].tolist()

In [5]:
all_posts_embed = model.encode(all_posts_text, batch_size = 128, show_progress_bar = True, convert_to_numpy = True)

Batches:   0%|          | 0/1193 [00:00<?, ?it/s]

In [6]:
all_posts_embed.shape

(152680, 768)

In [7]:
all_replies_embed = model.encode(all_replies_text, batch_size = 128, show_progress_bar = True, convert_to_numpy = True)

Batches:   0%|          | 0/6554 [00:00<?, ?it/s]

In [8]:
all_replies_embed.shape

(838785, 768)

In [9]:
np.save('../data/merged_q/all_posts_max_len_40_embed.npy', all_posts_embed)
np.save('../data/merged_q/all_replies_max_len_40_embed.npy', all_replies_embed)

# Clustering

In [7]:
def community_detection(embeddings, threshold=0.75, min_community_size=10, init_max_size=1000):
    """
    Function for Fast Community Detection
    Finds in the embeddings all communities, i.e. embeddings that are close (closer than threshold).
    Returns only communities that are larger than min_community_size. The communities are returned
    in decreasing order. The first element in each list is the central point in the community.
    """

    # Compute cosine similarity scores
    cos_scores = util.pytorch_cos_sim(embeddings, embeddings)

    # Minimum size for a community
    top_k_values, _ = cos_scores.topk(k=min_community_size, largest=True)

    # Filter for rows >= min_threshold
    extracted_communities = []
    for i in range(len(top_k_values)):
        if top_k_values[i][-1] >= threshold:
            new_cluster = []

            # Only check top k most similar entries
            top_val_large, top_idx_large = cos_scores[i].topk(k=init_max_size, largest=True)
            top_idx_large = top_idx_large.tolist()
            top_val_large = top_val_large.tolist()

            if top_val_large[-1] < threshold:
                for idx, val in zip(top_idx_large, top_val_large):
                    if val < threshold:
                        break

                    new_cluster.append(idx)
            else:
                # Iterate over all entries (slow)
                for idx, val in enumerate(cos_scores[i].tolist()):
                    if val >= threshold:
                        new_cluster.append(idx)

            extracted_communities.append(new_cluster)

    # Largest cluster first
    extracted_communities = sorted(extracted_communities, key=lambda x: len(x), reverse=True)

    # Step 2) Remove overlapping communities
    unique_communities = []
    extracted_ids = set()

    for community in extracted_communities:
        add_cluster = True
        for idx in community:
            if idx in extracted_ids:
                add_cluster = False
                break

        if add_cluster:
            unique_communities.append(community)
            for idx in community:
                extracted_ids.add(idx)

    return unique_communities

In [10]:
# all_events_clusters = community_detection(all_events_embed, min_community_size=2, threshold=0.8)
all_posts_clusters = util.community_detection(all_posts_embed, min_community_size=2, threshold=0.8)

In [11]:
# with open('all_events_clusters.pickle', 'rb') as f:
#     all_events_clusters = pickle.load(f)
with open('../data/merged_q/all_posts_max_len_10_clusters.pickle', 'wb') as f:
    pickle.dump(all_posts_clusters, f)

In [13]:
len(all_posts_clusters)

4786

In [10]:
# for i, cluster in enumerate(all_events_clusters):
#     print("\nCluster {}, #{} Elements ".format(i+1, len(cluster)))
#     for j in cluster:
#         print("\t", all_events_df.iloc[j]['text'])

In [12]:
# all_replies_clusters = community_detection(all_replies_embed, min_community_size=2, threshold=0.8)

In [11]:
with open('all_replies_clusters.pickle', 'rb') as f:
    all_replies_clusters = pickle.load(f)

In [12]:
len(all_replies_clusters)

8316

In [13]:
# for i, cluster in enumerate(all_replies_clusters):
#     print("\nCluster {}, #{} Elements ".format(i+1, len(cluster)))
#     for j in cluster:
#         print("\t", all_replies_df.iloc[j]['text'])

In [18]:
num_posts_clustered = 0
for i in all_posts_clusters:
    num_posts_clustered += len(i)
print('Num posts clustered = {}'.format(num_posts_clustered))
print('Remaining num posts = {}'.format(all_posts_df.shape[0] - num_posts_clustered))
print('Total num post nodes = {}'.format(all_posts_df.shape[0] - num_posts_clustered + len(all_posts_clusters)))

Num posts clustered = 20798
Remaining num posts = 44219
Total num post nodes = 49005


In [14]:
cluster_post_ids = []
for i in all_posts_clusters:
    tmp_list = []
    for j in i:
        tmp_list.append(all_posts_df.iloc[j]['id'])
    cluster_post_ids.append(tmp_list)

all_posts_clusters_dict = {'cluster_id': ['post_cluster_' + str(i) for i in range(len(all_posts_clusters))],
                           'text': [all_posts_df.iloc[i[0]]['text'] for i in all_posts_clusters],
                           'post_id': cluster_post_ids,
                           'num_posts': [len(i) for i in all_posts_clusters]}

In [15]:
all_posts_clusters_df = pd.DataFrame(all_posts_clusters_dict)

In [16]:
all_posts_clusters_df

Unnamed: 0,cluster_id,text,post_id,num_posts
0,post_cluster_0,How is everyone’s day going?,"[8voq6u, fi7zg1, 638arm, 7555f1, aem2mz, ay8bi...",268
1,post_cluster_1,Does anybody want to talk?,"[745ahc, 8rpitj, 8ccu0s, 5xr44k, 6pblh9, aouvy...",197
2,post_cluster_2,Does anyone else ever feel the same?,"[8mbot2, 5t4cq2, 9gc6r1, 9l0t51, fxo8kh, ijkrs...",191
3,post_cluster_3,I finally got my first job!,"[benn1f, qq5ofe, dk5d7u, ilivj9, fbiyqx, hficf...",164
4,post_cluster_4,Let's chat.,"[6tivnq, 6zqggi, 6s1shl, mkgp0e, eyffxr, 5ou2u...",119
...,...,...,...,...
4781,post_cluster_4781,I watched Wimbledon for the first time last su...,"[hit:7499_conv:14999_uttr:1, hit:9923_conv:198...",2
4782,post_cluster_4782,Do you feel anything like that,"[hit:7546_conv:15093_uttr:1, 7b0emb]",2
4783,post_cluster_4783,I love volunteering.,"[hit:10706_conv:21412_uttr:1, hit:10708_conv:2...",2
4784,post_cluster_4784,I have been feeling great about life recently.,"[hit:7261_conv:14522_uttr:1, fa34o4]",2


In [17]:
all_posts_clusters_df.to_csv('../data/merged_q/all_posts_max_len_10_clusters.csv', index = False)

In [18]:
cluster_reply_ids = []
for i in all_replies_clusters:
    tmp_list = []
    for j in i:
        tmp_list.append(all_replies_df.iloc[j]['id'])
    cluster_reply_ids.append(tmp_list)
    
all_replies_clusters_dict = {'cluster_id': ['reply_cluster_' + str(i) for i in range(len(all_replies_clusters))],
                           'text': [all_replies_df.iloc[i[0]]['text'] for i in all_replies_clusters],
                           'reply_id': cluster_reply_ids,
                           'num_replies': [len(i) for i in all_replies_clusters]}

In [19]:
all_replies_clusters_df = pd.DataFrame(all_replies_clusters_dict)

In [20]:
all_replies_clusters_df

Unnamed: 0,cluster_id,text,reply_id,num_replies
0,reply_cluster_0,Congratulations!,"[gwgluk8, gwhavf3, gwd1yno, gweddd4, gw90rgv, ...",5575
1,reply_cluster_1,Good luck!,"[gwgfu44, gvnwix1, gvo1yc1, gvoaf4f, gvol5bj, ...",2043
2,reply_cluster_2,Happy birthday to you.,"[gwdw9bm, gw4myxl, gw55osc, gtnfy5r, gtng3kr, ...",1348
3,reply_cluster_3,Good job!,"[gwg7r2e, gwgcl7k, gw51stz, gvirk7h, gvivplo, ...",1223
4,reply_cluster_4,oh yes.,"[eyu2h3l, etgtnqs, fo1y2bv, drxzbb6, gtrx75r, ...",849
...,...,...,...,...
8311,reply_cluster_8311,Were you able to find someone?,"[hit:11488_conv:22977_uttr:2, fshzh4g]",2
8312,reply_cluster_8312,Did you study for it?,"[hit:11805_conv:23610_uttr:2, hit:2542_conv:50...",2
8313,reply_cluster_8313,How long has she been gone?,"[hit:4299_conv:8599_uttr:2, dw3sj23]",2
8314,reply_cluster_8314,"yuck, how gross!!","[hit:11345_conv:22691_uttr:2, hit:3471_conv:69...",2


In [35]:
# all_replies_clusters_df.to_csv('all_replies_clusters.csv', index = False)

# Visualization

In [21]:
pair_dict = {}
for i in range(len(all_replies_df)):
    event_id = all_replies_df.iloc[i]['parent_id']
    reply_id = all_replies_df.iloc[i]['id']
    if event_id in pair_dict:
        pair_dict[event_id].append(reply_id)
    else:
        pair_dict[event_id] = [reply_id]

In [22]:
event_to_cluster_dict = {}
for i in range(len(all_events_clusters_df)):
    event_cluster_id = all_events_clusters_df.iloc[i]['cluster_id']
    event_cluster_event_ids = all_events_clusters_df.iloc[i]['event_id']
    for j in event_cluster_event_ids:
        event_to_cluster_dict[j] = event_cluster_id

In [23]:
len(event_to_cluster_dict)

11579

In [25]:
reply_to_cluster_dict = {}
for i in range(len(all_replies_clusters_df)):
    reply_cluster_id = all_replies_clusters_df.iloc[i]['cluster_id']
    reply_cluster_reply_ids = all_replies_clusters_df.iloc[i]['reply_id']
    for j in reply_cluster_reply_ids:
        reply_to_cluster_dict[j] = reply_cluster_id

In [26]:
len(reply_to_cluster_dict)

58691

In [27]:
cluster_connection_mat = np.zeros((len(all_events_clusters_df), len(all_replies_clusters_df)))

In [28]:
for i in tqdm(range(len(all_replies_df))):
    event_id = all_replies_df.iloc[i]['parent_id']
    reply_id = all_replies_df.iloc[i]['id']
    if event_id in event_to_cluster_dict and reply_id in reply_to_cluster_dict:
        event_cluster_id = int(event_to_cluster_dict[event_id].split('_')[2])
        reply_cluster_id = int(reply_to_cluster_dict[reply_id].split('_')[2])
        cluster_connection_mat[event_cluster_id, reply_cluster_id] = 1

100%|██████████| 165759/165759 [01:10<00:00, 2345.85it/s]


In [29]:
cluster_connection_mat.shape

(2772, 8316)

In [170]:
connect_pairs = []
for i in range(cluster_connection_mat.shape[0])[70:100]:
    for j in range(cluster_connection_mat.shape[1])[50:800]:
        if cluster_connection_mat[i, j] == 1:
            connect_pairs.append(('event_cluster_' + str(i), 'reply_cluster_' + str(j)))

In [171]:
len(connect_pairs)

192

In [172]:
source_text_list = []
target_text_list = []
for i in connect_pairs:
    source_text_list.append(list(all_events_clusters_df[all_events_clusters_df['cluster_id'] == i[0]]['text'])[0])
    target_text_list.append(list(all_replies_clusters_df[all_replies_clusters_df['cluster_id'] == i[1]]['text'])[0])

In [173]:
network_df = pd.DataFrame({
    'source': source_text_list,
    'target': target_text_list
})

In [174]:
network_df

Unnamed: 0,source,target
0,Let's play a game.,How's your day going?
1,Let's play a game.,what's up?
2,Let's play a game.,What's your favorite movie?
3,Let's play a game.,What is your favourite color?
4,Let's play a game.,What is the meaning of life?
...,...,...
187,Lets chat!,That made my day
188,Lets chat!,Wish me luck :D
189,Lets chat!,Do you like anime?
190,Lets chat!,Howdy!


In [175]:
G = nx.from_pandas_edgelist(network_df, source = 'source', target = 'target')

In [176]:
source_nodes = set(list(network_df['source']))

In [177]:
net = Network('1000px', '1000px')

In [178]:
net.from_nx(G)

In [179]:
node_list = []
for i in net.nodes:
    if i['id'] in source_nodes:
        i['color'] = '#f4cccd'
    else:
        i['color'] = '#cde0f1'
    node_list.append(i)
net.nodes = node_list

In [180]:
edge_list = []
for i in net.edges:
    i['color'] = '#162347'
    edge_list.append(i)
net.edges = edge_list

In [181]:
net.show('example3.html')

In [46]:
# source_list = []
# source_text_list = []
# target_list = []
# target_text_list = []
# for i in tqdm(range(len(all_events_clusters_df))):
#     event_flag = 0
#     event_cluster_id = all_events_clusters_df.iloc[i]['cluster_id']
#     event_cluster_text = all_events_clusters_df.iloc[i]['text']
#     event_cluster_event_ids = all_events_clusters_df.iloc[i]['event_id']
#     for eid in event_cluster_event_ids:
#         for j in range(len(all_replies_clusters_df)):
#             reply_flag = 0
#             reply_cluster_id = all_replies_clusters_df.iloc[j]['cluster_id']
#             reply_cluster_text = all_replies_clusters_df.iloc[j]['text']
#             reply_cluster_reply_ids = all_replies_clusters_df.iloc[j]['reply_id']
#             for rid in reply_cluster_reply_ids:
#                 if rid in pair_dict[eid]:
#                     source_list.append(event_cluster_id)
#                     source_text_list.append(event_cluster_text)
#                     target_list.append(reply_cluster_id)
#                     target_text_list.append(reply_cluster_text)
#                     event_flag = 1
#                     reply_flag = 1
#                     break
#             if reply_flag == 1:
#                 break
#         if event_flag == 1:
#             break
                