In [1]:
import os
import csv
import time
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util

# Read Encoded Data

In [2]:
# Load the precomputed embedding matrices (one row per post / reply) from disk.
all_posts_embed = np.load(file='../data/merged_q/all_posts_max_len_40_embed.npy')
all_replies_embed = np.load(file='../data/merged_q/all_replies_max_len_40_embed.npy')

In [3]:
# Sanity check: report the dimensions of both embedding matrices.
print('all_posts_embed shape:', np.shape(all_posts_embed))
print('all_replies_embed shape:', np.shape(all_replies_embed))

all_posts_embed shape: (152680, 768)
all_replies_embed shape: (838785, 768)


# Clustering Thresholds

In [4]:
# Similarity thresholds passed to community detection, from strictest to loosest.
thresholds = [
    0.95, 0.9, 0.85, 0.8, 0.75,
    0.7, 0.65, 0.6, 0.55, 0.5,
]

# Cluster the Posts

In [5]:
# Maps each threshold to the list of post communities found at that threshold.
all_posts_clusters = dict()

In [6]:
# Run community detection over the post embeddings once per threshold and
# report how long each run takes.
for threshold in thresholds:
    print('Clustering with threshold = {}...'.format(threshold))
    tic = time.time()
    communities = util.community_detection(
        all_posts_embed,
        threshold = threshold,
        min_community_size = 2,
    )
    all_posts_clusters[threshold] = communities
    elapsed = time.time() - tic
    print('Done after {:.2f} sec'.format(elapsed))

Clustering with threshold = 0.95...
Done after 42.39 sec
Clustering with threshold = 0.9...
Done after 61.19 sec
Clustering with threshold = 0.85...
Done after 71.40 sec
Clustering with threshold = 0.8...
Done after 84.41 sec
Clustering with threshold = 0.75...
Done after 104.51 sec
Clustering with threshold = 0.7...
Done after 136.67 sec
Clustering with threshold = 0.65...
Done after 168.56 sec
Clustering with threshold = 0.6...
Done after 201.43 sec


In [9]:
# Summarise, per threshold, how many posts ended up in a cluster and how many
# nodes remain once every cluster is collapsed into a single node.
total_posts = all_posts_embed.shape[0]
for threshold in thresholds:
    communities = all_posts_clusters[threshold]
    print('----- Threshold = {} -----'.format(threshold))
    num_posts_clustered = sum(len(members) for members in communities)
    remaining_posts = total_posts - num_posts_clustered
    print('Num clusters = {}'.format(len(communities)))
    print('Num posts clustered = {}'.format(num_posts_clustered))
    print('Remaining num posts = {}'.format(remaining_posts))
    print('Total num post nodes = {}'.format(remaining_posts + len(communities)))
    print()

----- Threshold = 0.95 -----
Num clusters = 3421
Num posts clustered = 10748
Remaining num posts = 141932
Total num post nodes = 145353

----- Threshold = 0.9 -----
Num clusters = 5024
Num posts clustered = 17180
Remaining num posts = 135500
Total num post nodes = 140524

----- Threshold = 0.85 -----
Num clusters = 6902
Num posts clustered = 25521
Remaining num posts = 127159
Total num post nodes = 134061

----- Threshold = 0.8 -----
Num clusters = 9095
Num posts clustered = 36613
Remaining num posts = 116067
Total num post nodes = 125162

----- Threshold = 0.75 -----
Num clusters = 11138
Num posts clustered = 49482
Remaining num posts = 103198
Total num post nodes = 114336

----- Threshold = 0.7 -----
Num clusters = 12786
Num posts clustered = 62681
Remaining num posts = 89999
Total num post nodes = 102785

----- Threshold = 0.65 -----
Num clusters = 12848
Num posts clustered = 72830
Remaining num posts = 79850
Total num post nodes = 92698

----- Threshold = 0.6 -----
Num clusters = 1

In [10]:
# Persist the threshold -> clusters mapping for the posts.
with open('../data/merged_q/all_posts_max_len_40_clusters.pickle', mode='wb') as out_file:
    pickle.dump(all_posts_clusters, out_file)

### More: cluster the posts at the remaining thresholds (0.55 and 0.5)

In [5]:
# Restore the post clusters saved by an earlier session.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open('../data/merged_q/all_posts_max_len_40_clusters.pickle', mode='rb') as in_file:
    all_posts_clusters = pickle.load(in_file)

In [6]:
# Resume clustering of the posts: only thresholds missing from the restored
# results are computed. Without this guard the cell re-clusters every threshold
# in `thresholds`, repeating the long runs whose results were just loaded from
# the pickle.
for threshold in thresholds:
    if threshold in all_posts_clusters:
        # Result was produced by an earlier run and restored from disk.
        print('Skipping threshold = {} (already clustered)'.format(threshold))
        continue
    print('Clustering with threshold = {}...'.format(threshold))
    start_time = time.time()
    all_posts_clusters[threshold] = util.community_detection(all_posts_embed,
                                                             min_community_size = 2,
                                                             threshold = threshold)
    print('Done after {:.2f} sec'.format(time.time() - start_time))

Clustering with threshold = 0.55...
Done after 241.90 sec
Clustering with threshold = 0.5...
Done after 323.48 sec


In [7]:
# Summarise, per threshold, how many posts ended up in a cluster and how many
# nodes remain once every cluster is collapsed into a single node.
total_posts = all_posts_embed.shape[0]
for threshold in thresholds:
    communities = all_posts_clusters[threshold]
    print('----- Threshold = {} -----'.format(threshold))
    num_posts_clustered = sum(len(members) for members in communities)
    remaining_posts = total_posts - num_posts_clustered
    print('Num clusters = {}'.format(len(communities)))
    print('Num posts clustered = {}'.format(num_posts_clustered))
    print('Remaining num posts = {}'.format(remaining_posts))
    print('Total num post nodes = {}'.format(remaining_posts + len(communities)))
    print()

----- Threshold = 0.55 -----
Num clusters = 7942
Num posts clustered = 72628
Remaining num posts = 80052
Total num post nodes = 87994

----- Threshold = 0.5 -----
Num clusters = 4767
Num posts clustered = 64103
Remaining num posts = 88577
Total num post nodes = 93344



In [8]:
# Overwrite the saved post clusters with the extended threshold -> clusters mapping.
with open('../data/merged_q/all_posts_max_len_40_clusters.pickle', mode='wb') as out_file:
    pickle.dump(all_posts_clusters, out_file)

# Cluster the Replies (1/2)

In [5]:
# Total number of reply embeddings (rows of the matrix).
N = len(all_replies_embed)

In [6]:
# First half of the replies (rows 0 .. N//2 - 1); the halves are clustered separately.
all_replies_embed_1 = all_replies_embed[:N // 2]
print('all_replies_embed_1 shape:', np.shape(all_replies_embed_1))

all_replies_embed_1 shape: (419392, 768)


In [15]:
# Maps each threshold to the communities found in the first half of the replies.
all_replies_clusters_1 = dict()

In [16]:
# Run community detection over the first half of the reply embeddings once per
# threshold and report how long each run takes.
for threshold in thresholds:
    print('Clustering with threshold = {}...'.format(threshold))
    tic = time.time()
    communities = util.community_detection(
        all_replies_embed_1,
        threshold = threshold,
        min_community_size = 2,
    )
    all_replies_clusters_1[threshold] = communities
    elapsed = time.time() - tic
    print('Done after {:.2f} sec'.format(elapsed))

Clustering with threshold = 0.95...
Done after 596.98 sec
Clustering with threshold = 0.9...
Done after 694.03 sec
Clustering with threshold = 0.85...
Done after 896.21 sec
Clustering with threshold = 0.8...
Done after 941.83 sec
Clustering with threshold = 0.75...
Done after 1138.90 sec
Clustering with threshold = 0.7...
Done after 1340.64 sec
Clustering with threshold = 0.65...
Done after 1547.62 sec
Clustering with threshold = 0.6...
Done after 1890.49 sec


In [17]:
# Summarise, per threshold, how many first-half replies were clustered and how
# many nodes remain once every cluster is collapsed into a single node.
total_replies_1 = all_replies_embed_1.shape[0]
for threshold in thresholds:
    communities = all_replies_clusters_1[threshold]
    print('----- Threshold = {} -----'.format(threshold))
    num_replies_clustered = sum(len(members) for members in communities)
    remaining_replies = total_replies_1 - num_replies_clustered
    print('Num clusters = {}'.format(len(communities)))
    print('Num replies clustered = {}'.format(num_replies_clustered))
    print('Remaining num replies = {}'.format(remaining_replies))
    print('Total num replies nodes = {}'.format(remaining_replies + len(communities)))
    print()

----- Threshold = 0.95 -----
Num clusters = 9421
Num replies clustered = 57781
Remaining num replies = 361611
Total num replies nodes = 371032

----- Threshold = 0.9 -----
Num clusters = 11597
Num replies clustered = 69355
Remaining num replies = 350037
Total num replies nodes = 361634

----- Threshold = 0.85 -----
Num clusters = 14885
Num replies clustered = 85384
Remaining num replies = 334008
Total num replies nodes = 348893

----- Threshold = 0.8 -----
Num clusters = 19777
Num replies clustered = 107965
Remaining num replies = 311427
Total num replies nodes = 331204

----- Threshold = 0.75 -----
Num clusters = 26190
Num replies clustered = 141081
Remaining num replies = 278311
Total num replies nodes = 304501

----- Threshold = 0.7 -----
Num clusters = 31740
Num replies clustered = 178557
Remaining num replies = 240835
Total num replies nodes = 272575

----- Threshold = 0.65 -----
Num clusters = 33543
Num replies clustered = 210632
Remaining num replies = 208760
Total num replies n

In [18]:
# Persist the threshold -> clusters mapping for the first half of the replies.
with open('../data/merged_q/all_replies_max_len_40_clusters/all_replies_clusters_1.pickle', mode='wb') as out_file:
    pickle.dump(all_replies_clusters_1, out_file)

### More: cluster the first half of the replies at the remaining thresholds (0.55 and 0.5)

In [9]:
# Rebuild the first-half slice of the reply embeddings (rows 0 .. N//2 - 1).
N = len(all_replies_embed)
all_replies_embed_1 = all_replies_embed[:N // 2]
print('all_replies_embed_1 shape:', np.shape(all_replies_embed_1))

all_replies_embed_1 shape: (419392, 768)


In [7]:
# Restore the first-half reply clusters saved by an earlier session.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open('../data/merged_q/all_replies_max_len_40_clusters/all_replies_clusters_1.pickle', mode='rb') as in_file:
    all_replies_clusters_1 = pickle.load(in_file)

In [11]:
# Resume clustering of the first half of the replies: only thresholds missing
# from the restored results are computed. Without this guard the cell
# re-clusters every threshold in `thresholds`, repeating the long runs whose
# results were just loaded from the pickle.
for threshold in thresholds:
    if threshold in all_replies_clusters_1:
        # Result was produced by an earlier run and restored from disk.
        print('Skipping threshold = {} (already clustered)'.format(threshold))
        continue
    print('Clustering with threshold = {}...'.format(threshold))
    start_time = time.time()
    all_replies_clusters_1[threshold] = util.community_detection(all_replies_embed_1,
                                                                 min_community_size = 2,
                                                                 threshold = threshold)
    print('Done after {:.2f} sec'.format(time.time() - start_time))

Clustering with threshold = 0.55...
Done after 2193.91 sec
Clustering with threshold = 0.5...
Done after 2467.25 sec


In [12]:
# Summarise, per threshold, how many first-half replies were clustered and how
# many nodes remain once every cluster is collapsed into a single node.
total_replies_1 = all_replies_embed_1.shape[0]
for threshold in thresholds:
    communities = all_replies_clusters_1[threshold]
    print('----- Threshold = {} -----'.format(threshold))
    num_replies_clustered = sum(len(members) for members in communities)
    remaining_replies = total_replies_1 - num_replies_clustered
    print('Num clusters = {}'.format(len(communities)))
    print('Num replies clustered = {}'.format(num_replies_clustered))
    print('Remaining num replies = {}'.format(remaining_replies))
    print('Total num replies nodes = {}'.format(remaining_replies + len(communities)))
    print()

----- Threshold = 0.55 -----
Num clusters = 20189
Num replies clustered = 213364
Remaining num replies = 206028
Total num replies nodes = 226217

----- Threshold = 0.5 -----
Num clusters = 10693
Num replies clustered = 182362
Remaining num replies = 237030
Total num replies nodes = 247723



In [13]:
# Overwrite the saved first-half reply clusters with the extended mapping.
with open('../data/merged_q/all_replies_max_len_40_clusters/all_replies_clusters_1.pickle', mode='wb') as out_file:
    pickle.dump(all_replies_clusters_1, out_file)

# Cluster the Replies (2/2)

In [8]:
# Total number of reply embeddings (rows of the matrix).
N = len(all_replies_embed)

In [9]:
# Second half of the replies (rows N//2 .. N - 1); the halves are clustered separately.
all_replies_embed_2 = all_replies_embed[N // 2:]
print('all_replies_embed_2 shape:', np.shape(all_replies_embed_2))

all_replies_embed_2 shape: (419393, 768)


In [21]:
# Maps each threshold to the communities found in the second half of the replies.
all_replies_clusters_2 = dict()

In [22]:
# Run community detection over the second half of the reply embeddings once per
# threshold and report how long each run takes.
for threshold in thresholds:
    print('Clustering with threshold = {}...'.format(threshold))
    tic = time.time()
    communities = util.community_detection(
        all_replies_embed_2,
        threshold = threshold,
        min_community_size = 2,
    )
    all_replies_clusters_2[threshold] = communities
    elapsed = time.time() - tic
    print('Done after {:.2f} sec'.format(elapsed))

Clustering with threshold = 0.95...
Done after 634.72 sec
Clustering with threshold = 0.9...
Done after 646.88 sec
Clustering with threshold = 0.85...
Done after 733.77 sec
Clustering with threshold = 0.8...
Done after 858.22 sec
Clustering with threshold = 0.75...
Done after 1042.77 sec
Clustering with threshold = 0.7...
Done after 1247.28 sec
Clustering with threshold = 0.65...
Done after 1489.71 sec
Clustering with threshold = 0.6...
Done after 1459.54 sec


In [23]:
# Summarise, per threshold, how many second-half replies were clustered and how
# many nodes remain once every cluster is collapsed into a single node.
total_replies_2 = all_replies_embed_2.shape[0]
for threshold in thresholds:
    communities = all_replies_clusters_2[threshold]
    print('----- Threshold = {} -----'.format(threshold))
    num_replies_clustered = sum(len(members) for members in communities)
    remaining_replies = total_replies_2 - num_replies_clustered
    print('Num clusters = {}'.format(len(communities)))
    print('Num replies clustered = {}'.format(num_replies_clustered))
    print('Remaining num replies = {}'.format(remaining_replies))
    print('Total num replies nodes = {}'.format(remaining_replies + len(communities)))
    print()

----- Threshold = 0.95 -----
Num clusters = 9219
Num replies clustered = 47244
Remaining num replies = 372149
Total num replies nodes = 381368

----- Threshold = 0.9 -----
Num clusters = 11644
Num replies clustered = 58180
Remaining num replies = 361213
Total num replies nodes = 372857

----- Threshold = 0.85 -----
Num clusters = 15530
Num replies clustered = 74747
Remaining num replies = 344646
Total num replies nodes = 360176

----- Threshold = 0.8 -----
Num clusters = 21082
Num replies clustered = 100087
Remaining num replies = 319306
Total num replies nodes = 340388

----- Threshold = 0.75 -----
Num clusters = 27880
Num replies clustered = 134660
Remaining num replies = 284733
Total num replies nodes = 312613

----- Threshold = 0.7 -----
Num clusters = 33544
Num replies clustered = 174781
Remaining num replies = 244612
Total num replies nodes = 278156

----- Threshold = 0.65 -----
Num clusters = 35281
Num replies clustered = 207180
Remaining num replies = 212213
Total num replies n

In [24]:
# Persist the threshold -> clusters mapping for the second half of the replies.
with open('../data/merged_q/all_replies_max_len_40_clusters/all_replies_clusters_2.pickle', mode='wb') as out_file:
    pickle.dump(all_replies_clusters_2, out_file)

### More: cluster the second half of the replies at the remaining thresholds (0.55 and 0.5)

In [14]:
# Rebuild the second-half slice of the reply embeddings (rows N//2 .. N - 1).
N = len(all_replies_embed)
all_replies_embed_2 = all_replies_embed[N // 2:]
print('all_replies_embed_2 shape:', np.shape(all_replies_embed_2))

all_replies_embed_2 shape: (419393, 768)


In [10]:
# Restore the second-half reply clusters saved by an earlier session.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open('../data/merged_q/all_replies_max_len_40_clusters/all_replies_clusters_2.pickle', mode='rb') as in_file:
    all_replies_clusters_2 = pickle.load(in_file)

In [16]:
# Resume clustering of the second half of the replies: only thresholds missing
# from the restored results are computed. Without this guard the cell
# re-clusters every threshold in `thresholds`, repeating the long runs whose
# results were just loaded from the pickle.
for threshold in thresholds:
    if threshold in all_replies_clusters_2:
        # Result was produced by an earlier run and restored from disk.
        print('Skipping threshold = {} (already clustered)'.format(threshold))
        continue
    print('Clustering with threshold = {}...'.format(threshold))
    start_time = time.time()
    all_replies_clusters_2[threshold] = util.community_detection(all_replies_embed_2,
                                                                 min_community_size = 2,
                                                                 threshold = threshold)
    print('Done after {:.2f} sec'.format(time.time() - start_time))

Clustering with threshold = 0.55...
Done after 1775.09 sec
Clustering with threshold = 0.5...
Done after 2218.94 sec


In [17]:
# Summarise, per threshold, how many second-half replies were clustered and how
# many nodes remain once every cluster is collapsed into a single node.
total_replies_2 = all_replies_embed_2.shape[0]
for threshold in thresholds:
    communities = all_replies_clusters_2[threshold]
    print('----- Threshold = {} -----'.format(threshold))
    num_replies_clustered = sum(len(members) for members in communities)
    remaining_replies = total_replies_2 - num_replies_clustered
    print('Num clusters = {}'.format(len(communities)))
    print('Num replies clustered = {}'.format(num_replies_clustered))
    print('Remaining num replies = {}'.format(remaining_replies))
    print('Total num replies nodes = {}'.format(remaining_replies + len(communities)))
    print()

----- Threshold = 0.55 -----
Num clusters = 20437
Num replies clustered = 206356
Remaining num replies = 213037
Total num replies nodes = 233474

----- Threshold = 0.5 -----
Num clusters = 10627
Num replies clustered = 176834
Remaining num replies = 242559
Total num replies nodes = 253186



In [18]:
# Overwrite the saved second-half reply clusters with the extended mapping.
with open('../data/merged_q/all_replies_max_len_40_clusters/all_replies_clusters_2.pickle', mode='wb') as out_file:
    pickle.dump(all_replies_clusters_2, out_file)

# Cluster the Replies (Centroids)

In [11]:
# Maps each threshold to the second-stage clusters built over first-stage cluster representatives.
all_replies_clusters = dict()

In [12]:
# Second-stage clustering (centroid variant): every first-stage cluster from
# both halves is represented by the embedding of its first member, and those
# representatives are clustered again at the same threshold.
# (The sentence-transformers docs describe the first index of each community
# as its central point.)
for threshold in thresholds:
    print('----- Threshold = {} -----'.format(threshold))
    start_time = time.time()
    # Order matters: representatives of half 1 come first, then those of half 2.
    representatives = [all_replies_embed_1[c[0]] for c in all_replies_clusters_1[threshold]]
    representatives += [all_replies_embed_2[c[0]] for c in all_replies_clusters_2[threshold]]
    replies_embed = np.array(representatives)
    print('replies_embed shape:', replies_embed.shape)
    merged = util.community_detection(
        replies_embed,
        threshold = threshold,
        min_community_size = 1,
    )
    all_replies_clusters[threshold] = merged
    print('Num clusters:', len(all_replies_clusters[threshold]))
    print('Done after {:.2f} sec'.format(time.time() - start_time))
    print()

----- Threshold = 0.95 -----
replies_embed shape: (18640, 768)
Num clusters: 15723
Done after 5.57 sec

----- Threshold = 0.9 -----
replies_embed shape: (23241, 768)
Num clusters: 19873
Done after 7.99 sec

----- Threshold = 0.85 -----
replies_embed shape: (30415, 768)
Num clusters: 26368
Done after 11.52 sec

----- Threshold = 0.8 -----
replies_embed shape: (40859, 768)
Num clusters: 35542
Done after 23.52 sec

----- Threshold = 0.75 -----
replies_embed shape: (54070, 768)
Num clusters: 46987
Done after 36.73 sec

----- Threshold = 0.7 -----
replies_embed shape: (65284, 768)
Num clusters: 56529
Done after 54.63 sec

----- Threshold = 0.65 -----
replies_embed shape: (68824, 768)
Num clusters: 59409
Done after 58.33 sec

----- Threshold = 0.6 -----
replies_embed shape: (59677, 768)
Num clusters: 51610
Done after 45.79 sec

----- Threshold = 0.55 -----
replies_embed shape: (40626, 768)
Num clusters: 35326
Done after 22.33 sec

----- Threshold = 0.5 -----
replies_embed shape: (21320, 768)

In [13]:
# Maps each threshold to the merged clusters expressed as row indices into the full reply matrix.
all_replies_clusters_combine_1 = dict()

In [14]:
# Row count of the first half; used as the index offset for second-half replies.
N_1 = len(all_replies_embed_1)
N_1

419392

In [15]:
# Expand every second-stage cluster back into reply row indices. A second-stage
# member i below len(clusters_1) refers to first-half cluster i; anything else
# refers to a second-half cluster, whose local indices are shifted by N_1.
for threshold in thresholds:
    clusters_1 = all_replies_clusters_1[threshold]
    clusters_2 = all_replies_clusters_2[threshold]
    num_first_half = len(clusters_1)
    clusters = []
    for merged in all_replies_clusters[threshold]:
        members = []
        for i in merged:
            if i >= num_first_half:
                members.extend(j + N_1 for j in clusters_2[i - num_first_half])
            else:
                members.extend(clusters_1[i])
        clusters.append(members)
    all_replies_clusters_combine_1[threshold] = clusters
    print('Threshold = {}, num clusters = {}'.format(threshold, len(clusters)))

Threshold = 0.95, num clusters = 15723
Threshold = 0.9, num clusters = 19873
Threshold = 0.85, num clusters = 26368
Threshold = 0.8, num clusters = 35542
Threshold = 0.75, num clusters = 46987
Threshold = 0.7, num clusters = 56529
Threshold = 0.65, num clusters = 59409
Threshold = 0.6, num clusters = 51610
Threshold = 0.55, num clusters = 35326
Threshold = 0.5, num clusters = 18747


In [16]:
# Persist the merged reply clusters produced by the centroid variant.
with open('../data/merged_q/all_replies_max_len_40_clusters/all_replies_clusters_combine_1_centroid.pickle', mode='wb') as out_file:
    pickle.dump(all_replies_clusters_combine_1, out_file)

# Cluster the Replies (Mean)

In [17]:
# Reset the second-stage results before running the mean-embedding variant.
all_replies_clusters = dict()

In [18]:
# Second-stage clustering (mean variant): every first-stage cluster from both
# halves is represented by the mean of its members' embeddings, and those
# representatives are clustered again at the same threshold.
for threshold in thresholds:
    print('----- Threshold = {} -----'.format(threshold))
    start_time = time.time()
    # Order matters: representatives of half 1 come first, then those of half 2.
    representatives = [all_replies_embed_1[c].mean(axis = 0) for c in all_replies_clusters_1[threshold]]
    representatives += [all_replies_embed_2[c].mean(axis = 0) for c in all_replies_clusters_2[threshold]]
    replies_embed = np.array(representatives)
    print('replies_embed shape:', replies_embed.shape)
    merged = util.community_detection(
        replies_embed,
        threshold = threshold,
        min_community_size = 1,
    )
    all_replies_clusters[threshold] = merged
    print('Num clusters:', len(all_replies_clusters[threshold]))
    print('Done after {:.2f} sec'.format(time.time() - start_time))
    print()

----- Threshold = 0.95 -----
replies_embed shape: (18640, 768)
Num clusters: 15592
Done after 6.21 sec

----- Threshold = 0.9 -----
replies_embed shape: (23241, 768)
Num clusters: 18902
Done after 8.93 sec

----- Threshold = 0.85 -----
replies_embed shape: (30415, 768)
Num clusters: 23223
Done after 13.06 sec

----- Threshold = 0.8 -----
replies_embed shape: (40859, 768)
Num clusters: 27346
Done after 25.07 sec

----- Threshold = 0.75 -----
replies_embed shape: (54070, 768)
Num clusters: 28542
Done after 38.09 sec

----- Threshold = 0.7 -----
replies_embed shape: (65284, 768)
Num clusters: 24995
Done after 52.84 sec

----- Threshold = 0.65 -----
replies_embed shape: (68824, 768)
Num clusters: 18302
Done after 57.22 sec

----- Threshold = 0.6 -----
replies_embed shape: (59677, 768)
Num clusters: 11309
Done after 50.00 sec

----- Threshold = 0.55 -----
replies_embed shape: (40626, 768)
Num clusters: 5872
Done after 24.98 sec

----- Threshold = 0.5 -----
replies_embed shape: (21320, 768)


In [19]:
# Reset the merged-cluster mapping before expanding the mean-variant results.
all_replies_clusters_combine_1 = dict()

In [20]:
# Row count of the first half; used as the index offset for second-half replies.
N_1 = len(all_replies_embed_1)
N_1

419392

In [21]:
# Expand every second-stage cluster back into reply row indices. A second-stage
# member i below len(clusters_1) refers to first-half cluster i; anything else
# refers to a second-half cluster, whose local indices are shifted by N_1.
for threshold in thresholds:
    clusters_1 = all_replies_clusters_1[threshold]
    clusters_2 = all_replies_clusters_2[threshold]
    num_first_half = len(clusters_1)
    clusters = []
    for merged in all_replies_clusters[threshold]:
        members = []
        for i in merged:
            if i >= num_first_half:
                members.extend(j + N_1 for j in clusters_2[i - num_first_half])
            else:
                members.extend(clusters_1[i])
        clusters.append(members)
    all_replies_clusters_combine_1[threshold] = clusters
    print('Threshold = {}, num clusters = {}'.format(threshold, len(clusters)))

Threshold = 0.95, num clusters = 15592
Threshold = 0.9, num clusters = 18902
Threshold = 0.85, num clusters = 23223
Threshold = 0.8, num clusters = 27346
Threshold = 0.75, num clusters = 28542
Threshold = 0.7, num clusters = 24995
Threshold = 0.65, num clusters = 18302
Threshold = 0.6, num clusters = 11309
Threshold = 0.55, num clusters = 5872
Threshold = 0.5, num clusters = 2486


In [22]:
# Persist the merged reply clusters produced by the mean variant.
with open('../data/merged_q/all_replies_max_len_40_clusters/all_replies_clusters_combine_1_mean.pickle', mode='wb') as out_file:
    pickle.dump(all_replies_clusters_combine_1, out_file)

# Check: export random samples from the clusters for manual inspection

In [23]:
# Reply table used to look up the rows behind each cluster index.
all_replies_df = pd.read_csv(filepath_or_buffer='../data/merged_q/all_replies_max_len_40.csv')

In [24]:
# Load the merged reply clusters of the mean variant.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open('../data/merged_q/all_replies_max_len_40_clusters/all_replies_clusters_combine_1_mean.pickle', mode='rb') as in_file:
    all_replies_clusters_combine_1_mean = pickle.load(in_file)

In [28]:
# Spot-check the mean-variant reply clusters: for every threshold, pick up to
# 10 random clusters with at least 10 members, take 10 random replies from each
# and write them to one CSV per threshold.
# Assumes row i of all_replies_df corresponds to row i of the reply embedding
# matrix - TODO confirm.
rng = np.random.default_rng(0)  # fixed seed so the exported samples are reproducible
for threshold in thresholds:
    clusters = all_replies_clusters_combine_1_mean[threshold]
    all_idx = [i for i, members in enumerate(clusters) if len(members) >= 10]
    if not all_idx:
        # pd.concat raises on an empty list, so skip thresholds without large clusters.
        print('Threshold = {}: no cluster with >= 10 members, skipping'.format(threshold))
        continue
    # Sampling 10 without replacement raises ValueError when fewer than 10
    # clusters qualify, so cap the sample size.
    idx = rng.choice(all_idx, size = min(10, len(all_idx)), replace = False)
    sampled_frames = []
    for i in idx:
        rows = rng.choice(clusters[i], size = 10, replace = False)
        sampled_frames.append(all_replies_df.iloc[rows])
    mean_sample_df = pd.concat(sampled_frames)
    mean_sample_df.to_csv('../data/merged_q/all_replies_max_len_40_clusters/samples/mean_sample_df_{:.2f}.csv'.format(threshold), index = False)

In [29]:
# Load the merged reply clusters of the centroid variant.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open('../data/merged_q/all_replies_max_len_40_clusters/all_replies_clusters_combine_1_centroid.pickle', mode='rb') as in_file:
    all_replies_clusters_combine_1_centroid = pickle.load(in_file)

In [30]:
# Spot-check the centroid-variant reply clusters: for every threshold, pick up
# to 10 random clusters with at least 10 members, take 10 random replies from
# each and write them to one CSV per threshold.
# Assumes row i of all_replies_df corresponds to row i of the reply embedding
# matrix - TODO confirm.
rng = np.random.default_rng(0)  # fixed seed so the exported samples are reproducible
for threshold in thresholds:
    clusters = all_replies_clusters_combine_1_centroid[threshold]
    all_idx = [i for i, members in enumerate(clusters) if len(members) >= 10]
    if not all_idx:
        # pd.concat raises on an empty list, so skip thresholds without large clusters.
        print('Threshold = {}: no cluster with >= 10 members, skipping'.format(threshold))
        continue
    # Sampling 10 without replacement raises ValueError when fewer than 10
    # clusters qualify, so cap the sample size.
    idx = rng.choice(all_idx, size = min(10, len(all_idx)), replace = False)
    sampled_frames = []
    for i in idx:
        rows = rng.choice(clusters[i], size = 10, replace = False)
        sampled_frames.append(all_replies_df.iloc[rows])
    centroid_sample_df = pd.concat(sampled_frames)
    centroid_sample_df.to_csv('../data/merged_q/all_replies_max_len_40_clusters/samples/centroid_sample_df_{:.2f}.csv'.format(threshold), index = False)

In [31]:
# Post table used to look up the rows behind each cluster index.
all_posts_df = pd.read_csv(filepath_or_buffer='../data/merged_q/all_posts_max_len_40.csv')

In [32]:
# Load the post clusters for the spot check.
# The clustering cells of this notebook write the post clusters to
# '../data/merged_q/all_posts_max_len_40_clusters.pickle', not to the location
# read here. Keep the original location as first choice, but fall back to the
# file this notebook actually writes so a fresh run does not fail with
# FileNotFoundError.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
posts_clusters_path = '../data/merged_q/all_posts_max_len_40_clusters/all_posts_clusters.pickle'
if not os.path.exists(posts_clusters_path):
    posts_clusters_path = '../data/merged_q/all_posts_max_len_40_clusters.pickle'
with open(posts_clusters_path, 'rb') as f:
    all_posts_clusters = pickle.load(f)

In [34]:
# Spot-check the post clusters: for every threshold, pick up to 10 random
# clusters with at least 10 members, take 10 random posts from each and write
# them to one CSV per threshold.
# Assumes row i of all_posts_df corresponds to row i of the post embedding
# matrix - TODO confirm.
rng = np.random.default_rng(0)  # fixed seed so the exported samples are reproducible
for threshold in thresholds:
    clusters = all_posts_clusters[threshold]
    all_idx = [i for i, members in enumerate(clusters) if len(members) >= 10]
    if not all_idx:
        # pd.concat raises on an empty list, so skip thresholds without large clusters.
        print('Threshold = {}: no cluster with >= 10 members, skipping'.format(threshold))
        continue
    # Sampling 10 without replacement raises ValueError when fewer than 10
    # clusters qualify, so cap the sample size.
    idx = rng.choice(all_idx, size = min(10, len(all_idx)), replace = False)
    sampled_frames = []
    for i in idx:
        rows = rng.choice(clusters[i], size = 10, replace = False)
        sampled_frames.append(all_posts_df.iloc[rows])
    sample_df = pd.concat(sampled_frames)
    sample_df.to_csv('../data/merged_q/all_posts_max_len_40_clusters/samples/sample_df_{:.2f}.csv'.format(threshold), index = False)