In [1]:
import pandas as pd
df_all= pd.read_json('../data/_reddit-posts-gatherer-en.submissions_subset.json')
df_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32165 entries, 0 to 32164
Data columns (total 24 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   clicked                 14656 non-null  float64       
 1   created_utc             32165 non-null  int64         
 2   date                    32165 non-null  datetime64[ns]
 3   distinguished           3 non-null      object        
 4   edited                  14656 non-null  float64       
 5   id                      32165 non-null  object        
 6   is_original_content     23387 non-null  float64       
 7   is_text_only            32165 non-null  bool          
 8   link_flair_template_id  0 non-null      float64       
 9   link_flair_text         0 non-null      float64       
 10  locked                  30580 non-null  float64       
 11  name                    14656 non-null  object        
 12  num_comments            32165 non-null  int64 

In [69]:
df = df_all[['body', 'url', 'title']].copy()

df['body'] = (
    df['body']
    .str.replace('\n', '. ', regex=False)               
    .str.replace(r'\s+', ' ', regex=True)                
    .str.replace(r'\.\s*\.', '.', regex=True)             
    .str.strip()                                            
)

df["text"] = df["title"].str.strip() + " " + df["body"].str.strip()
df = df[["text", "url"]]

df = df.dropna()
# df = df.drop_duplicates()

df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32165 entries, 0 to 32164
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    32165 non-null  object
 1   url     32165 non-null  object
dtypes: object(2)
memory usage: 502.7+ KB


In [70]:
df

Unnamed: 0,text,url
0,I got my dad a beautiful piece of art for his ...,https://www.reddit.com/r/depression/comments/g...
1,- It's one of those nights where I want to kil...,https://www.reddit.com/r/depression/comments/g...
2,Is it alright to have a C (75) in my grade car...,https://www.reddit.com/r/depression/comments/k...
3,help How do you all deal with feelings of isol...,https://www.reddit.com/r/depression/comments/i...
4,Tired of feeling like a failure... There are p...,https://www.reddit.com/r/depression/comments/g...
...,...,...
32160,I have a fucked up story I feel I need to shar...,https://www.reddit.com/r/depression/comments/2...
32161,I dont know what to do Even when I should be h...,https://www.reddit.com/r/depression/comments/2...
32162,"The Day It Almost Ended. January 8th, 2014. Th...",https://www.reddit.com/r/depression/comments/1...
32163,Spending New Year's Eve with a fellow redditor...,https://www.reddit.com/r/depression/comments/1...


In [71]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")

sentences = df['text'].to_list()

embeddings = model.encode(sentences,
                          batch_size=64)
print(embeddings.shape)


(32165, 384)


In [72]:
print("Max Sequence Length:", model.max_seq_length)

Max Sequence Length: 256


In [73]:
similarities = model.similarity(embeddings, embeddings)
print(similarities.shape)

torch.Size([32165, 32165])


In [None]:
import random
import torch
import numpy as np

def mine_triplets_simple(similarities, sentences, margin=0.3):
    """
    Simple and robust semi-hard triplet mining
    args:
        -similarities: matrix of similatriy of the sentences embeddings
        -sentences: list of sentences/ pargraphs
        -margin: margin off semmi hard mining (how close the negative to the postive) 
    
    """
    triplets = []
    n_samples = len(sentences)
    
    for anchor_idx in range(n_samples):
        sims = similarities[anchor_idx].clone()
        sims[anchor_idx] = -1  # Exclude self
        
        # Find positive: closest to thje  sample embedding
        pos_idx = sims.argmax().item()
        pos_sim = sims[pos_idx].item()
        
        
        mask = (sims < pos_sim) & (sims > pos_sim - margin) & (torch.arange(len(sims)) != pos_idx)
        candidate_neg_indices = torch.where(mask)[0]
        
        if len(candidate_neg_indices) == 0:
            # Fallback to hardest negative (closest among negatives)
            remaining_mask = torch.arange(len(sims)) != pos_idx
            remaining_indices = torch.where(remaining_mask)[0]
            if len(remaining_indices) > 0:
                neg_idx = remaining_indices[sims[remaining_indices].argmax()].item()
            else:
                continue
        else:
            neg_idx = random.choice(candidate_neg_indices.tolist())
            
        
        triplets.append({
            "anchor_idx": anchor_idx,
            "positive_idx": pos_idx,
            "negative_idx": neg_idx,
            "anchor": sentences[anchor_idx],
            "positive": sentences[pos_idx],
            "negative": sentences[neg_idx],
            "pos_sim": pos_sim,
            "neg_sim": sims[neg_idx].item(),
            "margin_violation": pos_sim - sims[neg_idx].item() < margin
        })
    
    return triplets

triplets = mine_triplets_simple(similarities, sentences, margin=0.3)

print(f"Generated {len(triplets)} triplets")
print(f"Margin violations: {sum(t['margin_violation'] for t in triplets)}")


pos_sims = [t['pos_sim'] for t in triplets]
neg_sims = [t['neg_sim'] for t in triplets]
margins = [t['pos_sim'] - t['neg_sim'] for t in triplets]
 
print(f"\nStatistics:")
print(f"Positive similarities: {np.mean(pos_sims):.4f} ± {np.std(pos_sims):.4f}")
print(f"Negative similarities: {np.mean(neg_sims):.4f} ± {np.std(neg_sims):.4f}")
print(f"Actual margins: {np.mean(margins):.4f} ± {np.std(margins):.4f}")

Generated 32165 triplets
Margin violations: 32152

Statistics:
Positive similarities: 0.7152 ± 0.0759
Negative similarities: 0.4789 ± 0.0959
Actual margins: 0.2364 ± 0.0515


In [75]:
triplet_df = pd.DataFrame.from_dict(triplets)
triplet_df

Unnamed: 0,anchor_idx,positive_idx,negative_idx,anchor,positive,negative,pos_sim,neg_sim,margin_violation
0,0,15263,16623,I got my dad a beautiful piece of art for his ...,Nothing has changed Not much time has passed s...,Cannot even come up with a title anymore.. -Th...,0.652456,0.434251,True
1,1,17917,2167,- It's one of those nights where I want to kil...,The only reason I haven't killed myself is the...,I did not ask to be alive Life is just going d...,0.729542,0.481335,True
2,2,3249,10820,Is it alright to have a C (75) in my grade car...,My school is worried about my attendance. Ther...,Anyone else relate? For some background inform...,0.726155,0.510943,True
3,3,27305,16882,help How do you all deal with feelings of isol...,Realizing you are completely and utterly alone...,I don't really know how i feel Apologies if th...,0.675287,0.495427,True
4,4,29572,7755,Tired of feeling like a failure... There are p...,failure I dunno how everyone seems to do well ...,I'm hopeless I'm 18 y/o kissless virgin. I've ...,0.736482,0.491487,True
...,...,...,...,...,...,...,...,...,...
32160,32160,9862,8305,I have a fucked up story I feel I need to shar...,"My story I've never really posted here, and I ...",Need some help with this depression or whateve...,0.727004,0.604711,True
32161,32161,20970,2061,I dont know what to do Even when I should be h...,Holy fuck Nothing makes me happy. I’m in colle...,Feels Like All My Passion is Just Gone My life...,0.814365,0.522068,True
32162,32162,31361,9685,"The Day It Almost Ended. January 8th, 2014. Th...",My Story.(I want someone to know) If you asked...,I'm lonely and don't know what to do about it ...,0.669649,0.407014,True
32163,32163,9095,849,Spending New Year's Eve with a fellow redditor...,Hello Anybody want to spend New Year's talking...,This has definitely been my banner year. For m...,0.628374,0.383603,True


In [89]:
only_triple_df=triplet_df[['anchor','positive','negative']].copy()
only_triple_df

Unnamed: 0,anchor,positive,negative
0,I got my dad a beautiful piece of art for his ...,Nothing has changed Not much time has passed s...,Cannot even come up with a title anymore.. -Th...
1,- It's one of those nights where I want to kil...,The only reason I haven't killed myself is the...,I did not ask to be alive Life is just going d...
2,Is it alright to have a C (75) in my grade car...,My school is worried about my attendance. Ther...,Anyone else relate? For some background inform...
3,help How do you all deal with feelings of isol...,Realizing you are completely and utterly alone...,I don't really know how i feel Apologies if th...
4,Tired of feeling like a failure... There are p...,failure I dunno how everyone seems to do well ...,I'm hopeless I'm 18 y/o kissless virgin. I've ...
...,...,...,...
32160,I have a fucked up story I feel I need to shar...,"My story I've never really posted here, and I ...",Need some help with this depression or whateve...
32161,I dont know what to do Even when I should be h...,Holy fuck Nothing makes me happy. I’m in colle...,Feels Like All My Passion is Just Gone My life...
32162,"The Day It Almost Ended. January 8th, 2014. Th...",My Story.(I want someone to know) If you asked...,I'm lonely and don't know what to do about it ...
32163,Spending New Year's Eve with a fellow redditor...,Hello Anybody want to spend New Year's talking...,This has definitely been my banner year. For m...


In [90]:
print(len(df), len(only_triple_df),len(triplet_df))

32165 32165 32165


In [91]:
final_df = df.merge(only_triple_df, left_index=True, right_index=True, how='left')
final_df

Unnamed: 0,text,url,anchor,positive,negative
0,I got my dad a beautiful piece of art for his ...,https://www.reddit.com/r/depression/comments/g...,I got my dad a beautiful piece of art for his ...,Nothing has changed Not much time has passed s...,Cannot even come up with a title anymore.. -Th...
1,- It's one of those nights where I want to kil...,https://www.reddit.com/r/depression/comments/g...,- It's one of those nights where I want to kil...,The only reason I haven't killed myself is the...,I did not ask to be alive Life is just going d...
2,Is it alright to have a C (75) in my grade car...,https://www.reddit.com/r/depression/comments/k...,Is it alright to have a C (75) in my grade car...,My school is worried about my attendance. Ther...,Anyone else relate? For some background inform...
3,help How do you all deal with feelings of isol...,https://www.reddit.com/r/depression/comments/i...,help How do you all deal with feelings of isol...,Realizing you are completely and utterly alone...,I don't really know how i feel Apologies if th...
4,Tired of feeling like a failure... There are p...,https://www.reddit.com/r/depression/comments/g...,Tired of feeling like a failure... There are p...,failure I dunno how everyone seems to do well ...,I'm hopeless I'm 18 y/o kissless virgin. I've ...
...,...,...,...,...,...
32160,I have a fucked up story I feel I need to shar...,https://www.reddit.com/r/depression/comments/2...,I have a fucked up story I feel I need to shar...,"My story I've never really posted here, and I ...",Need some help with this depression or whateve...
32161,I dont know what to do Even when I should be h...,https://www.reddit.com/r/depression/comments/2...,I dont know what to do Even when I should be h...,Holy fuck Nothing makes me happy. I’m in colle...,Feels Like All My Passion is Just Gone My life...
32162,"The Day It Almost Ended. January 8th, 2014. Th...",https://www.reddit.com/r/depression/comments/1...,"The Day It Almost Ended. January 8th, 2014. Th...",My Story.(I want someone to know) If you asked...,I'm lonely and don't know what to do about it ...
32163,Spending New Year's Eve with a fellow redditor...,https://www.reddit.com/r/depression/comments/1...,Spending New Year's Eve with a fellow redditor...,Hello Anybody want to spend New Year's talking...,This has definitely been my banner year. For m...


In [80]:
final_df.shape

(32165, 5)

In [81]:
final_df.isna().sum()


text        0
url         0
anchor      0
positive    0
negative    0
dtype: int64

In [92]:
final_df.drop(columns=['anchor'],inplace=True)
final_df

Unnamed: 0,text,url,positive,negative
0,I got my dad a beautiful piece of art for his ...,https://www.reddit.com/r/depression/comments/g...,Nothing has changed Not much time has passed s...,Cannot even come up with a title anymore.. -Th...
1,- It's one of those nights where I want to kil...,https://www.reddit.com/r/depression/comments/g...,The only reason I haven't killed myself is the...,I did not ask to be alive Life is just going d...
2,Is it alright to have a C (75) in my grade car...,https://www.reddit.com/r/depression/comments/k...,My school is worried about my attendance. Ther...,Anyone else relate? For some background inform...
3,help How do you all deal with feelings of isol...,https://www.reddit.com/r/depression/comments/i...,Realizing you are completely and utterly alone...,I don't really know how i feel Apologies if th...
4,Tired of feeling like a failure... There are p...,https://www.reddit.com/r/depression/comments/g...,failure I dunno how everyone seems to do well ...,I'm hopeless I'm 18 y/o kissless virgin. I've ...
...,...,...,...,...
32160,I have a fucked up story I feel I need to shar...,https://www.reddit.com/r/depression/comments/2...,"My story I've never really posted here, and I ...",Need some help with this depression or whateve...
32161,I dont know what to do Even when I should be h...,https://www.reddit.com/r/depression/comments/2...,Holy fuck Nothing makes me happy. I’m in colle...,Feels Like All My Passion is Just Gone My life...
32162,"The Day It Almost Ended. January 8th, 2014. Th...",https://www.reddit.com/r/depression/comments/1...,My Story.(I want someone to know) If you asked...,I'm lonely and don't know what to do about it ...
32163,Spending New Year's Eve with a fellow redditor...,https://www.reddit.com/r/depression/comments/1...,Hello Anybody want to spend New Year's talking...,This has definitely been my banner year. For m...


In [None]:
final_df.to_csv('../data/r_depression_posts.csv',index=False)

NameError: name 'final_df' is not defined