In [1]:
# Imports

import string
import pandas as pd

In [2]:
# Jaccard Similarity Helper Funcs

def tokenize(text):
    """Return the set of distinct lowercase words found in ``text``.

    The text is lowercased, every character listed in ``string.punctuation``
    (ASCII punctuation only) is deleted, and the remainder is split on
    whitespace. Duplicate words collapse because the result is a set.
    """
    # Translation table that maps each ASCII punctuation character to "deleted"
    strip_punctuation = str.maketrans('', '', string.punctuation)
    normalized = text.lower().translate(strip_punctuation)
    return {word for word in normalized.split()}

def jaccard_similarity(set1, set2):
    """Return the Jaccard similarity |set1 ∩ set2| / |set1 ∪ set2|.

    Parameters:
        set1: a set of tokens.
        set2: a set (or any iterable) of tokens to compare against ``set1``.

    Returns:
        A float in [0.0, 1.0]. When both inputs are empty the union is empty
        and the ratio is undefined; 0.0 is returned in that case so that two
        token-less texts are never reported as similar (and no
        ZeroDivisionError is raised).
    """
    union = set1.union(set2)
    if not union:
        # Both inputs are empty: avoid dividing by zero.
        return 0.0
    intersection = set1.intersection(set2)
    return len(intersection) / len(union)

### Load Data

In [3]:
# Read the posts CSV into a DataFrame; the path is relative to the
# notebook's working directory.

data = pd.read_csv(filepath_or_buffer="../../dataset/reddit_cleansed_data.csv")


In [4]:
# Weighted score (how good a post is): the raw score plus 10 points per
# comment and 100 points per gilding.
comment_points = 10 * data['num_comments']
gilded_points = 100 * data['gilded_count']
data['weighted_score'] = data['score'] + comment_points + gilded_points
data


Unnamed: 0,title,selftext,score,num_comments,gilded_count,date,timestamp,weighted_score
0,"""Do not expose any part of your body to the air.""","""I repeat..this is not a drill..""",65.0,5.0,0,1.428090e+09,2015-04-03 19:47:13,115.0
1,"I sometimes remember the way he looked, broken...",I neglected to make sure he was dead.,22.0,0.0,0,1.428235e+09,2015-04-05 11:55:10,22.0
2,I live alone on the third floor of my apartmen...,So who opens my window every night while I'm s...,35.0,3.0,0,1.428370e+09,2015-04-07 01:24:42,65.0
3,"I heard the rain hitting my window, so I walke...","My window wasn't wet, but the glass was covere...",28.0,3.0,0,1.428385e+09,2015-04-07 05:40:55,58.0
4,You know how sometimes your brain plays tricks...,I caught one of those things today.,84.0,6.0,0,1.428563e+09,2015-04-09 07:03:16,144.0
...,...,...,...,...,...,...,...,...
94081,As I look thru at window I see something inhumane,My reflection helps me remember how well my su...,31.0,2.0,0,1.680377e+09,2023-04-01 19:21:54,51.0
94082,I’ve always been passionate about conspiracy t...,"So when my wife had twins, I knew exactly what...",27.0,8.0,0,1.680377e+09,2023-04-01 19:24:55,107.0
94083,"""You'll see me on the red carpet one day,"" sai...","So I paid her a surprise visit, and upon walki...",23.0,2.0,0,1.680378e+09,2023-04-01 19:38:03,43.0
94084,I could hear my sister screaming nearby as I s...,But my heart sank when I remembered the monste...,60.0,3.0,0,1.680378e+09,2023-04-01 19:41:01,90.0


In [5]:
# Each story is the post title followed by its body, joined by one space;
# the list keeps the same order as the rows of `data`.
combined_text = data['title'] + ' ' + data['selftext']
dataset_stories = combined_text.tolist()


### Calculate Jaccard Similarity

In [11]:
# Sample data
# user_story = 'I ate hamburger. But I didn’t realize it was hamburger.'  # Replace with the user-generated story
user_story = '"I was horrified when I get my test results back. I was even more horrified when i found out that i was the only one in the country who had ever had a child."'

# Tokenize the user story once; the same token set is compared to every story
user_tokens = tokenize(user_story)

# Best match found so far. The position is tracked as well so the score can
# be read straight from `data` afterwards instead of searching for the text.
max_similarity = 0
most_similar_story = ''
most_similar_index = None

# Compare with each story in the dataset
for story_index, story in enumerate(dataset_stories):
    similarity = jaccard_similarity(user_tokens, tokenize(story))
    # Strict '>' keeps the earliest story when several tie for the maximum
    if similarity > max_similarity:
        max_similarity = similarity
        most_similar_story = story
        most_similar_index = story_index

print(f"Most similar story: {most_similar_story} with Jaccard similarity: {max_similarity}")
if most_similar_index is None:
    # No story shares a single token with the user story, so there is no row
    # to look up (indexing an empty selection would raise IndexError).
    print("Most similar story score: N/A (no story shares a word with the user story)")
else:
    # dataset_stories was built row-by-row from `data`, so list positions and
    # row positions line up and .iloc selects the matching row.
    print(f"Most similar story score: {data['weighted_score'].iloc[most_similar_index]}")

Most similar story: The test results came in. I was the father. with Jaccard similarity: 0.2857142857142857
Most similar story score: 79.0
