In [1]:
# Imports

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

### Load Data

In [2]:
# Load data
# Read the cleansed Reddit posts into a DataFrame; the location is relative to
# the directory the notebook is run from.
DATASET_CSV = "../../dataset/reddit_cleansed_data.csv"
data = pd.read_csv(DATASET_CSV)

In [3]:
# Calculate weighted score (how good a post is):
# the raw score, plus 10 points per comment, plus 100 points per gilding.
comment_points = 10 * data['num_comments']
gild_points = 100 * data['gilded_count']
data['weighted_score'] = data['score'] + comment_points + gild_points
data

Unnamed: 0,title,selftext,score,num_comments,gilded_count,date,timestamp,weighted_score
0,"""Do not expose any part of your body to the air.""","""I repeat..this is not a drill..""",65.0,5.0,0,1.428090e+09,2015-04-03 19:47:13,115.0
1,"I sometimes remember the way he looked, broken...",I neglected to make sure he was dead.,22.0,0.0,0,1.428235e+09,2015-04-05 11:55:10,22.0
2,I live alone on the third floor of my apartmen...,So who opens my window every night while I'm s...,35.0,3.0,0,1.428370e+09,2015-04-07 01:24:42,65.0
3,"I heard the rain hitting my window, so I walke...","My window wasn't wet, but the glass was covere...",28.0,3.0,0,1.428385e+09,2015-04-07 05:40:55,58.0
4,You know how sometimes your brain plays tricks...,I caught one of those things today.,84.0,6.0,0,1.428563e+09,2015-04-09 07:03:16,144.0
...,...,...,...,...,...,...,...,...
94081,As I look thru at window I see something inhumane,My reflection helps me remember how well my su...,31.0,2.0,0,1.680377e+09,2023-04-01 19:21:54,51.0
94082,I’ve always been passionate about conspiracy t...,"So when my wife had twins, I knew exactly what...",27.0,8.0,0,1.680377e+09,2023-04-01 19:24:55,107.0
94083,"""You'll see me on the red carpet one day,"" sai...","So I paid her a surprise visit, and upon walki...",23.0,2.0,0,1.680378e+09,2023-04-01 19:38:03,43.0
94084,I could hear my sister screaming nearby as I s...,But my heart sank when I remembered the monste...,60.0,3.0,0,1.680378e+09,2023-04-01 19:41:01,90.0


In [4]:
# Build one text document per post: the title followed by the body text.
# Missing titles/bodies are replaced with '' first; otherwise str + NaN yields
# NaN for the whole document, which TfidfVectorizer rejects as invalid input.
dataset_stories = (data['title'].fillna('') + ' ' + data['selftext'].fillna('')).to_list()


In [5]:
# Hard-coded candidate stories that are compared against the dataset below.
# Each entry is a single string, matching the "title + ' ' + selftext" layout
# used for the dataset documents.
generated_stories = [
    "I ate hamburger. But I didn’t realize it was hamburger.",
    "I was horrified when I get my test results back. I was even more horrified when i found out that i was the only one in the country who had ever had a child.",
    "My parents told me not to go upstairs. They never told me what to do when i got up there.",
    "There was a ghost. I didn't know what to do when i saw it.",
    "I got out of bed this morning. I woke up in the middle of the night to the sound of my own voice saying “I’m here for you.”"
]

In [6]:
# Combine both for vectorization: dataset documents first, generated ones
# appended at the end, so the two groups can be separated again by position.
all_stories = [*dataset_stories, *generated_stories]


### Cosine Similarity (with TF-IDF)

In [7]:
# TF-IDF Vectorization
# The vectorizer (default settings) is fitted on the combined corpus, so the
# dataset and generated stories share one vocabulary and one set of IDF weights.
vectorizer = TfidfVectorizer()
# One row per entry of all_stories, in the same order as the input list.
tfidf_matrix = vectorizer.fit_transform(all_stories)


In [8]:
# Split the matrix into dataset and generated stories parts.
# Rows were stacked dataset-first, so the dataset length is the split point.
n_dataset_rows = len(dataset_stories)
dataset_matrix = tfidf_matrix[:n_dataset_rows]
generated_matrix = tfidf_matrix[n_dataset_rows:]


In [9]:
# Compute cosine similarity
# For every generated story, find the dataset story with the highest TF-IDF
# cosine similarity and report that story, its similarity, and its weighted score.
for generated_vec in generated_matrix:
    # Shape (1, n_dataset): similarity of this generated story to every dataset story.
    similarities = cosine_similarity(generated_vec, dataset_matrix)
    # argmax over the single row gives the position of the best match;
    # cast to a plain int so it is safe for list and positional indexing.
    most_similar_story_idx = int(similarities.argmax())
    most_similar_story = dataset_stories[most_similar_story_idx]
    print(f"Most similar story: {most_similar_story} with similarity score: {similarities[0, most_similar_story_idx]}")

    # Get df['weighted_score'] of that post.
    # The index is a row position (it comes from the matrix/list order), so use
    # .iloc; plain [] is a label lookup and would pick the wrong row or raise
    # KeyError if the DataFrame index is not the default 0..n-1 range.
    print(f"Weighted score: {data['weighted_score'].iloc[most_similar_story_idx]}")


Most similar story: My neighbor had a new grill, and I wanted it. So, I pounded his face into hamburger, and ripped it out of his mouth. with similarity score: 0.3935262343839005
Weighted score: 327.0
Most similar story: I was horrified when the police finally figured out where I had buried my victims. I was even more horrified when they found nothing. with similarity score: 0.5473659793876382
Weighted score: 59.0
Most similar story: My mom always told me not to talk to strangers. She never told me what to do if there was one inside my house. with similarity score: 0.4973127525003452
Weighted score: 42.0
Most similar story: I don't believe people who have said that they saw a ghost. Because if they saw a ghost just like how I saw one, they will never say a word about what they saw. with similarity score: 0.44578407604753745
Weighted score: 128.0
Most similar story: I woke up to the sound of my phone ringing in the middle of the night. When I answered, I heard my own voice on the other 