## Preprocessing

In [5]:
import pandas as pd
import re

data = pd.read_csv('test_selenium.csv')
summaries = data['Summary']
analyses = data['Analysis']
titles = data['Title']

corpus = [f"{title} {summary} {analysis}" for title, summary, analysis in zip(titles, summaries, analyses)]

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    text = text.strip()
    return text

preprocessed_corpus = [preprocess_text(text) for text in corpus]

tokenized_corpus = [text.split() for text in preprocessed_corpus]

## Training Word Embedding Model To Obtain Embeddings

In [8]:
from gensim.models import Word2Vec

Word2Vec_model_sg = Word2Vec(tokenized_corpus, vector_size=100, window=5, min_count=1, workers=4, sg=1)

Word2Vec_model_sg.save('Word2Vec_model_sg')

In [10]:
test_vector = Word2Vec_model_sg.wv['darkness']
print(test_vector)

[-0.18757609 -0.02523065  0.10822345  0.25279427  0.09404436 -0.14556998
  0.31167802  0.56639236 -0.07748945 -0.11939991 -0.2481058  -0.37848467
 -0.0188977  -0.1670922  -0.10497189 -0.08356968 -0.0615345   0.10890697
 -0.10079912 -0.40617284 -0.00819069 -0.1524098   0.16265742 -0.03898332
 -0.06940196  0.04204768 -0.03904338 -0.06583715 -0.15557028  0.20045884
  0.3325572   0.00232724  0.14604664 -0.25277588 -0.20083642  0.21060328
  0.01625244 -0.34341818 -0.21040498 -0.30221328  0.05520817 -0.08441182
  0.0313545   0.01122725  0.27458957 -0.2581353  -0.3079668  -0.02439034
 -0.22528283  0.05035021 -0.07226291 -0.00409833 -0.03894206  0.06516527
  0.09041492 -0.08996686  0.08819439 -0.06942315  0.07562473  0.10474674
  0.00300599 -0.16347326  0.36412808  0.02851762 -0.01989603  0.17973474
  0.06861334  0.25913286 -0.15470749  0.06786061 -0.00504779  0.20889491
  0.2030788  -0.25446317  0.15321966  0.13806863  0.31823     0.15149638
 -0.43128183 -0.07699223 -0.12870276 -0.00841347  0

In [18]:
similar_words = Word2Vec_model_sg.wv.most_similar('friend')
print(similar_words)

[('brother', 0.8718932867050171), ('wife', 0.8443353176116943), ('servant', 0.8382858633995056), ('sister', 0.8214103579521179), ('aunt', 0.8185446858406067), ('girlfriend', 0.8092584609985352), ('acquaintance', 0.802879810333252), ('grandfather', 0.8003736138343811), ('roommate', 0.7951571345329285), ('companion', 0.7915104627609253)]


In [25]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def sentence_vector(sentence, model):
    words = sentence.split()
    valid_vectors = [model.wv[word] for word in words if word in model.wv]
    return np.mean(valid_vectors, axis=0) if valid_vectors else np.zeros(model.vector_size)

user_input = "I am feeling very sad and lonely. I don't have any friends to talk to."
preprocessed_input = preprocess_text(user_input)
input_vector = sentence_vector(preprocessed_input, Word2Vec_model_sg)

corpus_vectors = [sentence_vector(entry, Word2Vec_model_sg) for entry in preprocessed_corpus]
similarities = cosine_similarity([input_vector], corpus_vectors)

most_similar_index = np.argmax(similarities)
print(f"Most similar entry: {corpus[most_similar_index]}")


Most similar entry: the-killers George brings Max and Al their meals, but they can’t remember who ordered what. As they eat, they catch George looking at them. Al suggests that “maybe the boy meant it for a joke,” and George laughs. Max tells him not to laugh and George says alright. To Al, Max says “he thinks it’s all right,” and Al replies, “Oh, he’s a thinker.” Max and Al want to seem like they are in control of the situation and know what they are doing, but their confusion over who ordered what reveals how easy it is to fluster them (and shows them to be either a little stupid or unobservant—an inauspicious beginning for criminals). Frustrated by this confusion, they again emasculate George to put him in his place as their inferior.
