# Necessary Path Settings

In [1]:
import sys
import os

# Add the parent of the current working directory (the project root) to
# sys.path so that the `src` package can be imported by later cells.
# The membership check keeps this cell idempotent: re-running it does not
# pile up duplicate entries in sys.path.
project_root = os.path.abspath("..")
if project_root not in sys.path:
    sys.path.append(project_root)

### Function Monitor

In [2]:
import time
import tracemalloc

def timed_memory_profile(func, *args, **kwargs):
    """Run ``func(*args, **kwargs)`` and print its wall time and traced memory.

    Memory figures come from ``tracemalloc``. If tracing was already active
    when this function is called, that session is left running and the
    reported figures also include whatever it had recorded so far.

    Args:
        func: Callable to profile.
        *args: Positional arguments forwarded to ``func``.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        Whatever ``func`` returns.

    Raises:
        Any exception raised by ``func`` propagates unchanged. No report is
        printed in that case, but tracing started here is still stopped.
    """
    # Only stop tracing if this call started it, so an enclosing tracemalloc
    # session is not shut down behind the caller's back.
    started_tracing = not tracemalloc.is_tracing()
    if started_tracing:
        tracemalloc.start()

    try:
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        end_time = time.perf_counter()
        current, peak = tracemalloc.get_traced_memory()
    finally:
        # Runs on failure too; otherwise tracing would stay enabled and keep
        # slowing down every later allocation in the kernel.
        if started_tracing:
            tracemalloc.stop()

    print(f"Execution time: {end_time - start_time:.4f} seconds")
    print(f"Current memory usage: {current / 1024:.2f} KB")
    print(f"Peak memory usage: {peak / 1024:.2f} KB")

    return result


### Static constants to load once

In [3]:
from nltk.corpus import stopwords, wordnet as wn
from src.services import (
    extract_top_n_nouns_with_frequency,
    get_sentences_with_target_word
)

# English stop-word list from NLTK, held as a set.
STOP_WORDS = set(stopwords.words("english"))

# Lower-cased lemma names gathered from every noun synset in WordNet.
ALL_NOUNS = set(
    lemma.name().lower()
    for synset in wn.all_synsets(wn.NOUN)
    for lemma in synset.lemmas()
)

## -> Book upload and text conversion

### Example 1: Psychology book

In [4]:
# Read the whole book into memory. The encoding is passed explicitly because
# open() otherwise falls back to a locale-dependent default, which can make
# the decoded text (and its length) differ between machines.
# NOTE(review): assumes the uploaded file is UTF-8 encoded — confirm.
with open('../static/uploads/psychology_explained.txt', encoding='utf-8') as f:
    text_content = f.read()

# Size of the loaded text in characters.
print(len(text_content))

873235


## -> Noun Extraction with frequency

In [5]:
# Rank nouns in the book text using the preloaded stop-word and noun sets.
# NOTE(review): the literal 10 is presumably the "top n" cutoff — confirm against src.services.
extract_top_n_nouns_with_frequency(text_content, 10, STOP_WORDS, ALL_NOUNS)

{'psychology': 621,
 'people': 338,
 'behavior': 316,
 'memory': 290,
 'theory': 254,
 'approach': 218,
 'personality': 215,
 'world': 211,
 'freud': 186,
 'intelligence': 185}

## -> Sentence Extraction with target word

In [6]:
# Word whose occurrences are collected here and reused by the later cells.
target_word = 'behavior'

# NOTE(review): 300 presumably caps the number of sentences returned — confirm against src.services.
sentences = get_sentences_with_target_word(text_content, target_word, 300)

Truncated sentence to 308 tokens: Konrad Lorenz 78 Behavior is shaped by positive and negative reinforcement B.F. Skinner 86 Stop imag...


In [7]:
# Number of sentences collected for the target word.
len(sentences)

300

In [8]:
# Spot-check a single extracted sentence.
sentences[98]

'however the results also revealed an interesting distinction between behavior learned by positive reinforcement and behavior elicited by negative stimuli'

## -> Similarity Matrix Generation

In [9]:
import torch
from transformers import BertModel, BertTokenizer
from src.models import DisambModel 

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pretrained uncased BERT: the tokenizer, and the encoder configured to
# return its hidden states.
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained(
    "bert-base-uncased",
    output_hidden_states=True,
)

# Project wrapper that receives the encoder, tokenizer and device together.
model = DisambModel(bert_model, bert_tokenizer, device)

# sentence_embeddings = []
# for sent in sentences:
#     try:
#         emb = model.forward(sent, target_word)
#         sentence_embeddings.append(emb)
#     except ValueError as e:
#         print(f"Skipping: {sent[:50]} — {e}")

# # (Optional) Stack for vector ops
# if sentence_embeddings:
#     embedding_matrix = torch.stack(sentence_embeddings)


In [10]:
from src.services import compute_cosine_similarity_matrix

# Build the similarity matrix for the extracted sentences and the target word.
# The helper returns a pair; only its first element is kept here.
# NOTE(review): presumably pairwise cosine similarity between per-sentence
# embeddings of the target word — confirm against src.services.
sim_matrix, _ = compute_cosine_similarity_matrix(
    sentences=sentences,
    target_word=target_word,
    model=model
)

In [11]:
# Shape of the similarity matrix.
sim_matrix.shape

torch.Size([300, 300])

## -> Clustering

In [12]:
from src.services import suggest_num_clusters_with_data

# Ask the helper for a cluster count based on the similarity matrix.
# NOTE(review): the result looks like (suggested k, candidate k values,
# one score per candidate k) — confirm against src.services.
suggest_num_clusters_with_data(sim_matrix)

(np.int64(3),
 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
 [648.7823486328125,
  326.50323486328125,
  263.55328369140625,
  216.40771484375,
  187.58094787597656,
  170.65052795410156,
  156.81398010253906,
  147.8011474609375,
  138.90151977539062,
  132.7833251953125])