# without time & memory testing

In [1]:
# import re
# from nltk import word_tokenize, pos_tag, FreqDist
# from nltk.corpus import stopwords
# from nltk.corpus import wordnet as wn

# def is_noun(word):
#     return bool(wn.synsets(word, pos=wn.NOUN))

# def _basic_clean_text(text: str) -> str:
#     """
#     Standardize and clean raw text by:
#     - Lowercasing
#     - Removing punctuation
#     - Normalizing whitespace

#     Args:
#         text (str): Raw input text.

#     Returns:
#         str: Cleaned and normalized text.
#     """
#     text = text.lower()
#     text = re.sub(r"[^\w\s]", " ", text)  # Remove punctuation
#     text = re.sub(r"\s+", " ", text)      # Normalize whitespace
#     return text.strip()

# def extract_top_n_nouns_with_frequency(text: str, top_n: int = 50) -> dict[str, int]:
#     if not text or not isinstance(text, str):
#         raise ValueError("Input must be a non-empty string.")

#     text = _basic_clean_text(text)
#     tokens = word_tokenize(text)
#     tokens = [t for t in tokens if t.isalpha()]

#     stop_words = set(stopwords.words("english"))
#     tokens = [t for t in tokens if t not in stop_words]

#     tagged_tokens = pos_tag(tokens)

#     # Filter nouns and check if they are valid nouns in WordNet
#     nouns = [
#         word for word, tag in tagged_tokens
#         if tag.startswith("NN") and len(word) > 1 and is_noun(word)
#     ]

#     freq_dist = FreqDist(nouns)
#     sorted_nouns = freq_dist.most_common(top_n)

#     return dict(sorted_nouns)


# with time & memory testing

In [2]:
import re
import time
import tracemalloc
from collections import Counter
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords, wordnet as wn

# Pre-load stopwords and noun vocabulary once
STOP_WORDS = set(stopwords.words("english"))
ALL_NOUNS = {lemma.name().lower() for synset in wn.all_synsets(wn.NOUN) for lemma in synset.lemmas()}

def _basic_clean_text(text: str) -> str:
    text = text.lower()
    text = re.sub(r"[^\w\s]", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

def extract_top_n_nouns_with_frequency_v2(text: str, top_n: int, stop_words: set[str], all_nouns: set[str]) -> dict[str, int]:
    if not text or not isinstance(text, str):
        raise ValueError("Input must be a non-empty string.")

    cleaned = _basic_clean_text(text)
    tokens = word_tokenize(cleaned)
    tokens = [t for t in tokens if t.isalpha() and t not in stop_words]

    tagged_tokens = pos_tag(tokens)
    nouns = [
        word for word, tag in tagged_tokens
        if tag.startswith("NN") and len(word) > 1 and word in all_nouns
    ]

    freq = Counter(nouns)
    return dict(freq.most_common(top_n))

def timed_memory_profile(func, *args, **kwargs):
    tracemalloc.start()
    start_time = time.perf_counter()

    result = func(*args, **kwargs)

    end_time = time.perf_counter()
    current, peak = tracemalloc.get_traced_memory()
    tracemalloc.stop()

    print(f"Execution time: {end_time - start_time:.4f} seconds")
    print(f"Current memory usage: {current / 1024:.2f} KB")
    print(f"Peak memory usage: {peak / 1024:.2f} KB")

    return result

# Example usage:
# result = timed_memory_profile(
#     extract_top_n_nouns_with_frequency_v2, 
#     text_content, 
#     50, 
#     STOP_WORDS, 
#     ALL_NOUNS
# )


## 1. Archaic Tests (Although model was not made for this) - Used shakespeare book

In [3]:
with open('../static/uploads/Shakespeare-Complete-Works.txt') as f:
    text_content = f.read()

print(len(text_content))

5473240


In [10]:
result = timed_memory_profile(extract_top_n_nouns_with_frequency_v2, text_content, 1, STOP_WORDS, ALL_NOUNS)
print(result)

Execution time: 5.5504 seconds
Current memory usage: 112.29 KB
Peak memory usage: 14377.14 KB
{'psychology': 578}


# 2. A more modern test on Psychology book, which the noun tagger was made for

In [5]:
with open('../static/uploads/psychology_explained.txt') as f:
    text_content = f.read()

print(len(text_content))

873235


In [6]:
result = timed_memory_profile(extract_top_n_nouns_with_frequency_v2, text_content, 100, STOP_WORDS, ALL_NOUNS)
print(result)

Execution time: 5.4450 seconds
Current memory usage: 125.56 KB
Peak memory usage: 14376.14 KB
{'psychology': 578, 'people': 343, 'memory': 274, 'world': 218, 'personality': 216, 'approach': 215, 'behavior': 199, 'theory': 192, 'life': 189, 'intelligence': 184, 'time': 177, 'therapy': 168, 'study': 162, 'university': 161, 'research': 154, 'work': 152, 'development': 150, 'information': 145, 'way': 144, 'person': 140, 'mind': 135, 'experience': 130, 'brain': 129, 'child': 123, 'idea': 120, 'see': 120, 'process': 115, 'self': 114, 'psychologist': 106, 'response': 106, 'freud': 103, 'james': 101, 'environment': 98, 'skinner': 98, 'language': 96, 'feelings': 96, 'part': 94, 'things': 94, 'consciousness': 93, 'family': 93, 'behaviorism': 86, 'years': 86, 'john': 84, 'century': 84, 'war': 82, 'group': 82, 'change': 81, 'rogers': 81, 'perception': 80, 'term': 80, 'psychoanalysis': 79, 'stage': 78, 'order': 74, 'book': 73, 'man': 73, 'age': 71, 'ways': 71, 'ability': 71, 'example': 70, 'watson'

# 3. Test 3: A short corpus text of psychology

In [7]:
text_content2 = """Psychology is the scientific study of behavior and mental processes. It explores how individuals think, feel, and act both independently and within social contexts. Modern psychology covers a wide range of topics including cognition, emotion, motivation, personality, development, and mental health. Researchers use various methods such as experiments, observations, and surveys to understand human behavior. Advances in neuroscience have also deepened our understanding of the brain mechanisms underlying cognitive functions and emotional regulation. Applied psychology branches include clinical psychology, counseling, educational psychology, and industrial-organizational psychology, all aiming to improve well-being and performance across different settings.
"""

In [8]:
result = timed_memory_profile(extract_top_n_nouns_with_frequency_v2, text_content, 100, STOP_WORDS, ALL_NOUNS)
print(result)

Execution time: 5.4385 seconds
Current memory usage: 119.03 KB
Peak memory usage: 14376.08 KB
{'psychology': 578, 'people': 343, 'memory': 274, 'world': 218, 'personality': 216, 'approach': 215, 'behavior': 199, 'theory': 192, 'life': 189, 'intelligence': 184, 'time': 177, 'therapy': 168, 'study': 162, 'university': 161, 'research': 154, 'work': 152, 'development': 150, 'information': 145, 'way': 144, 'person': 140, 'mind': 135, 'experience': 130, 'brain': 129, 'child': 123, 'idea': 120, 'see': 120, 'process': 115, 'self': 114, 'psychologist': 106, 'response': 106, 'freud': 103, 'james': 101, 'environment': 98, 'skinner': 98, 'language': 96, 'feelings': 96, 'part': 94, 'things': 94, 'consciousness': 93, 'family': 93, 'behaviorism': 86, 'years': 86, 'john': 84, 'century': 84, 'war': 82, 'group': 82, 'change': 81, 'rogers': 81, 'perception': 80, 'term': 80, 'psychoanalysis': 79, 'stage': 78, 'order': 74, 'book': 73, 'man': 73, 'age': 71, 'ways': 71, 'ability': 71, 'example': 70, 'watson'

# Testing Time & Memory of the Preprocessing API

In [9]:
text = """Psychology is the scientific study of behavior and mental processes. ..."""

result = timed_memory_profile(extract_top_n_nouns_with_frequency_v2, text, 100, STOP_WORDS, ALL_NOUNS)
print(result)


Execution time: 0.0010 seconds
Current memory usage: 0.34 KB
Peak memory usage: 4.98 KB
{'psychology': 1, 'study': 1}
