In [None]:
import warnings
# Ignore all Python warnings from here on so they do not clutter the cell outputs.
# NOTE(review): this also hides warnings that may point at real problems.
warnings.filterwarnings('ignore')

Installing the required packages and importing the libraries

In [None]:
!pip install nltk gensim pywsd



In [None]:
import re
from collections import Counter

import nltk
import numpy as np
import requests
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

Downloading the necessary resources for the nltk library

In [None]:
# Fetch the NLTK data packages for this notebook. Packages that are already
# installed are reported as up-to-date instead of being downloaded again.
# Tokenizer data — presumably what sent_tokenize / word_tokenize need; confirm.
nltk.download('punkt')
nltk.download('punkt_tab')
# Stop-word lists (read via stopwords.words('english')).
nltk.download('stopwords')
# WordNet data — presumably required by WordNetLemmatizer; confirm.
nltk.download('wordnet')
nltk.download('omw-1.4')
# Tag-set resources. NOTE(review): no visible cell uses these two — confirm they are needed.
nltk.download('universal_tagset')
nltk.download('tagsets')
# POS-tagger models — presumably what nltk.pos_tag loads; confirm.
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package tagsets to /root/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_dat

True

In [None]:
# Downloading the text of the book
url = "https://www.gutenberg.org/files/11/11-0.txt"
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() fails loudly instead of parsing an HTTP error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
# The file is UTF-8; pin the encoding so decoding does not depend on the
# charset the server happens to declare (a wrong guess garbles quotes/dashes).
response.encoding = 'utf-8'
text = response.text
# Pruning official information (licence header and footer)
start_match = re.search(r'CHAPTER I\s*\n\s*[A-Z]', text)
if start_match:
    start = start_match.start()
else:
    start = text.find("CHAPTER I")
if start == -1:
    # Slicing with -1 would silently keep only the tail of the file.
    raise ValueError("Could not find 'CHAPTER I' in the downloaded text")
end = text.find("THE END")
if end == -1:
    # No end marker: keep everything instead of dropping the last character.
    end = len(text)
text = text[start:end]
# Dividing it into chapters.
# NOTE(review): the fixed offset 13 presumably skips the empty leading piece
# plus the 12 table-of-contents entries, which holds only when the fallback
# find() above located the table of contents — confirm if the source changes.
chapters = re.split(r'CHAPTER [IVXLCDM]+', text)[13:]
if not chapters:
    raise ValueError("No chapters were extracted; check the chapter markers and the slice offset")

Preprocessing the text: reducing it to lowercase, removing stop words and applying lemmatization.

In [None]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


def preprocess(text):
    """Normalize raw text into a space-separated string of lemmas.

    The text is lower-cased, every character other than a-z and whitespace is
    removed, and the remainder is tokenized. Stop words and tokens of one or
    two characters are dropped; the surviving tokens are lemmatized.
    """
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    lemmas = []
    for token in word_tokenize(letters_only):
        if token in stop_words or len(token) <= 2:
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return ' '.join(lemmas)


processed_chapters = list(map(preprocess, chapters))

Finding the 10 most important words for each chapter using the TF-IDF matrix

Then generating a potential title for each Chapter

In [None]:
def thematic_tfidf_analysis(chapters, max_features=20, top_n=10):
    """Rank the most characteristic terms of every chapter with TF-IDF.

    Args:
        chapters: iterable of chapter texts. Each one is cleaned with
            ``preprocess`` here, so the result depends only on the argument
            and not on a global computed in another cell.
        max_features: vocabulary size kept by the vectorizer.
        top_n: maximum number of keywords returned per chapter.

    Returns:
        A list with one entry per chapter: its keywords ordered from the
        highest to the lowest weighted TF-IDF score. Terms whose score is
        zero (i.e. absent from the chapter) are left out, so an entry may
        hold fewer than ``top_n`` words.
    """
    carroll_stop_words = ['alice', 'said', 'little', 'quite', 'rather', 'always', 'never', 'something', 'anything', 'everything', 'nothing']
    # Generic narrative verbs whose score is reduced below.
    narrative_verbs = {'thought', 'began', 'turned', 'seemed'}

    def _term_weight(term):
        # Longer terms get a 20% bonus; generic narrative verbs a 20% penalty.
        bonus = 1.2 if len(term) > 5 else 1.0
        if term in narrative_verbs:
            bonus *= 0.8
        return bonus

    documents = [preprocess(chapter) for chapter in chapters]
    vectorizer = TfidfVectorizer(
        max_features=max_features,
        stop_words=carroll_stop_words,
        min_df=1,
        max_df=0.6,
        ngram_range=(1, 3)
    )
    tfidf_matrix = vectorizer.fit_transform(documents)
    feature_names = vectorizer.get_feature_names_out()
    # The weights depend only on the term, so compute them once for all chapters.
    term_weights = np.array([_term_weight(term) for term in feature_names])

    chapter_keywords = []
    for row in tfidf_matrix:
        weighted_scores = row.toarray().flatten() * term_weights
        ranked = weighted_scores.argsort()[::-1][:top_n]
        # A zero score means the term does not occur in this chapter at all.
        chapter_keywords.append([feature_names[j] for j in ranked if weighted_scores[j] > 0])

    return chapter_keywords

def generate_meaningful_titles(chapter_keywords, chapters_original):
    """Turn each chapter's ranked keywords into a short title string.

    Among the first five keywords, up to three are picked that are longer
    than four characters and do not end in 'ing', 'ed' or 'ly'. If fewer than
    two qualify, the first three keywords are used instead. The chosen words
    are title-cased and prefixed with the 1-based chapter number.

    ``chapters_original`` is accepted but not used.
    """
    excluded_suffixes = ('ing', 'ed', 'ly')

    def _title_words(keywords):
        picked = [
            term for term in keywords[:5]
            if len(term) > 4 and not term.endswith(excluded_suffixes)
        ][:3]
        return picked if len(picked) >= 2 else keywords[:3]

    return [
        f"Chapter {number}: {' '.join(_title_words(keywords)).title()}"
        for number, keywords in enumerate(chapter_keywords, start=1)
    ]

thematic_keywords = thematic_tfidf_analysis(chapters)
thematic_titles = generate_meaningful_titles(thematic_keywords, chapters)

# Show each chapter's keywords followed by the title built from them.
for number, (keywords, title) in enumerate(zip(thematic_keywords, thematic_titles), start=1):
    print(f"Chapter {number}: {keywords}")
    print(f"Title: {title}\n")

Chapter 1: ['door', 'rabbit', 'cat', 'mouse', 'replied', 'three', 'turtle', 'queen', 'march hare', 'march']
Title: Chapter 1: Rabbit Mouse

Chapter 2: ['mouse', 'cat', 'rabbit', 'door', 'duchess', 'replied', 'turtle', 'three', 'march hare', 'queen']
Title: Chapter 2: Mouse Rabbit Duchess

Chapter 3: ['mouse', 'course', 'replied', 'cat', 'three', 'turtle', 'rabbit', 'queen', 'march hare', 'march']
Title: Chapter 3: Mouse Course Three

Chapter 4: ['rabbit', 'door', 'duchess', 'mouse', 'caterpillar', 'three', 'replied', 'turtle', 'march hare', 'queen']
Title: Chapter 4: Rabbit Duchess Mouse

Chapter 5: ['caterpillar', 'replied', 'three', 'door', 'rabbit', 'turtle', 'queen', 'mouse', 'march hare', 'march']
Title: Chapter 5: Caterpillar Three Rabbit

Chapter 6: ['cat', 'duchess', 'door', 'march', 'march hare', 'hatter', 'hare', 'queen', 'replied', 'rabbit']
Title: Chapter 6: Duchess March March Hare

Chapter 7: ['hatter', 'dormouse', 'march hare', 'march', 'hare', 'replied', 'course', 'thre

The TF-IDF algorithm has successfully identified the thematic structure of the work, generating meaningful chapter titles based on key characters and objects. The method demonstrated the ability to track the development of the plot through the appearance and disappearance of specific characters, but retained limitations in the form of some repetition of common terms and the mechanistic nature of the titles, which, although reflecting the substantive elements of the chapters, do not always convey their semantic depth or emotional coloring.

Let's try another method - Word2vec to compare the results

In [None]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from collections import Counter
import string
import numpy as np

Creating the function with Word2vec method to find 10 important words and generate chapters' titles

In [None]:
def _content_tokens(sentence, ignored=frozenset({'alice', 'said'})):
    """Tokenize one sentence into lower-case content words for Word2Vec."""
    tokens = []
    for raw_token in word_tokenize(sentence.lower()):
        # Keep only the alphabetic runs of a token, so punctuation or markup
        # glued to a word is dropped: "yet." -> "yet", "_what_" -> "what".
        for word in re.findall(r'[^\W\d_]+', raw_token):
            if len(word) > 2 and word not in stop_words and word not in ignored:
                tokens.append(word)
    return tokens


def chapter_analysis_word2vec(chapters, top_n=10, seed=42):
    """Pick keywords and a title for every chapter with a Word2Vec model.

    A single model is trained on the cleaned sentences of all chapters. For
    each chapter, the vectors of its most frequent in-vocabulary words are
    averaged, and the vocabulary words closest to that mean which also occur
    in the chapter become its keywords.

    Args:
        chapters: iterable of raw chapter texts.
        top_n: number of keywords wanted per chapter.
        seed: random seed passed to Word2Vec.

    Returns:
        (chapter_keywords, chapter_titles, model): two lists with one entry
        per chapter and the trained Word2Vec model. A title is
        "Chapter N: <first three keywords>", or plain "Chapter N" when a
        fallback path was taken.
    """
    chapter_sentences = []
    for chapter in chapters:
        tokenized = (_content_tokens(sentence) for sentence in sent_tokenize(chapter))
        chapter_sentences.append([tokens for tokens in tokenized if tokens])

    # Combining the data for training
    all_sentences = [sentence for chapter in chapter_sentences for sentence in chapter]

    # A fixed seed with a single worker thread keeps training repeatable
    # between runs; gensim additionally requires a fixed PYTHONHASHSEED for
    # full determinism across interpreter launches.
    model = Word2Vec(
        sentences=all_sentences,
        vector_size=100,
        window=5,
        min_count=2,
        workers=1,
        epochs=50,
        seed=seed
    )
    print(f"Dict size: {len(model.wv.key_to_index)}")

    # Very common words that carry no thematic information.
    generic_words = {'said', 'like', 'would', 'could', 'one', 'went', 'go'}

    chapter_keywords_w2v = []
    chapter_titles_w2v = []
    for number, sentences in enumerate(chapter_sentences, start=1):
        # Analysis of word frequency
        word_freq = Counter(word for sentence in sentences for word in sentence)

        if not word_freq:
            print(f"Chapter {number}: no matching words")
            chapter_keywords_w2v.append([f"word_{j}" for j in range(1, top_n + 1)])
            chapter_titles_w2v.append(f"Chapter {number}")
            continue

        # Frequency ranking, used for every fallback and to top up short lists.
        by_frequency = [word for word, _ in word_freq.most_common(max(20, top_n))]
        in_vocab = [word for word, _ in word_freq.most_common(50) if word in model.wv.key_to_index]

        if not in_vocab:
            print(f"Chapter {number}: no words in model")
            # Return the distinct frequent words as they are; padding the list
            # by repetition would only duplicate keywords.
            chapter_keywords_w2v.append(by_frequency[:top_n])
            chapter_titles_w2v.append(f"Chapter {number}")
            continue

        mean_vector = np.mean([model.wv[word] for word in in_vocab[:20]], axis=0)
        try:
            similar_words = model.wv.similar_by_vector(mean_vector, topn=max(30, 3 * top_n))
        except Exception as e:
            # Best effort: fall back to plain frequency if the query fails.
            print(f"Error in analysis {number}: {e}")
            chapter_keywords_w2v.append(by_frequency[:top_n])
            chapter_titles_w2v.append(f"Chapter {number}")
            continue

        # Keep only neighbours that actually occur in this chapter. The
        # vocabulary comes from cleaned tokens, so stop words and very short
        # words cannot appear here.
        candidates = [
            (word, score, word_freq[word])
            for word, score in similar_words
            if word_freq[word] >= 1 and word not in generic_words
        ]
        # Rank by 70% similarity and 30% in-chapter frequency (capped at 10 uses).
        candidates.sort(key=lambda item: item[1] * 0.7 + min(item[2] / 10, 1) * 0.3, reverse=True)
        top_keywords = [word for word, _, _ in candidates[:top_n]]

        # Top up with the most frequent chapter words when too few neighbours survive.
        for word in by_frequency:
            if len(top_keywords) >= top_n:
                break
            if word not in top_keywords:
                top_keywords.append(word)

        chapter_keywords_w2v.append(top_keywords)
        title = " ".join(top_keywords[:3]).title()
        chapter_titles_w2v.append(f"Chapter {number}: {title}")

    return chapter_keywords_w2v, chapter_titles_w2v, model

chapter_keywords_w2v, chapter_titles_w2v, w2v_model = chapter_analysis_word2vec(chapters)

# Each title already starts with "Chapter N", so it is printed as-is rather
# than behind a second "Chapter N:" label.
for keywords, title in zip(chapter_keywords_w2v, chapter_titles_w2v):
    print(f"\n{title}")
    print(f"Key words: {keywords}")

Dict size: 1332

Chapter 1: Chapter 1: Either Fall Marked
Kye words: ['either', 'fall', 'marked', 'drink', 'happens', 'latitude', 'earth', 'larger', 'matter', 'occurred']

Chapter 2: Chapter 2: English Remembered Stay
Kye words: ['english', 'remembered', 'stay', 'tail', 'strange', 'besides', 'number', 'walk', 'speaking', 'directions']

Chapter 3: Chapter 3: Dry Tale English
Kye words: ['dry', 'tale', 'english', 'wanted', 'stay', 'temper', 'absurd', 'world', 'mouse', 'dodo']

Chapter 4: Chapter 4: Window Noticed Bark
Kye words: ['window', 'noticed', 'bark', 'together', 'foot', 'taking', 'effect', 'middle', 'pressed', 'face']

Chapter 5: Chapter 5: Eggs Serpents _What_
Kye words: ['eggs', 'serpents', '_what_', 'temper', 'keep', 'case', 'jaws', 'managed', 'number', 'several']

Chapter 6: Chapter 6: Creature Wood Vanished
Kye words: ['creature', 'wood', 'vanished', 'matter', 'help', 'yet.', 'nearer', 'either', 'extraordinary', 'themâ€”']

Chapter 7: Chapter 7: Answer Butter Live
Kye words:

The Word2vec method has demonstrated the ability to identify semantically rich and contextually specific words for each chapter, generating original and linguistically diverse titles. However, the algorithm showed a tendency to pick semantically vivid but plot-peripheral terms, which led to low-content titles that correlate poorly with the main events of the chapters and the key characters of the work. This indicates the limited applicability of the approach to accurate thematic analysis of narrative texts.

Finding the top 15 most used verbs in sentences with Alice,
including phrasal verbs

In [None]:
def analyze_alice_verbs_with_phrasal(chapters, top_n=15):
    """Count the verbs and phrasal verbs in sentences that mention Alice.

    Args:
        chapters: iterable of raw chapter texts.
        top_n: how many of the most frequent verbs to return.

    Returns:
        (top_verbs, all_alice_sentences): a list of (verb, count) pairs
        ordered by descending count, and the unmodified sentences that
        mention Alice.

    A verb followed by a particle, preposition or adverb is counted as one
    phrasal verb in its surface form (e.g. "went on"). Every other verb is
    counted by its lemma. Auxiliary verbs listed in ``stop_verbs`` are
    ignored in any inflected form.
    """
    grammar = r"""
    PHRASAL_VERB:
        {<VB.?><RP|IN|RB>}
    """
    # Creating a parser for highlighting syntactic groups
    chunker = nltk.RegexpParser(grammar)

    # Collecting all the sentences where Alice is mentioned
    all_alice_sentences = [
        sent
        for chapter in chapters
        for sent in sent_tokenize(chapter)
        if re.search(r'\balice\b', sent, re.IGNORECASE)
    ]

    stop_verbs = {'was', 'is', 'were', 'be', 'have', 'has', 'had', 'do', 'does', 'did', 'are', 'am'}
    negations = {'not'}
    verbs = []

    def _verb_lemma(word):
        return lemmatizer.lemmatize(word.lower(), pos='v')

    def _is_stop_verb(word):
        # Check the lemma too, so "been"/"being" -> "be" are filtered as well.
        return word.lower() in stop_verbs or _verb_lemma(word) in stop_verbs

    def _is_phrasal(leaves):
        # A chunk counts as a phrasal verb only if both parts are real words,
        # the head is not an auxiliary and the second part is not a negation
        # ("do n't", "was not").
        (verb, _), (particle, _) = leaves
        return (verb.isalpha() and particle.isalpha()
                and particle.lower() not in negations
                and not _is_stop_verb(verb))

    for i, sent in enumerate(all_alice_sentences):
        try:
            # Turn the typographic apostrophe into "'" so contractions are
            # split as "do" + "n't" instead of leaving a bogus verb "don".
            words = word_tokenize(sent.replace('\u2019', "'"))
            pos_tags = nltk.pos_tag(words)
            tree = chunker.parse(pos_tags)

            # Walk the top-level nodes while tracking token positions, so that
            # only the tokens inside a chunk are marked as used (and not every
            # identical (word, tag) pair elsewhere in the sentence).
            used_indices = set()
            position = 0
            for node in tree:
                if isinstance(node, nltk.Tree):
                    leaves = node.leaves()
                    if node.label() == 'PHRASAL_VERB' and _is_phrasal(leaves):
                        verbs.append(' '.join(word for word, tag in leaves).lower())
                        used_indices.update(range(position, position + len(leaves)))
                    position += len(leaves)
                else:
                    position += 1

            # Processing single verbs that are not part of a counted phrasal verb
            for j, (word, tag) in enumerate(pos_tags):
                if j in used_indices or not tag.startswith('VB'):
                    continue
                if not word.isalpha() or len(word) <= 1 or _is_stop_verb(word):
                    continue
                lemma = _verb_lemma(word)
                if lemma.isalpha() and len(lemma) > 1:
                    verbs.append(lemma)
        except Exception as e:
            # Best effort: report the first few failures and keep going.
            if i < 10:
                print(f"Error in sentence {i}: {e}")
            continue

    top_verbs = Counter(verbs).most_common(top_n)

    return top_verbs, all_alice_sentences

top_verbs_with_phrasal, alice_sentences = analyze_alice_verbs_with_phrasal(chapters)
print("Alice's 15 most frequent actions")
# One row per verb: the verb left-aligned in 20 columns, then its count.
for verb, count in top_verbs_with_phrasal:
    print(f"{verb:20} {count:4}")

Alice's 15 most frequent actions
say                   270
think                  65
know                   39
see                    34
begin                  30
go                     30
get                    29
make                   27
don                    27
take                   26
went on                23
tell                   23
be                     21
find                   20
seem                   20


Here we can see that the algorithm found the most frequent verbs. The list includes phrasal verbs such as "went on", but it also contains mistakes such as the word "don": most likely it is a fragment of "don't" left over after the apostrophe is split off during tokenization, rather than a form of the word "done".

Grouping the verbs used in sentences with Alice.
Using the Word2vec method and K-means clustering: each group has one central verb and a list of similar verbs

In [None]:
def analyze_verbs_word2vec(chapters, seed=42):
    """Cluster the verbs found in sentences that mention Alice.

    A Word2Vec model is trained on the cleaned Alice sentences. Every token
    tagged as a verb whose lemma is in the model vocabulary is collected,
    and the collected verbs are grouped by ``cluster_verbs_word2vec``.

    Args:
        chapters: iterable of raw chapter texts.
        seed: random seed passed to Word2Vec.

    Returns:
        (verb_groups, verbs, model): the mapping returned by
        ``cluster_verbs_word2vec``, the list of collected verb lemmas
        (with repetitions) and the trained model.
    """
    all_alice_sentences = [
        sent
        for chapter in chapters
        for sent in sent_tokenize(chapter)
        if re.search(r'\balice\b', sent, re.IGNORECASE)
    ]

    processed_sentences = []
    for sent in all_alice_sentences:
        words = [word for word in word_tokenize(sent.lower())
                 if word.isalpha() and word not in stop_words and len(word) > 2]
        if words:
            processed_sentences.append(words)

    # A fixed seed with a single worker thread keeps the embeddings, and thus
    # the clusters, repeatable between runs; gensim additionally requires a
    # fixed PYTHONHASHSEED for full determinism across interpreter launches.
    model = Word2Vec(
        sentences=processed_sentences,
        vector_size=100,
        window=5,
        min_count=2,
        workers=1,
        epochs=30,
        seed=seed
    )

    verbs = []
    stop_verbs = {'was', 'is', 'were', 'be', 'have', 'has', 'had', 'do', 'does', 'did'}
    for i, sent in enumerate(all_alice_sentences):
        try:
            words = word_tokenize(sent)
            pos_tags = nltk.pos_tag(words)

            for word, tag in pos_tags:
                if tag.startswith('VB'):
                    lemma = lemmatizer.lemmatize(word.lower(), pos='v')
                    if (lemma.isalpha() and len(lemma) > 1 and
                        lemma not in stop_verbs and lemma in model.wv):
                        verbs.append(lemma)
        except Exception as e:
            # Best effort, but no longer silent: report the first few failures.
            # Catching Exception (not a bare except) lets KeyboardInterrupt
            # stop the cell.
            if i < 10:
                print(f"Error in sentence {i}: {e}")
            continue

    verb_groups = cluster_verbs_word2vec(verbs, model)

    return verb_groups, verbs, model

# Groups verbs into semantically similar clusters by running K-means on their Word2Vec vectors
def cluster_verbs_word2vec(verbs, model, n_clusters=8):
    """Cluster verbs by their Word2Vec vectors and name each cluster.

    Verbs missing from ``model.wv`` are ignored and duplicates are collapsed.
    The number of clusters is ``n_clusters``, capped at the number of
    distinct verbs.

    Returns:
        A dict mapping a cluster id to ``(central_verb, member_verbs)``. The
        central verb is the member whose vector has the highest cosine
        similarity to the mean vector of the cluster. An empty dict is
        returned when no verb is in the model.
    """
    from sklearn.cluster import KMeans
    from sklearn.metrics.pairwise import cosine_similarity

    unique_verbs = list(set(v for v in verbs if v in model.wv))
    if len(unique_verbs) == 0:
        return {}

    verb_vectors = np.array([model.wv[verb] for verb in unique_verbs])
    cluster_count = min(n_clusters, len(unique_verbs))
    labels = KMeans(n_clusters=cluster_count, random_state=42).fit_predict(verb_vectors)

    # Collect the members of every cluster in order of first appearance.
    members_by_cluster = {}
    for verb, label in zip(unique_verbs, labels):
        members_by_cluster.setdefault(label, []).append(verb)

    cluster_centers = {}
    for label, members in members_by_cluster.items():
        if len(members) <= 1:
            # A lone verb is its own centre.
            cluster_centers[label] = (members[0], members)
            continue
        centroid = np.mean([model.wv[verb] for verb in members], axis=0)
        closeness = [
            cosine_similarity([model.wv[verb]], [centroid])[0][0]
            for verb in members
        ]
        cluster_centers[label] = (members[np.argmax(closeness)], members)

    return cluster_centers

verb_groups, all_verbs, w2v_model = analyze_verbs_word2vec(chapters)

# Print every cluster: its central verb first, then all of its members.
for center_verb, group_verbs in verb_groups.values():
    print(f"\nCentral verb: {center_verb}")
    print(f" Verbs: {', '.join(group_verbs)}")


Central verb: alarm
 Verbs: fall, upset, rush, lie, doubt, chance, hold, hurt, repeat, notice, ashamed, introduce, sound, twelve, drink, treacle, alarm, stay

Central verb: finish
 Verbs: appear, answer, queer, particular, give, stand, reply, explain, fear, manage, happen, box, surprise, sentence, finish, learn, run

Central verb: want
 Verbs: understand, wonder, sir, live, want, feel, speak, draw, remember, saw, begin, change, talk, call, hear, fan, suppose, close, sleep, keep, rest

Central verb: twice
 Verbs: walk, thoroughly, breathe, listen, beg, become, yer, twice, dare, play, help, beat, read, turn, set, dance, escape, fell

Central verb: face
 Verbs: grow, use, get, say, make, hurry, leave, face, afraid, dear, see, please, felt, think, look, tell, watch, put

Central verb: matter
 Verbs: move, matter, dream, seem, try, grin, mean, spread, lay, croquet, ask, eat, savage, believe, remark, dry, mind, allow, wish, care

Central verb: find
 Verbs: round, next, let, long, find, hand

An analysis of the verb clusterization results using the Word2vec and k-means method revealed significant limitations of this approach for semantic text analysis. The algorithm demonstrated a low clustering quality, expressed in the semantic heterogeneity of the groups. The problem is compounded by the incorrect definition of central words, where rare terms such as "alarm" or the adverb "twice" do not reflect the semantic core of their clusters. The method also failed to filter parts of speech by including non-verbs in groups, which indicates the disadvantages of data preprocessing. The results obtained demonstrate the fundamental limitations of the method when working with small text data rather than providing meaningful linguistic information.
