# Experimentation Notebook for Single Documents

This notebook will be used for experimenting with various methods on individual documents. The methods will be abstracted later.

### Document Breakup

In [5]:
import helper
import docx

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
import re

In [8]:
import nltk

# Fetch every NLTK resource this notebook relies on, not just the sentence tokenizer:
#   punkt     -> nltk.sent_tokenize (used by get_sentences)
#   stopwords -> stopwords.words('english') (used by clean_text)
#   wordnet   -> WordNetLemmatizer (used by clean_text)
# nltk.download skips packages that are already up to date, so re-running this cell is cheap.
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/virajkacker/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [6]:
def get_sentences(file_path):
    """Return all sentences of a .docx document as one flat list.

    The document at ``file_path`` is opened with python-docx, and the text of
    each paragraph is split with ``nltk.sent_tokenize``. Sentences are returned
    in document order; paragraphs that yield no sentences contribute nothing.
    """
    document = docx.Document(file_path)

    # Flatten paragraph -> sentences into a single list, keeping document order
    return [
        sentence
        for paragraph in document.paragraphs
        for sentence in nltk.sent_tokenize(paragraph.text)
    ]

In [2]:
# Relative path to the .docx report that the cells below load and analyse
file_path = '../congress_report/18th-CCP-Congress-Report.docx'

In [3]:
# Paragraphs of the document, loaded through the project-local helper module.
# helper.get_paragraphs is not shown in this notebook; presumably it returns a
# list of paragraph strings (each item is later passed to clean_text) — confirm.
paragraphs = helper.get_paragraphs(file_path)

In [9]:
# Split paragraphs into sentences
# (get_sentences opens the document itself and tokenizes each paragraph with NLTK)
sentences = get_sentences(file_path)

### Data Cleaning Methods

In [12]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

In [13]:
from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stop_words():
    """Load the NLTK English stop-word list once and return it as a frozenset."""
    return frozenset(stopwords.words('english'))


@lru_cache(maxsize=1)
def _shared_lemmatizer():
    """Create a single WordNetLemmatizer that every clean_text call reuses."""
    return WordNetLemmatizer()


def clean_text(text):
    """Normalise a piece of text into a space-separated string of lemmas.

    Steps, in order:
      1. lowercase the text;
      2. replace every character outside ``a-z`` with a space;
      3. split on whitespace and drop English stop words;
      4. lemmatize each remaining word with WordNet;
      5. drop lemmas shorter than 3 characters.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned words joined by single spaces; an empty string when
        nothing survives the filtering.

    The stop-word set and the lemmatizer are built on first use and cached,
    because this function is called once per sentence/paragraph and reloading
    the corpus on every call is wasted work.
    """
    # Lowercase before stripping so capital letters are kept rather than blanked out
    lowered = text.lower()
    letters_only = re.sub(r'[^a-z]', ' ', lowered)

    stop_words = _english_stop_words()
    lemmatize = _shared_lemmatizer().lemmatize

    # Stop words are removed before lemmatizing; the length filter runs on the lemmas
    lemmas = (lemmatize(word) for word in letters_only.split() if word not in stop_words)
    return ' '.join(lemma for lemma in lemmas if len(lemma) > 2)

In [16]:
# Clean every sentence and discard the ones that come back empty
cleaned_sentences = [
    cleaned
    for cleaned in map(clean_text, sentences)
    if cleaned != ''
]

In [37]:
# Clean every paragraph and discard the ones that come back empty
cleaned_paragraphs = [
    cleaned
    for cleaned in map(clean_text, paragraphs)
    if cleaned != ''
]

### BERTopic Test

In [34]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model shared by the embedding step and by BERTopic below
embedding_model = SentenceTransformer("all-mpnet-base-v2")

# Embed the cleaned sentences once so later cells can reuse the vectors.
# NOTE(review): the input here is lemmatized, stop-word-free text; BERTopic's
# FAQ advises embedding unprocessed text — consider comparing with raw sentences.
embeddings = embedding_model.encode(
    cleaned_sentences,
    show_progress_bar=True,
)

.gitattributes: 100%|██████████| 1.18k/1.18k [00:00<00:00, 832kB/s]
1_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 182kB/s]
README.md: 100%|██████████| 10.6k/10.6k [00:00<00:00, 7.71MB/s]
config.json: 100%|██████████| 571/571 [00:00<00:00, 584kB/s]
config_sentence_transformers.json: 100%|██████████| 116/116 [00:00<00:00, 90.1kB/s]
data_config.json: 100%|██████████| 39.3k/39.3k [00:00<00:00, 18.8MB/s]
pytorch_model.bin: 100%|██████████| 438M/438M [00:31<00:00, 13.9MB/s] 
sentence_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 18.0kB/s]
special_tokens_map.json: 100%|██████████| 239/239 [00:00<00:00, 122kB/s]
tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 13.9MB/s]
tokenizer_config.json: 100%|██████████| 363/363 [00:00<00:00, 127kB/s]
train_script.py: 100%|██████████| 13.1k/13.1k [00:00<00:00, 4.07MB/s]
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 17.2MB/s]
modules.json: 100%|██████████| 349/349 [00:00<00:00, 263kB/s]
Batches: 100%|██████████| 2

In [40]:
from umap import UMAP
from hdbscan import HDBSCAN

# Dimensionality reduction to 5 components; random_state is pinned so reruns match
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)

# Clustering model; a cluster must hold at least 20 points
hdbscan_model = HDBSCAN(
    min_cluster_size=20,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

In [41]:
# 2-D projection of the sentence embeddings, intended for visualization.
# Same UMAP settings as umap_model except for the number of components.
reduced_embeddings = UMAP(
    n_neighbors=15,
    n_components=2,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
).fit_transform(embeddings)

In [42]:
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, TextGeneration

# Additional topic representations computed alongside BERTopic's default one
keybert = KeyBERTInspired()
mmr = MaximalMarginalRelevance(diversity=0.3)

# Text generation with Llama 2 (disabled: `generator` and `prompt` are not defined here)
# llama2 = TextGeneration(generator, prompt=prompt)

# Label -> representation model; each label becomes a column in get_topic_info()
representation_model = dict(
    KeyBERT=keybert,
    # Llama2=llama2,
    MMR=mmr,
)

In [43]:
from bertopic import BERTopic

# Assemble BERTopic from the sub-models configured earlier in this section
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    representation_model=representation_model,
)

# Fit on the cleaned sentences, reusing the precomputed embeddings
topics, probs = topic_model.fit_transform(cleaned_sentences, embeddings=embeddings)

In [44]:
# Show topics: one row per topic with its size, generated name, and each
# configured representation (the KeyBERT and MMR columns come from representation_model)
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,MMR,Representative_Docs
0,-1,187,-1_social_people_socialist_law,"[social, people, socialist, law, security, imp...","[socialist, management, organization, governme...","[social, people, socialist, law, security, imp...",[must intensify effort improve basic public se...
1,0,248,0_party_improve_people_official,"[party, improve, people, official, system, lev...","[governance, democracy, reform, democratic, le...","[party, improve, people, official, system, lev...",[expand coverage party organization party work...
2,1,154,1_chinese_socialism_characteristic_china,"[chinese, socialism, characteristic, china, de...","[zedong, marxism, socialism, mao, socialist, c...","[chinese, socialism, characteristic, china, de...",[path socialism chinese characteristic way rea...
3,2,54,2_rural_income_urban_distribution,"[rural, income, urban, distribution, developme...","[urbanization, industrialization, agriculture,...","[rural, income, urban, distribution, developme...",[promote coordinated development social securi...
4,3,42,3_taiwan_china_kong_macao,"[taiwan, china, kong, macao, hong, common, two...","[hong, taiwan, mainland, macao, sovereignty, c...","[taiwan, china, kong, macao, hong, common, two...",[central government also firmly support chief ...
5,4,38,4_ecological_resource_environment_consumption,"[ecological, resource, environment, consumptio...","[ecological, environmental, ecosystem, conserv...","[ecological, resource, environment, consumptio...",[move faster set system ecological progress im...
6,5,36,5_international_world_cooperation_global,"[international, world, cooperation, global, co...","[interventionism, globalization, prosperity, c...","[international, world, cooperation, global, co...",[develop securing peaceful international envir...
7,6,27,6_innovation_technology_research_science,"[innovation, technology, research, science, te...","[innovation, technology, implement, develop, t...","[innovation, technology, research, science, te...",[speed research development application new te...
8,7,24,7_cultural_public_sector_service,"[cultural, public, sector, service, people, fl...","[cultural, culture, industry, promote, art, pr...","[cultural, public, sector, service, people, fl...",[cultural work created system public cultural ...
9,8,23,8_military_armed_force_defense,"[military, armed, force, defense, national, ne...","[army, military, modernization, strengthening,...","[military, armed, force, defense, national, ne...",[follow guideline strengthening national defen...


### Basic Topic Modeling

In [31]:
from bertopic import BERTopic
from umap import UMAP

# Baseline run: BERTopic defaults apart from a smaller minimum topic size.
# The UMAP settings below mirror BERTopic's built-in default reducer; the only
# addition is random_state, because an unseeded UMAP gives different topics on
# every run and the tables below could not be reproduced.
# NOTE(review): this cell rebinds `topic_model` and `topics` from the
# "BERT Topic Test" section, so the cells that follow inspect this baseline model.
topic_model = BERTopic(
    min_topic_size=5,
    umap_model=UMAP(
        n_neighbors=15,
        n_components=5,
        min_dist=0.0,
        metric='cosine',
        low_memory=False,
        random_state=42,
    ),
)
topics, probabilities = topic_model.fit_transform(cleaned_sentences, embeddings)

In [32]:
# Retrieve the most frequent topics (first 10 rows of the topic/count table)
topic_model.get_topic_freq().head(10)

Unnamed: 0,Topic,Count
0,0,807
1,1,26


In [33]:
# Per-topic summary (count, name, representation, representative docs) of the model fitted in this section
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,807,0_party_people_development_system,"[party, people, development, system, chinese, ...",[party member improve party spirit ensure unit...
1,1,26,1_military_force_defense_armed,"[military, force, defense, armed, national, ne...",[follow guideline strengthening national defen...


In [20]:
# Render the topic visualization.
# NOTE: the recorded run failed with "Mime type rendering requires nbformat>=4.2.0
# but it is not installed" — install nbformat in the kernel's environment before re-running.
topic_model.visualize_topics()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed