In [1]:
# CELL 1 - Install Dependencies

# Install topic modeling, NLP, and visualization libraries
!pip install -q gensim pyldavis nltk spacy

# Download spaCy's small English language model
# This provides tokenization, lemmatization, stopwords, etc.
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m58.2 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [2]:
# CELL 2 — Imports & Paths

# ---------------- BASIC UTILITIES ----------------
import os              # File and directory handling
import re              # Regular expressions for text cleaning
import pickle          # Save and load Python objects
import json            # Save topic labels in readable format
import warnings        # Control warning messages

# ---------------- DATA HANDLING ----------------
import pandas as pd    # DataFrames and CSV handling

# ---------------- NLP ----------------
import nltk            # Tokenizers (used indirectly)
import spacy           # Lemmatization, stopwords

# ---------------- TOPIC MODELING ----------------
from gensim import corpora, models
from gensim.models import CoherenceModel


# Path to dataset attached on Kaggle (READ ONLY)
DATASET_PATH = "/kaggle/input/mywic-dataset/Myiwc.txt"

# Path where all outputs will be saved (WRITE ENABLED)
SAVE_PATH = "/kaggle/working/LDA_Models"

# Create directory if it does not exist
# This ensures outputs survive after notebook execution
os.makedirs(SAVE_PATH, exist_ok=True)

print("Output directory ready:", SAVE_PATH)


  from google.cloud.aiplatform.utils import gcs_utils


Output directory ready: /kaggle/working/LDA_Models


In [None]:
# # to reuse model later

# # in new kaggle notebook
# import pickle

# with open("/kaggle/input/your-notebook-output/lda_model.pkl", "rb") as f:
#     lda_model = pickle.load(f)

In [3]:
# CELL 3 — Load Dataset

# Load the dataset (tab-separated file)
df = pd.read_csv(
    DATASET_PATH,
    sep="\t",             # Dataset is TSV
    on_bad_lines="skip"   # Skip corrupted lines safely
)

# Extract message text and remove null values
documents = df["Message"].dropna().tolist()

print(f"Total documents loaded: {len(documents)}")


Total documents loaded: 24870


In [4]:
# CELL 4 — NLP Setup

# Download NLTK tokenizer resources (quiet mode)
nltk.download("punkt", quiet=True)

# Load spaCy English model
# Disable parser and NER to reduce computation time
nlp = spacy.load(
    "en_core_web_sm",
    disable=["parser", "ner"]
)

print("spaCy NLP pipeline loaded successfully.")


spaCy NLP pipeline loaded successfully.


In [5]:
# CELL 5 — Preprocessing Function (FINAL, CORRECT)

def preprocess_spacy(text):
    """
    Cleans and preprocesses a single document:
    1. Removes URLs
    2. Keeps only English letters
    3. Lemmatizes words
    4. Removes stopwords
    5. Removes short and non-alphabetic tokens
    """

    # Convert to string (safety)
    text = str(text)

    # Remove URLs
    text = re.sub(r"http\S+|www\S+", "", text)

    # Remove numbers, symbols, and non-English characters
    text = re.sub(r"[^a-zA-Z\s]", " ", text)

    # Process text using spaCy
    doc = nlp(text)

    # Return clean tokens
    return [
        token.lemma_.lower()      # Lemmatized lowercase word
        for token in doc
        if token.is_alpha         # Only alphabetic tokens
        and not token.is_stop     # Remove stopwords
        and len(token.lemma_) > 2 # Remove very short words
    ]


# Apply preprocessing to all documents
processed_docs = [preprocess_spacy(doc) for doc in documents]

print("Text preprocessing completed.")


Text preprocessing completed.


In [6]:
# CELL 6 — Dictionary & Corpus

# Create dictionary (word → id mapping)
dictionary = corpora.Dictionary(processed_docs)

# Remove:
# - Words appearing in <5 documents (noise)
# - Words appearing in >50% documents (too common)
dictionary.filter_extremes(
    no_below=5,
    no_above=0.5
)

# Convert documents to Bag-of-Words representation
corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

print("Dictionary size:", len(dictionary))
print("Corpus creation successful.")


Dictionary size: 22866
Corpus creation successful.


In [7]:
# CELL 7 — Train LDA Model

# Number of topics to extract
NUM_TOPICS = 10

# Train LDA model using Gensim
lda_model = models.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=NUM_TOPICS,
    random_state=100,   # Reproducibility
    passes=10           # More passes = better convergence
)

print("LDA model training completed.")


LDA model training completed.


In [8]:
# NEW CELL 7.1 — PRINT THE 10 TOPICS
print("\n--- Top 10 Topics ---")

for topic_id in range(lda_model.num_topics):
    words = lda_model.show_topic(topic_id, topn=10)
    clean_words = [word for word, _ in words]
    print(f"Topic {topic_id}: {clean_words}")


--- Top 10 Topics ---
Topic 0: ['quot', 'woman', 'man', 'say', 'child', 'come', 'day', 'tell', 'wife', 'year']
Topic 1: ['quot', 'israel', 'war', 'israeli', 'say', 'iraq', 'attack', 'palestinian', 'kill', 'american']
Topic 2: ['quot', 'jesus', 'god', 'bible', 'say', 'qur', 'word', 'christians', 'believe', 'book']
Topic 3: ['god', 'human', 'earth', 'life', 'man', 'create', 'spirit', 'heaven', 'time', 'world']
Topic 4: ['site', 'amp', 'read', 'good', 'time', 'link', 'help', 'find', 'learn', 'arabic']
Topic 5: ['prophet', 'say', 'ibn', 'allaah', 'abu', 'hadith', 'muhammad', 'peace', 'prayer', 'day']
Topic 6: ['quot', 'people', 'muslim', 'muslims', 'world', 'country', 'islamic', 'islam', 'state', 'religion']
Topic 7: ['taliban', 'pakistan', 'afghanistan', 'money', 'food', 'disease', 'oil', 'medical', 'pig', 'health']
Topic 8: ['islam', 'muslim', 'know', 'think', 'muslims', 'people', 'religion', 'don', 'like', 'good']
Topic 9: ['allah', 'quot', 'say', 'good', 'people', 'love', 'heart', 'pr

In [9]:
# CELL 8 — Compute & Save Coherence Score

# Initialize coherence model using C_v metric
coherence_model = CoherenceModel(
    model=lda_model,
    texts=processed_docs,
    dictionary=dictionary,
    coherence="c_v"
)

# Compute coherence score
coherence_score = coherence_model.get_coherence()

print(f"Coherence Score (C_v): {coherence_score:.4f}")


# Save coherence score for reporting and reproducibility
with open(f"{SAVE_PATH}/coherence_score.txt", "w") as f:
    f.write(f"Coherence Score (C_v): {coherence_score:.4f}")


Coherence Score (C_v): 0.4734


In [10]:
# NEW CELL 8.1 — TOPIC LABEL DEFINITION

topic_labels = {
    0: "Family & Social Life",
    1: "Middle East Geopolitics",
    2: "Interfaith Dialogue (Bible/Quran)",
    3: "Spirituality & Theology",
    4: "Web Resources & Arabic Learning",
    5: "Hadith & Prophetic Teachings",
    6: "Global Muslim Community (Ummah)",
    7: "Regional Conflict & Public Health",
    8: "Forum Discussion & General Opinion",
    9: "Religious Devotion & Morality"
}

print("Custom topic labels assigned.")


Custom topic labels assigned.


In [11]:
# NEW CELL 8.2 — DOCUMENT CATEGORIZATION (IMPORTANT)

categorized_docs = []

for i in range(len(corpus)):
    # Get topic probability distribution for document
    topic_distribution = lda_model[corpus[i]]

    if not topic_distribution:
        continue

    # Select topic with highest probability
    dominant_topic = max(topic_distribution, key=lambda x: x[1])

    categorized_docs.append({
        "Original Text": documents[i],
        "Category ID": dominant_topic[0],
        "Category Label": topic_labels.get(dominant_topic[0], "Unknown"),
        "Probability": round(dominant_topic[1], 4)
    })

# Convert results to DataFrame
df_results = pd.DataFrame(categorized_docs)

print("Document categorization completed.")
df_results.head()


Document categorization completed.


Unnamed: 0,Original Text,Category ID,Category Label,Probability
0,I am not sure either but I think there is a ha...,9,Religious Devotion & Morality,0.4386
1,The Virtues Of SalaamThe Islamic GreetingCompi...,5,Hadith & Prophetic Teachings,0.5936
2,A ssalamu A laikumThe Prophet (may Allah's ble...,9,Religious Devotion & Morality,0.4821
3,Let's Shine Together - Hand in Hand&quot;The w...,3,Spirituality & Theology,0.4039
4,[updated:LAST EDITED ON 14-07-02 AT 09:20&amp;...,0,Family & Social Life,0.4871


In [12]:
# NEW CELL 8.3 — SAVE CATEGORIZATION RESULTS

df_results.to_csv(
    f"{SAVE_PATH}/categorized_documents.csv",
    index=False
)

print("Categorized documents saved to disk.")


Categorized documents saved to disk.


In [13]:
# CELL 9 — Save Trained Model & Supporting Files

# Save LDA model
with open(f"{SAVE_PATH}/lda_model.pkl", "wb") as f:
    pickle.dump(lda_model, f)

# Save dictionary
with open(f"{SAVE_PATH}/dictionary.pkl", "wb") as f:
    pickle.dump(dictionary, f)

# Save corpus
with open(f"{SAVE_PATH}/corpus.pkl", "wb") as f:
    pickle.dump(corpus, f)

# Save original documents
with open(f"{SAVE_PATH}/documents.pkl", "wb") as f:
    pickle.dump(documents, f)

# Save topic labels
with open(f"{SAVE_PATH}/topic_labels.json", "w") as f:
    json.dump(topic_labels, f, indent=4)

print("Model and metadata saved successfully.")



Model and metadata saved successfully.


In [14]:
# CELL 10 — pyLDAvis

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Suppress non-critical warnings for clean output
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# Enable notebook visualization
pyLDAvis.enable_notebook()

# Prepare visualization
vis_data = gensimvis.prepare(
    lda_model,
    corpus,
    dictionary
)

vis_data
