In [1]:
# Mount Google Drive at /content/drive; later cells save their outputs under
# /content/drive/MyDrive/LDA_Models.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import os

# Destination folder (on the mounted Drive) for everything this notebook saves.
save_path = '/content/drive/MyDrive/LDA_Models'

# Remember whether the folder was already there only so the right message can be
# printed. Creation itself is delegated to makedirs(exist_ok=True): that removes
# the check-then-create race, and it raises FileExistsError if the path exists
# but is a regular file instead of silently reporting "already exists".
folder_already_present = os.path.isdir(save_path)
os.makedirs(save_path, exist_ok=True)

if folder_already_present:
    print(f"Folder '{save_path}' already exists.")
else:
    print(f"Folder '{save_path}' created.")

Folder '/content/drive/MyDrive/LDA_Models' already exists.


In [8]:
!pip install gensim pyldavis nltk spacy

# Download the small English model for spaCy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m30.0 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [3]:
from google.colab import files
import io

# Open Colab's upload widget. The result is keyed by uploaded filename; the
# training cell reads the file's bytes back out of it via uploaded[file_name].
print("Please upload your .txt file:")
uploaded = files.upload()

# An empty result (e.g. the dialog was dismissed) would otherwise surface as an
# opaque IndexError when picking the first filename, so fail with a clear message.
if not uploaded:
    raise RuntimeError("No file was uploaded. Re-run this cell and choose a .txt file.")

# Only the first uploaded file is used by the rest of the notebook.
file_name = next(iter(uploaded))
print(f"\nSuccessfully uploaded: {file_name}")

Please upload your .txt file:


Saving Ansar1.txt to Ansar1.txt

Successfully uploaded: Ansar1.txt


In [7]:
# --- Import all necessary libraries ---
import pandas as pd
import re
import io
import pickle
import os
import spacy
from gensim import corpora, models
import nltk

# --- 1. Download NLTK assets (just in case) ---
print("Downloading NLTK assets...")
for nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(nltk_resource, quiet=True)

# --- 2. Load the spaCy Model ---
# The parser and NER components are switched off at load time.
print("Loading spaCy model...")
disabled_components = ['parser', 'ner']
nlp = spacy.load('en_core_web_sm', disable=disabled_components)
print("spaCy model loaded.")

# --- 3. Load Data ---
# Relies on 'uploaded' and 'file_name' being in memory from the upload cell (Step 4).
# The uploaded bytes are parsed as tab-separated text (bad lines are skipped) and
# the non-null values of the 'Message' column become the list of documents.
try:
    df = pd.read_csv(io.BytesIO(uploaded[file_name]), sep='\t', on_bad_lines='skip')
    documents = df['Message'].dropna().tolist()
    print(f"\nLoaded {len(documents)} documents from {file_name}.")
except Exception as e:
    print(f"Error reading file: {e}. Please re-run Step 4 (Upload).")
    # Re-raise so the cell stops here. Carrying on would either crash a few lines
    # later with a NameError on 'documents', or silently train on a stale
    # 'documents' list left in memory by an earlier run.
    raise

# --- 4. CORRECTED Preprocessing Function (using spaCy) ---
def preprocess_spacy(text, nlp_pipeline=None):
    """Turn one raw message into a list of lowercase lemmas.

    Steps: coerce the input to ``str``, strip URLs, replace every character
    that is not an ASCII letter or whitespace with a space, run the text
    through the spaCy pipeline, then keep tokens that are alphabetic, are not
    stop words, and whose lemma is longer than two characters.

    Args:
        text: The raw document. Any object is accepted; it is passed through
            ``str()`` first.
        nlp_pipeline: Optional callable mapping a string to an iterable of
            token-like objects exposing ``is_alpha``, ``is_stop`` and
            ``lemma_``. When omitted, the module-level ``nlp`` model is used,
            so existing one-argument calls behave as before.

    Returns:
        list[str]: Lowercased lemmas of the surviving tokens, in document order.
    """
    if nlp_pipeline is None:
        # Resolved at call time so the function picks up the notebook's model.
        nlp_pipeline = nlp

    text = str(text)

    # 1. Remove URLs.
    # The previous pattern contained the typo 'httpsS+' (missing backslash, and
    # unreachable behind 'http\S+'), and 'http\S+' / 'www\S+' also deleted
    # ordinary words that merely start with those letters (e.g. "httpd").
    # Require the '://' scheme separator or the 'www.' dot instead.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # 2. Keep only English letters and whitespace; everything else becomes a space
    #    so that adjacent words are not glued together.
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)

    # 3. Tokenize/lemmatize and keep alphabetic, non-stop-word tokens whose lemma
    #    has more than two characters, lowercased.
    return [
        token.lemma_.lower()
        for token in nlp_pipeline(text)
        if token.is_alpha and not token.is_stop and len(token.lemma_) > 2
    ]

# --- 5. Apply Preprocessing ---
# One token list per document, in the same order as 'documents'.
print("Preprocessing all documents with (Corrected) spaCy function...")
processed_docs = list(map(preprocess_spacy, documents))
print("Preprocessing complete.")

# --- 6. Create Dictionary and Corpus ---
print("Creating dictionary and corpus...")
dictionary = corpora.Dictionary(processed_docs)
# Trim the vocabulary with filter_extremes (thresholds: no_below=5, no_above=0.5)
# before converting each token list into its bag-of-words form.
dictionary.filter_extremes(no_below=5, no_above=0.5)
corpus = list(map(dictionary.doc2bow, processed_docs))
print("Dictionary and corpus created.")

# --- 7. Build and Train the LDA Model ---
# Training is skipped when there is no corpus or every bag-of-words is empty.
if not corpus or all(not doc for doc in corpus):
    print("\n--- ERROR: Corpus is empty after preprocessing. ---")
else:
    NUM_TOPICS = 10
    print(f"Training LDA model with {NUM_TOPICS} topics...")

    # random_state is pinned to a constant seed.
    lda_model = models.LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=NUM_TOPICS,
        random_state=100,
        passes=10
    )
    print("Model training complete.")

    # --- 8. Print the Topics ---
    print(f"\n--- Top {NUM_TOPICS} Topics ---")
    topics = lda_model.print_topics(num_words=10)
    for topic in topics:
        print(topic)

    # --- 9. Save Model, Dictionary, and Corpus to Google Drive ---
    print("\n--- Saving outputs to Google Drive ---")

    # Fall back to the default folder if the Step 2 cell was not run this session.
    if 'save_path' not in globals():
        save_path = '/content/drive/MyDrive/LDA_Models'

    # Ensure the destination exists. Without this, the fallback path above (or a
    # folder removed since Step 2) makes the first open() fail with
    # FileNotFoundError after the model has already been trained.
    os.makedirs(save_path, exist_ok=True)

    # Output files are named after the uploaded file, minus its extension.
    base_name = os.path.splitext(file_name)[0]

    model_path = os.path.join(save_path, f'{base_name}_lda.model')
    dict_path = os.path.join(save_path, f'{base_name}_dictionary.dict')
    corpus_path = os.path.join(save_path, f'{base_name}_corpus.pkl')
    docs_path = os.path.join(save_path, f'{base_name}_documents.pkl')

    # Every artifact is written with pickle.dump, whatever its file extension.
    artifacts = [
        (model_path, lda_model),
        (dict_path, dictionary),
        (corpus_path, corpus),
        (docs_path, documents),
    ]
    for artifact_path, artifact in artifacts:
        with open(artifact_path, 'wb') as f:
            pickle.dump(artifact, f)

    print(f"Successfully saved model files to: {save_path}")

Downloading NLTK assets...
Loading spaCy model...
spaCy model loaded.

Loaded 28866 documents from Ansar1.txt.
Preprocessing all documents with (Corrected) spaCy function...
Preprocessing complete.
Creating dictionary and corpus...
Dictionary and corpus created.
Training LDA model with 10 topics...
Model training complete.

--- Top 10 Topics ---
(0, '0.045*"somalia" + 0.027*"somali" + 0.022*"government" + 0.016*"mogadishu" + 0.016*"say" + 0.014*"shabaab" + 0.011*"town" + 0.011*"group" + 0.011*"islamist" + 0.011*"shabab"')
(1, '0.023*"islamic" + 0.013*"people" + 0.013*"muslims" + 0.012*"jihad" + 0.011*"muslim" + 0.011*"group" + 0.010*"state" + 0.010*"islam" + 0.008*"god" + 0.008*"country"')
(2, '0.034*"pakistan" + 0.034*"taliban" + 0.022*"say" + 0.019*"attack" + 0.019*"militant" + 0.015*"pakistani" + 0.012*"official" + 0.011*"government" + 0.010*"kill" + 0.009*"army"')
(3, '0.038*"israel" + 0.031*"israeli" + 0.030*"hamas" + 0.028*"gaza" + 0.027*"pirate" + 0.020*"palestinian" + 0.018*"sa

In [None]:
import pandas as pd
import io

# --- 1. Check if Model is in Memory ---
# The rest of this cell needs 'lda_model', 'corpus' and 'documents'. If any is
# missing, print the guidance and then stop the cell with a NameError, instead of
# printing the guidance and running on into a less helpful failure further down.
print("Checking for trained model...")
required_names = ('lda_model', 'corpus', 'documents')
missing_names = [name for name in required_names if name not in globals()]
if missing_names:
    print("\n--- ERROR ---")
    print("Could not find the 'lda_model', 'corpus', or 'documents' variables.")
    print("Please run Step 5 (the main training cell) again.")
    print("OR, if you are in a new session, run Step 6 (the 'Load and Go' cell).")
    raise NameError(f"Missing required variable(s): {', '.join(missing_names)}")
print("Model found in memory. Proceeding...")

# --- 2. Extract Keywords (for quick review) ---
print("\n--- Topic Keywords (Review) ---")
for topic_id in range(lda_model.num_topics):
    # show_topic gives (word, probability) pairs; only the top 7 words are shown.
    top_words = [word for word, _ in lda_model.show_topic(topic_id, topn=7)]
    print(f"Topic {topic_id}: {top_words}")


# --- 3. YOUR TASK: Define Your Topic Labels Here ---
# Human-readable names for the topics, listed in topic-id order: the first entry
# labels topic 0, the second labels topic 1, and so on.
# Edit the strings below to rename a topic.

print("\n--- Using Your Custom Labels ---")
topic_labels = dict(enumerate([
    "Somalia & Shabaab News",
    "General Islamic Topics",
    "Pakistan & Taliban Conflict",
    "Israel-Palestine Conflict",
    "Iraq Conflict & Attacks",
    "Personal/Court Cases",
    "Afghanistan Mujahideen Reports",
    "Global/Nuclear Weapons",
    "Forum Chat & Religious Posts",
    "US/Afghanistan Policy (Obama)",
]))
print(f"Using these labels: {topic_labels}\n")

# --- 4. Categorize Every Document ---
# For each document, ask the model for its topic distribution, take the most
# probable topic, and record the text together with that topic's id, label and
# probability (formatted as a percentage string).
print("Categorizing all documents...")

categorized_docs = []
result_columns = ['Original Text', 'Category ID', 'Category Label', 'Probability']

try:
    # Texts and bag-of-words vectors are paired by position, so a length mismatch
    # (e.g. objects loaded from different runs) would attach labels to the wrong text.
    if len(documents) != len(corpus):
        raise ValueError(
            f"'documents' ({len(documents)}) and 'corpus' ({len(corpus)}) are not aligned."
        )

    for original_text, bow in zip(documents, corpus):
        # A document with an empty bag-of-words (all of its tokens were filtered
        # out) gives the model nothing to go on, so any "dominant" topic reported
        # for it would be arbitrary. Leave such documents uncategorized.
        if not bow:
            continue

        topic_distribution = lda_model[bow]

        if not topic_distribution:
            continue

        # max() returns the first pair with the highest probability, which is the
        # same pair that sorting in descending order and taking element 0 yields.
        topic_id, topic_prob = max(topic_distribution, key=lambda pair: pair[1])

        # Get your human-readable label
        topic_label = topic_labels.get(topic_id, "Unknown Topic")

        categorized_docs.append({
            'Original Text': original_text,
            'Category ID': topic_id,
            'Category Label': topic_label,
            'Probability': f"{topic_prob:.2%}"
        })

    print("Categorization complete.")

    # --- 5. Create and Show a DataFrame of the Results ---
    # Passing the column names explicitly keeps the frame well-formed (and the
    # summary below working) even when no document could be categorized.
    df_results = pd.DataFrame(categorized_docs, columns=result_columns)

    print("\n--- Sample of Categorized Documents ---")
    # Show the first 5 results with the new labels
    print(df_results.head())

    print("\n--- Category Summary (with your labels) ---")
    # Show how many documents fall into each category
    print(df_results['Category Label'].value_counts())

except Exception as e:
    print(f"\nAn unexpected error occurred: {e}")
    print("Please ensure Step 5 or 6 ran successfully.")
    # Re-raise so the traceback stays visible and a stale 'df_results' from an
    # earlier run is not mistaken for the outcome of this one.
    raise

Checking for trained model...
Model found in memory. Proceeding...

--- Topic Keywords (Review) ---
Topic 0: ['somalia', 'somali', 'government', 'mogadishu', 'say', 'shabaab', 'town']
Topic 1: ['islamic', 'people', 'muslims', 'jihad', 'muslim', 'group', 'state']
Topic 2: ['pakistan', 'taliban', 'say', 'attack', 'militant', 'pakistani', 'official']
Topic 3: ['israel', 'israeli', 'hamas', 'gaza', 'pirate', 'palestinian', 'say']
Topic 4: ['kill', 'say', 'police', 'attack', 'soldier', 'wound', 'bomb']
Topic 5: ['say', 'man', 'year', 'tell', 'woman', 'know', 'court']
Topic 6: ['mujahideen', 'kill', 'emirate', 'afghanistan', 'terrorist', 'report', 'islamic']
Topic 7: ['weapon', 'world', 'war', 'nuclear', 'les', 'cia', 'russia']
Topic 8: ['allah', 'brother', 'quote', 'post', 'mujahideen', 'know', 'originally']
Topic 9: ['say', 'afghanistan', 'country', 'force', 'troop', 'government', 'military']

--- Using Your Custom Labels ---
Using these labels: {0: 'Somalia & Shabaab News', 1: 'General Is

In [10]:
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
import warnings

print("Preparing visualization... (this may take a moment)")

# --- 1. Check if Model is in Memory ---
# The visualization needs 'lda_model', 'corpus' and 'dictionary'. If any is
# missing, print the guidance and then stop the cell with a NameError, instead of
# printing the guidance and running on into a less helpful failure further down.
required_names = ('lda_model', 'corpus', 'dictionary')
missing_names = [name for name in required_names if name not in globals()]
if missing_names:
    print("\n--- ERROR ---")
    print("Could not find 'lda_model', 'corpus', or 'dictionary'.")
    print("Please run Step 5 (Train) or Step 6 (Load) first.")
    raise NameError(f"Missing required variable(s): {', '.join(missing_names)}")
print("Model, corpus, and dictionary found in memory.")

# --- 2. Enable the visualization for Colab ---
pyLDAvis.enable_notebook()

# --- 3. Prepare the Visualization Data ---
# Build the chart data from the model, corpus and dictionary. DeprecationWarning
# is muted only inside this 'with' block, to keep the cell output clean.
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    vis_data = gensimvis.prepare(lda_model, corpus, dictionary)

# --- 4. Display the Chart ---
# The bare 'vis_data' expression has to stay the final line of the cell: that is
# what makes the notebook render the interactive chart.
print("Displaying interactive chart...")
vis_data

Preparing visualization... (this may take a moment)
Model, corpus, and dictionary found in memory.
Displaying interactive chart...


  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)


  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
