# Text Mining - Final Report - Topic Extraction/Modeling
Due 7 June by midnight.

In [1]:
# Path of the raw blog XML directory. It is only set to a real location in the
# non-Colab branch below; on Colab the full-dataset download is commented out,
# so this stays '' and only the pre-processed pickles are available.
directory_path = ''
# Setup for Colab
if 'google.colab' in str(get_ipython()):
  print('Running on CoLab, downloading all the data')
  # Download the full dataset
  # !wget -nc https://github.com/tommcamm/aut-text-mining/raw/main/Assignment/code/Dataset/Assignment2BlogData.zip -P Dataset # full dataset
  # !unzip ./Dataset/Assignment2BlogData.zip -d ./Dataset -q # unzip the dataset
  # directory_path = './Dataset/blogs'

  # Download the pre-processed datasets
  # -nc skips the download when the file is already present, -P is the directory where the file will be placed
  !wget -nc https://github.com/tommcamm/aut-text-mining/raw/main/Assignment/code/PreProcessed/students_preprocessed.pkl.gz -P PreProcessed # students pre-processed
  !wget -nc https://github.com/tommcamm/aut-text-mining/raw/main/Assignment/code/PreProcessed/under_20s_preprocessed.pkl.gz -P PreProcessed # under 20s pre-processed
  !wget -nc https://github.com/tommcamm/aut-text-mining/raw/main/Assignment/code/PreProcessed/females_preprocessed.pkl.gz -P PreProcessed # females pre-processed
  !wget -nc https://github.com/tommcamm/aut-text-mining/raw/main/Assignment/code/PreProcessed/males_preprocessed.pkl.gz -P PreProcessed # males pre-processed
  !wget -nc https://github.com/tommcamm/aut-text-mining/raw/main/Assignment/code/PreProcessed/over_20s_preprocessed.pkl.gz -P PreProcessed # over 20s pre-processed
  !wget -nc https://github.com/tommcamm/aut-text-mining/raw/main/Assignment/code/PreProcessed/everyone_preprocessed.pkl.gz -P PreProcessed # Everyone pre-processed

  # Download the test data - two files
  !wget -nc https://raw.githubusercontent.com/tommcamm/aut-text-mining/main/Assignment/code/TestDir/23676.male.33.Technology.Scorpio.xml -P TestDir
  !wget -nc https://raw.githubusercontent.com/tommcamm/aut-text-mining/main/Assignment/code/TestDir/5114.male.25.indUnk.Scorpio.xml -P TestDir

  # NOTE(review): the version is not pinned, so results may change between runs of this notebook
  !pip install bertopic
else:
  print('Not running on CoLab, skipping download')
  # For this step I assume the data is already there
  directory_path = './Assignment2BlogData/blogs'

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: ./Dataset/blogs/4017553.male.27.indUnk.Sagittarius.xml  
  inflating: ./Dataset/blogs/4017562.female.14.Arts.Virgo.xml  
  inflating: ./Dataset/blogs/4017734.male.41.Architecture.Aries.xml  
  inflating: ./Dataset/blogs/4018091.female.16.Student.Sagittarius.xml  
  inflating: ./Dataset/blogs/4018129.male.24.Technology.Virgo.xml  
  inflating: ./Dataset/blogs/4018252.male.23.RealEstate.Pisces.xml  
  inflating: ./Dataset/blogs/4018365.male.16.Student.Capricorn.xml  
  inflating: ./Dataset/blogs/4018435.female.17.Technology.Libra.xml  
  inflating: ./Dataset/blogs/4018447.female.26.Arts.Cancer.xml  
  inflating: ./Dataset/blogs/4018473.female.17.Student.Leo.xml  
  inflating: ./Dataset/blogs/4018510.female.26.indUnk.Sagittarius.xml  
  inflating: ./Dataset/blogs/4018635.male.27.Education.Libra.xml  
  inflating: ./Dataset/blogs/4018637.male.16.Student.Gemini.xml  
  inflating: ./Dataset/blogs/4018669.female.34.

## Data cleaning
The following steps will be applied to the dataset to ensure it is cleaned.
1. Remove Non-ASCII Characters: Ensures text is ASCII encoded.
2. Remove Punctuation: Removes any punctuation marks.
3. Lowercase Conversion: Converts all text to lowercase.
4. Remove Stopwords: Removes common stopwords that do not contribute to the meaning of the text.
5. Tokenization: Splits text into individual words.
6. Lemmatization: Reduces words to their base or root form.

In [5]:
import concurrent.futures
import html
import os
import re

import chardet
import nltk
import spacy
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm

# Download necessary NLTK data
# NOTE(review): the pre-processing functions defined in this cell rely on spaCy only;
# confirm whether these NLTK resources are still needed elsewhere.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
# this command must be run before: python -m spacy download en_core_web_sm
# Shared spaCy pipeline used by preprocess_text_spacy for tokenisation, lemmas and POS tags
nlp = spacy.load('en_core_web_sm')

#spacy.require_gpu() # Ensure is using GPU
#spacy.require_cpu()

# Basic Text Pre-Processing pipeline - Useful for NTMs
def preprocess_text_basic(text):
    """
    Strips the blog XML markup and normalises the text to lowercase ASCII.

    :param text: Content of a blog XML file, already decoded to ``str``
    :return: Lowercase, ASCII-only text without tags, dates, 'urlLink' markers
             or HTML entities
    """
    # 1. Remove all XML tags from the document (along with the date)
    text = re.sub(r'<date>.*?</date>', '', text, flags=re.DOTALL)  # Remove dates and their content
    text = re.sub(r'<[^>]+>', '', text)  # Remove the remaining tags
    text = text.replace('urlLink', '')  # Remove urlLink(s)

    # 2. Decode HTML entities. This runs AFTER tag removal so that escaped markup
    #    such as '&lt;b&gt;' is kept as literal text instead of being stripped.
    #    Without this step '&nbsp;' reaches the tokenizer and survives as the
    #    bogus word 'nbsp'.
    text = html.unescape(text)
    # A non-breaking space would be dropped by the ASCII step below, gluing the
    # neighbouring words together, so turn it into a regular space first.
    text = text.replace('\xa0', ' ')

    # 3. Convert to ASCII and lowercase to make the text case-insensitive
    text = text.encode('ascii', 'ignore').decode('ascii')
    return text.lower()

# Pre-Process pipeline using spacy for GPU
def preprocess_text_spacy(text):
  """
  Cleans the raw text and turns it into (lemma, POS tag) pairs with spaCy.

  :param text: Content of a blog XML file, already decoded to ``str``
  :return: List of tuples (lemma, pos_tag) for every non-stopword token that is
           alphabetic or punctuation
  """
  # Run the shared spaCy pipeline on the markup-free, lowercased text
  doc = nlp(preprocess_text_basic(text))

  cleaned_tokens = []
  for token in doc:
    # Stopwords never make it into the result
    if token.is_stop:
      continue
    # Keep words and punctuation; numbers, symbols and whitespace are dropped
    if token.is_alpha or token.is_punct:
      cleaned_tokens.append((token.lemma_, token.pos_))

  return cleaned_tokens


def _read_and_clean(filepath, cleaner):
    """
    Reads a file, decodes it with the detected encoding and applies ``cleaner``.

    :param filepath: Path of the file to read
    :param cleaner: Callable taking the decoded text and returning the cleaned document
    :return: ``(cleaned_document, None)`` on success,
             ``(None, (filepath, error_message))`` when anything goes wrong
    """
    try:
        with open(filepath, 'rb') as f:
            raw_data = f.read()

        # chardet reports encoding=None when it cannot make a guess (e.g. an empty
        # file); fall back to UTF-8 instead of failing in decode(None).
        encoding = chardet.detect(raw_data)['encoding'] or 'utf-8'
        # A slightly wrong guess should not discard the whole file: undecodable
        # bytes become U+FFFD, which the ASCII step of the cleaners removes.
        text = raw_data.decode(encoding, errors='replace')

        return cleaner(text), None
    except Exception as e:
        # Deliberate best-effort: the caller collects failures instead of aborting
        return None, (filepath, str(e))

def process_file(filepath):
    """
    Reads one blog file and runs the spaCy pre-processing pipeline on it.

    :param filepath: Path of the file to process
    :return: ``(list of (lemma, pos_tag) tuples, None)`` on success,
             ``(None, (filepath, error_message))`` on failure
    """
    return _read_and_clean(filepath, preprocess_text_spacy)

def process_file_basic(filepath):
    """
    Reads one blog file and runs the basic (string only) pre-processing on it.

    :param filepath: Path of the file to process
    :return: ``(cleaned string, None)`` on success,
             ``(None, (filepath, error_message))`` on failure
    """
    return _read_and_clean(filepath, preprocess_text_basic)

def _preprocess_directory(directory_path, worker, filter_func=None):
    """
    Runs ``worker`` on every (optionally filtered) file of a directory using a thread pool.

    :param directory_path: Directory whose entries are processed
    :param worker: Callable taking a file path and returning ``(document, error)``
    :param filter_func: Optional predicate on the file path; only matching files are processed
    :return: Tuple ``(text_data, failed_files)`` where ``text_data`` holds the non-empty
             documents and ``failed_files`` the ``(filepath, error_message)`` tuples
    """
    # Sort so the document order does not depend on the file system listing order
    filepaths = sorted(os.path.join(directory_path, filename) for filename in os.listdir(directory_path))

    if filter_func:
        filepaths = [fp for fp in filepaths if filter_func(fp)]

    text_data = []
    failed_files = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        # executor.map yields results in submission order, so text_data is
        # reproducible between runs (as_completed yields in completion order).
        results = executor.map(worker, filepaths)
        for cleaned_text, error in tqdm(results, total=len(filepaths), desc="Processing files"):
            if error is not None:
                failed_files.append(error)
            elif cleaned_text:
                text_data.append(cleaned_text)
            # else: the file was read correctly but is empty after cleaning; it is
            # skipped and NOT reported as a failure (there is no error to report).
    return text_data, failed_files

def extract_and_preprocess_text_from_directory(directory_path, filter_func=None):
    """
    Pre-processes every file of a directory with the spaCy pipeline.

    :param directory_path: Directory containing the blog files
    :param filter_func: Optional predicate on the file path; only matching files are processed
    :return: Tuple ``(text_data, failed_files)``; each document is a list of
             (lemma, pos_tag) tuples, each failure a ``(filepath, error_message)`` tuple
    """
    return _preprocess_directory(directory_path, process_file, filter_func)

def extract_and_preprocess_text_from_directory_basic(directory_path, filter_func=None):
    """
    Pre-processes every file of a directory with the basic (string only) pipeline.

    :param directory_path: Directory containing the blog files
    :param filter_func: Optional predicate on the file path; only matching files are processed
    :return: Tuple ``(text_data, failed_files)``; each document is a cleaned string,
             each failure a ``(filepath, error_message)`` tuple
    """
    return _preprocess_directory(directory_path, process_file_basic, filter_func)

# Helper functions
def get_tokens_without_pos(doc):
    """
    Drops the POS tag of every entry and keeps only the token.

    :param doc: List of tuples (token, pos_tag)
    :return: List of tokens, in document order
    """
    tokens = []
    for token, _pos in doc:
        tokens.append(token)
    return tokens

def get_text_from_tokens(doc):
    """
    Builds a space separated string out of the tokens of a document.

    :param doc: List of tuples (token, pos_tag)
    :return: Tokens joined by a single space, POS tags discarded
    """
    return ' '.join(token for token, _pos in doc)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [6]:
# Smoke test of both pre-processing pipelines on the files found in TestDir
test_dir = 'TestDir'
text_data_test, failed_data_test = extract_and_preprocess_text_from_directory(test_dir)
# The basic pipeline is only run here; its results are not displayed below
text_data_basic, failed_basic = extract_and_preprocess_text_from_directory_basic(test_dir)

# First 20 tokens of each document, joined back into text
print('\nTEST RESULTS (23676, 5114)')
for doc in text_data_test:
    print("-> ", get_text_from_tokens(doc[:20]))

# First 10 (token, pos_tag) tuples of each document
print('WITH POS TAGS')
for doc in text_data_test:
    print("-> ", doc[:10])

Processing files: 100%|██████████| 2/2 [00:01<00:00,  1.21it/s]
Processing files: 100%|██████████| 2/2 [00:00<00:00,  5.74it/s]


TEST RESULTS (23676, 5114)
->  hello ! run finally end smooth win static . congrat , gil box quick sell ni . wednesday hold xping
->  slashdot raise lot interesting thought banner ad . idea let user control ad delivery , allow user comment ad .
WITH POS TAGS
->  [('hello', 'INTJ'), ('!', 'PUNCT'), ('run', 'NOUN'), ('finally', 'ADV'), ('end', 'VERB'), ('smooth', 'ADJ'), ('win', 'NOUN'), ('static', 'NOUN'), ('.', 'PUNCT'), ('congrat', 'NOUN')]
->  [('slashdot', 'NOUN'), ('raise', 'VERB'), ('lot', 'NOUN'), ('interesting', 'ADJ'), ('thought', 'NOUN'), ('banner', 'NOUN'), ('ad', 'NOUN'), ('.', 'PUNCT'), ('idea', 'NOUN'), ('let', 'VERB')]





In [None]:
# Filters for the pre-processor

import gzip
import pickle

def filter_everyone(filepath):
    """
    Filter that accepts every file.

    :param filepath: Path of the blog file (not inspected)
    :return: Always True
    """
    return True

def filter_student(filepath):
    """
    Accepts files whose name contains the '.Student.' field.

    :param filepath: Path of the blog file
    :return: True when the file name (directories ignored) contains '.Student.'
    """
    return '.Student.' in os.path.basename(filepath)

def filter_female(filepath):
    """
    Accepts files whose name contains the '.female.' field.

    :param filepath: Path of the blog file
    :return: True when the file name (directories ignored) contains '.female.'
    """
    name_only = os.path.basename(filepath)
    is_female = '.female.' in name_only
    return is_female

def filter_male(filepath):
    """
    Accepts files whose name contains the '.male.' field.

    The surrounding dots matter: '.female.' does not contain '.male.', so
    female authors are not matched.

    :param filepath: Path of the blog file
    :return: True when the file name (directories ignored) contains '.male.'
    """
    return '.male.' in os.path.basename(filepath)

def filter_age_over_20(filepath):
    """
    Accepts files whose author is older than 20.

    The age is read from the third dot-separated field of the file name
    (e.g. '23676.male.33.Technology.Scorpio.xml' -> 33).

    :param filepath: Path of the blog file
    :return: True when the age is strictly greater than 20; False when it is
             not, or when the file name has no numeric age field
    """
    filename = os.path.basename(filepath)
    try:
        age = int(filename.split('.')[2])
    except (IndexError, ValueError):
        # IndexError: fewer than three fields (e.g. '.DS_Store', '.ipynb_checkpoints')
        # ValueError: the third field is not a number
        return False
    return age > 20

def filter_age_under_20(filepath):
    """
    Accepts files whose author is 20 years old or younger.

    Despite the name, age 20 is included (the test is ``age <= 20``). The age is
    read from the third dot-separated field of the file name
    (e.g. '4018091.female.16.Student.Sagittarius.xml' -> 16).

    :param filepath: Path of the blog file
    :return: True when the age is 20 or less; False when it is not, or when the
             file name has no numeric age field
    """
    filename = os.path.basename(filepath)
    try:
        age = int(filename.split('.')[2])
    except (IndexError, ValueError):
        # IndexError: fewer than three fields (e.g. '.DS_Store', '.ipynb_checkpoints')
        # ValueError: the third field is not a number
        return False
    return age <= 20

#  Helper function to work with the pre-processed data
#  (they would be much bigger)
def save_compressed_pickle(data, filename):
    """
    Pickles an object into a gzip-compressed file.

    :param data: Object to serialise
    :param filename: Destination path; an existing file is overwritten
    """
    with gzip.open(filename, 'wb') as compressed_file:
        pickle.Pickler(compressed_file).dump(data)

def load_compressed_pickle(filename):
    """
    Loads an object from a gzip-compressed pickle file.

    :param filename: Path of the gzip-compressed pickle
    :return: The unpickled object
    """
    # NOTE: unpickling can execute arbitrary code, only load files from a trusted source
    with gzip.open(filename, 'rb') as compressed_file:
        return pickle.Unpickler(compressed_file).load()

In [None]:
# Pre-Processing block, everything commented to avoid accidental run

# Process "Students"
#text_data_students, failed_files_students = extract_and_preprocess_text_from_directory(directory_path, filter_student)
#save_compressed_pickle(text_data_students, 'students_preprocessed.pkl.gz')

# Process "under 20s"
#text_data_under_20s, failed_files_under_20s = extract_and_preprocess_text_from_directory(directory_path, filter_age_under_20)
#save_compressed_pickle(text_data_under_20s, 'under_20s_preprocessed.pkl.gz')

# Process "males"
#text_data_males, failed_files_males = extract_and_preprocess_text_from_directory(directory_path, filter_male)
#save_compressed_pickle(text_data_males, 'males_preprocessed.pkl.gz')

# Process "females"
#text_data_females, failed_files_females = extract_and_preprocess_text_from_directory(directory_path, filter_female)
#save_compressed_pickle(text_data_females, 'females_preprocessed.pkl.gz')

# Process "Over 20s"
#text_data_over_20s, failed_files_over_20s = extract_and_preprocess_text_from_directory(directory_path, filter_age_over_20)
#save_compressed_pickle(text_data_over_20s, 'over_20s_preprocessed.pkl.gz')

# Process "Everyone" - If RAM >= 30GB && CPU_Cores > 10 --> approx 2h (colab not recommended (cpu_cores = 2))
#text_data_everyone, failed_files_everyone = extract_and_preprocess_text_from_directory(directory_path, filter_everyone)
#save_compressed_pickle(text_data_everyone, 'everyone_preprocessed.pkl.gz')

In [None]:
# Load of all pre-processed files
# WARNING: Uses lot of RAM, High-RAM or only one file at a time recommended.
students_data = load_compressed_pickle('PreProcessed/students_preprocessed.pkl.gz')
under_20s_data = load_compressed_pickle('PreProcessed/under_20s_preprocessed.pkl.gz')
females_data = load_compressed_pickle('PreProcessed/females_preprocessed.pkl.gz')
males_data = load_compressed_pickle('PreProcessed/males_preprocessed.pkl.gz')
over_20s_data = load_compressed_pickle('PreProcessed/over_20s_preprocessed.pkl.gz')
everyone_data = load_compressed_pickle('PreProcessed/everyone_preprocessed.pkl.gz')

# Dict mapping each demographic name to its pre-processed documents;
# the analysis cells below iterate over it.
all_preprocessed = {
    "students": students_data,
    "under_20s": under_20s_data,
    "females": females_data,
    "males": males_data,
    "over_20s": over_20s_data,
    "everyone": everyone_data
}

## Topic modeling by counting all types of nouns

The first strategy extracts the two most common topics by counting the most prevalent nouns.
During pre-processing each token is assigned a POS tag from the [Universal POS tag](https://universaldependencies.org/u/pos/) set.

These tags mark the core part-of-speech categories; by filtering for the `NOUN` tag we capture all common nouns (proper nouns carry the separate `PROPN` tag and are not counted).

In [None]:
from collections import Counter

def get_nouns(doc):
    """
    Collects the tokens of a document that are tagged as nouns.

    :param doc: List of tuples (token, pos_tag)
    :return: List of the tokens whose POS tag starts with 'NOUN', in document order
    """
    nouns = []
    for word, pos in doc:
        if pos.startswith('NOUN'):
            nouns.append(word)
    return nouns

# Maps each demographic name to a Counter of its nouns
noun_counters = {}

# noun counting pipeline
for key, data in all_preprocessed.items():
    # One list of nouns per document ...
    nouns = [get_nouns(doc) for doc in data]
    # ... flattened into a single list for the whole demographic.
    # NOTE(review): this materialises every noun occurrence in memory at once.
    all_nouns = [noun for sublist in nouns for noun in sublist]
    noun_counter = Counter(all_nouns)
    noun_counters[key] = noun_counter

    # Print number of nouns and the most common nouns
    print(f"[{key.capitalize()}] Number of nouns: {len(all_nouns)}")
    most_common_nouns = noun_counter.most_common(10)
    print(f"[{key.capitalize()}] Most Prevalent Topics (Nouns):", most_common_nouns)

# Store the result so the clause extraction cell can reload it without recounting
save_compressed_pickle(noun_counters, 'noun_counters.pkl.gz')

[Students] Number of nouns: 4892782
[Students] Most Prevalent Topics (Nouns): [('time', 90879), ('day', 74564), ('thing', 72508), ('people', 61680), ('today', 52208), ('friend', 42080), ('life', 41866), ('way', 39479), ('school', 35590), ('year', 34219)]
[Under_20s] Number of nouns: 7130208
[Under_20s] Most Prevalent Topics (Nouns): [('time', 131476), ('day', 112528), ('thing', 108051), ('people', 92919), ('today', 82407), ('friend', 63230), ('life', 60220), ('way', 57889), ('school', 56089), ('guy', 47444)]
[Females] Number of nouns: 11176372
[Females] Most Prevalent Topics (Nouns): [('time', 207850), ('day', 167650), ('thing', 161760), ('people', 131221), ('today', 100886), ('life', 95444), ('friend', 93739), ('way', 92701), ('year', 85481), ('night', 84913)]
[Males] Number of nouns: 11360881
[Males] Most Prevalent Topics (Nouns): [('time', 189011), ('day', 140226), ('people', 134905), ('thing', 133016), ('way', 85280), ('year', 85054), ('today', 82984), ('life', 80592), ('friend', 6

In [None]:
# Noun counting - Part (2) - Clause extraction

# Load the noun counters (demographic -> Counter) saved by the noun counting cell
noun_counters = load_compressed_pickle('noun_counters.pkl.gz')

# Function to extract clauses containing the top topics
def extract_clauses(doc, topics, max_clause_len=50):
    """
    Splits a document into clauses and keeps the ones that mention a topic.

    A clause is closed as soon as a token tagged as a verb is read or the clause
    reaches ``max_clause_len`` tokens; the closing token is part of the clause.

    :param doc: List of tuples (token, pos_tag)
    :param topics: List of top topics (nouns)
    :param max_clause_len: Maximum length of a clause
    :return: List of clauses (space separated tokens) containing at least one topic
    """
    def mentions_topic(tokens):
        # Whole-token comparison: a topic must be equal to one of the tokens
        return any(topic in tokens for topic in topics)

    clauses = []
    pending = []

    for token, pos in doc:
        pending.append(token)
        clause_is_complete = len(pending) >= max_clause_len or pos.startswith('VERB')
        if not clause_is_complete:
            continue
        if mentions_topic(pending):
            clauses.append(' '.join(pending))
        pending = []

    # Tokens after the last clause boundary form a final, unterminated clause
    if mentions_topic(pending):
        clauses.append(' '.join(pending))

    return clauses

# Iterate through each demographic to extract clauses with top topics
max_clauses_to_print = 5

for demographic, data in all_preprocessed.items():
    top_topics = [noun for noun, count in noun_counters[demographic].most_common(2)] # Get the two top topics

    # Clauses of every document that mention at least one of the top topics
    clauses_with_topics = [extract_clauses(doc, top_topics) for doc in data]
    all_clauses_with_topics = [clause for sublist in clauses_with_topics for clause in sublist]

    # Extraction
    for topic in top_topics:
        print(f"Clauses containing the topic '{topic}' for {demographic}:")
        count = 0
        for clause in all_clauses_with_topics:
            # Compare against whole tokens: a plain substring test on the clause
            # string would also match 'today' for 'day' or 'sometimes' for 'time'.
            if topic in clause.split(' '):
                print(f"- {clause}")
                count += 1
            if count >= max_clauses_to_print:
                break
        print("\n")


Clauses containing the topic 'time' for students:
- precious time beautiful art theatre quit
- time work
- long time page get
- time wxii say
- work chemistry paper time watch


Clauses containing the topic 'day' for students:
- jeep inherently house arrest oh lovely day outside dog walk bike ride read
- meal day go
- physical exertion long strenuous day err stuff kill
- computer game day collect
- violent weapon customary gift valentine day complain


Clauses containing the topic 'time' for under_20s:
- precious time beautiful art theatre quit
- time work
- long time page get
- time wxii say
- work chemistry paper time watch


Clauses containing the topic 'day' for under_20s:
- jeep inherently house arrest oh lovely day outside dog walk bike ride read
- meal day go
- physical exertion long strenuous day err stuff kill
- computer game day collect
- violent weapon customary gift valentine day complain


Clauses containing the topic 'time' for females:
- good time lot name past funny bar

## Topic modeling with Latent Dirichlet Allocation (LDA)

This LDA model uses a basic bag-of-words representation as its vectorizer.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Function to convert documents to a format suitable for CountVectorizer
def docs_to_strings(docs):
    """
    Turns tokenised documents into plain strings, as expected by CountVectorizer.

    :param docs: List of documents, each document is a list of (token, pos_tag) tuples
    :return: List of strings, one per document, with the tokens separated by a space
    """
    strings = []
    for doc in docs:
        words = [word for word, _pos in doc]
        strings.append(' '.join(words))
    return strings

# Prepare the data for each demographic: demographic name -> list of document strings.
# NOTE(review): this keeps a string copy of every document in memory next to all_preprocessed.
docs_strings = {key: docs_to_strings(data) for key, data in all_preprocessed.items()}

# Function to perform topic modeling
def perform_topic_modeling(documents, n_topics=2):
    """
    Fits an LDA topic model on a Bag-of-Words representation of the documents.

    :param documents: List of strings, each string is a document
    :param n_topics: Number of topics to identify
    :return: Tuple (fitted LDA model, feature names of the vocabulary)
    """
    # Bag-of-Words: one row per document, one column per vocabulary word
    bow_vectorizer = CountVectorizer()
    term_counts = bow_vectorizer.fit_transform(documents)

    # Column index -> word, needed to read the topic/word weights of the model
    feature_names = bow_vectorizer.get_feature_names_out()

    # Fixed random_state so repeated runs give the same topics
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=0)
    lda.fit(term_counts)

    return lda, feature_names

# Perform topic modeling for each demographic
for demographic, docs in docs_strings.items():
    print(f"Topic modeling for {demographic}:")
    lda_model, feature_names = perform_topic_modeling(docs, n_topics=2)

    # Display the top words for each topic
    # (each row of components_ holds the word weights of one topic)
    for topic_idx, topic in enumerate(lda_model.components_):
        print(f"Topic #{topic_idx + 1}:")
        # argsort is ascending, so the reversed slice gives the 10 largest weights, highest first
        print(" ".join([feature_names[i] for i in topic.argsort()[:-11:-1]]))  # Top 10 words
    print("\n")


Topic modeling for students:
Topic #1:
not go like get love lol think time know haha
Topic #2:
like go know think get time good thing day people


Topic modeling for under_20s:
Topic #1:
like go know think get good time want thing day
Topic #2:
go like not get haha lol time day today den


Topic modeling for females:
Topic #1:
not like go get love know lol think time haha
Topic #2:
like go know think time get good want day thing


Topic modeling for males:
Topic #1:
people time like say think year new good know work
Topic #2:
go like get know time think good day thing come


Topic modeling for over_20s:
Topic #1:
people time like know say think life god good year
Topic #2:
like go time know think get good day thing work


Topic modeling for everyone:
Topic #1:
people time like say year new good think know come
Topic #2:
like go know think get time good day thing want




## NTM (Neural Topic Modeling)

Neural topic modeling can leverage GPU acceleration to work on very large collections of data. Examples include the Neural Variational Document Model (NVDM) and the more recent BERT-based topic modeling methods, such as BERTopic, which is used below.

In [None]:
# Import necessary libraries
from bertopic import BERTopic

# Function to convert documents to a format suitable for BERTopic
def docs_to_strings(docs):
    """
    Turns tokenised documents into plain strings, as expected by BERTopic.

    :param docs: List of documents, each document is a list of (token, pos_tag) tuples
    :return: List of strings, one per document, with the tokens separated by a space
    """
    # NOTE(review): same behaviour as the docs_to_strings defined in the LDA section;
    # this definition shadows it when the cells are run in order.
    return [' '.join(word for word, _pos in doc) for doc in docs]

# Prepare the data for each demographic: demographic name -> list of document strings.
# NOTE(review): the LDA section builds the same dict; it is rebuilt here from all_preprocessed.
docs_strings = {key: docs_to_strings(data) for key, data in all_preprocessed.items()}

# Function to perform neural topic modeling using BERTopic
def perform_neural_topic_modeling(documents):
    """
    Fits a BERTopic model with default settings on the documents.

    :param documents: List of strings, each string is a document
    :return: Tuple (fitted BERTopic model, topic assigned to each document)
    """
    # Default configuration; NOTE(review): no random seed is set here, so the
    # topics may differ between runs - confirm if reproducibility is required
    topic_model = BERTopic()

    # fit_transform also returns probabilities, which are not used by the callers
    topics, _probs = topic_model.fit_transform(documents)

    return topic_model, topics

# Perform neural topic modeling for each demographic
for demographic, docs in docs_strings.items():
    print(f"Neural topic modeling for {demographic}:")
    topic_model, topics = perform_neural_topic_modeling(docs)

    # Display the topics (summary table: topic id, document count, name, ...)
    print(topic_model.get_topic_info())
    print("\n")

    # Display the top words for each topic
    for topic_idx in topic_model.get_topics():
        if topic_idx == -1:  # Skip the outlier topic
            continue
        # Each entry of the topic is a (word, score) pair; only the word is printed
        topic = topic_model.get_topic(topic_idx)
        print(f"Topic #{topic_idx}:")
        print(" ".join([word for word, _ in topic]))  # Top words
    print("\n")


Neural topic modeling for students:


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

    Topic  Count                                Name  \
0      -1   3084                 -1_go_like_know_get   
1       0    247               0_love_know_feel_life   
2       1    228                    1_den_haha_de_mi   
3       2    122               2_not_like_think_know   
4       3    118               3_bush_kerry_war_iraq   
5       4    108                4_new_like_time_post   
6       5    108                5_kirsten_oh_love_go   
7       6    100               6_song_band_like_good   
8       7     75               7_god_know_think_life   
9       8     73                 8_josh_not_like_lol   
10      9     62                9_game_post_new_blog   
11     10     54               10_go_get_summer_like   
12     11     41           11_roskilly_jon_love_know   
13     12     38              12_go_get_school_today   
14     13     36                     13_na_que_ko_sa   
15     14     35         14_boinking_love_boinke_say   
16     15     33               15_go_get_friend_

  pid = os.fork()


   Topic  Count                       Name  \
0     -1     17  -1_think_people_book_read   
1      0  19195       0_like_go_know_think   
2      1     56  1_yarn_knit_like_knitting   

                                      Representation  \
0  [think, people, book, read, ap, like, rob, kno...   
1  [like, go, know, think, time, get, good, day, ...   
2  [yarn, knit, like, knitting, stitch, pattern, ...   

                                 Representative_Docs  
0  [hello start netnav class feel free read posti...  
1  [nbsp nbsp tattooed bettie tattooed bettie dar...  
2  [stumble new addiction knitting thing bag maki...  


Topic #0:
like go know think time get good day thing want
Topic #1:
yarn knit like knitting stitch pattern go think work get


