# Text Mining - Assignment
Due 7th june by midnight

In [2]:
# Setup for Colab
if 'google.colab' in str(get_ipython()):
  print('Running on CoLab, downloading all the data')
  # download the pre-processed datasets
  ## -nc avoid to download the file if already present, -P is the directory where the file will be placed
  !wget -nc https://github.com/tommcamm/aut-text-mining/raw/main/Assignment/code/PreProcessed/students_preprocessed.pkl.gz -P PreProcessed # students pre-processed
  !wget -nc https://github.com/tommcamm/aut-text-mining/raw/main/Assignment/code/PreProcessed/under_20s_preprocessed.pkl.gz -P PreProcessed # under 20s pre-processed
  !wget -nc https://github.com/tommcamm/aut-text-mining/raw/main/Assignment/code/PreProcessed/females_preprocessed.pkl.gz -P PreProcessed # females pre-processed
  !wget -nc https://github.com/tommcamm/aut-text-mining/raw/main/Assignment/code/PreProcessed/males_preprocessed.pkl.gz -P PreProcessed # males pre-processed
  !wget -nc https://github.com/tommcamm/aut-text-mining/raw/main/Assignment/code/PreProcessed/over_20s_preprocessed.pkl.gz -P PreProcessed # over 20s pre-processed
  !wget -nc https://github.com/tommcamm/aut-text-mining/raw/main/Assignment/code/PreProcessed/everyone_preprocessed.pkl.gz -P PreProcessed # Everyone pre-processed

  # Download the test data - two files
  !wget -nc https://raw.githubusercontent.com/tommcamm/aut-text-mining/main/Assignment/code/TestDir/23676.male.33.Technology.Scorpio.xml -P TestDir
  !wget -nc https://raw.githubusercontent.com/tommcamm/aut-text-mining/main/Assignment/code/TestDir/5114.male.25.indUnk.Scorpio.xml -P TestDir
else:
  print('Not running on CoLab, skipping download')
  # For this step I assume the data is already there
  directory_path = './Assignment2BlogData/blogs'

Running on CoLab, downloading all the data
File ‘PreProcessed/students_preprocessed.pkl.gz’ already there; not retrieving.

File ‘PreProcessed/under_20s_preprocessed.pkl.gz’ already there; not retrieving.

File ‘PreProcessed/females_preprocessed.pkl.gz’ already there; not retrieving.

File ‘PreProcessed/males_preprocessed.pkl.gz’ already there; not retrieving.

File ‘PreProcessed/over_20s_preprocessed.pkl.gz’ already there; not retrieving.

File ‘PreProcessed/everyone_preprocessed.pkl.gz’ already there; not retrieving.

File ‘TestDir/23676.male.33.Technology.Scorpio.xml’ already there; not retrieving.

File ‘TestDir/5114.male.25.indUnk.Scorpio.xml’ already there; not retrieving.



## Data cleaning
The following steps will be applied to the dataset to ensure it is cleaned.
1. Remove Non-ASCII Characters: Ensures text is ASCII encoded.
2. Remove Punctuation: Removes any punctuation marks.
3. Lowercase Conversion: Converts all text to lowercase.
4. Remove Stopwords: Removes common stopwords that do not contribute to the meaning of the text.
5. Tokenization: Splits text into individual words.
6. Lemmatization: Reduces words to their base or root form.

In [3]:
import spacy
import re
import nltk
import os
import chardet
import concurrent.futures
from tqdm import tqdm
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
# this command must be run before: python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm')

#spacy.require_gpu() # Ensure is using GPU
#spacy.require_cpu()

# Text pre-processing pipeline
def preprocess_text(text):
    # 1. We remove all XML tags from the document (along with the date)
    text = re.sub(r'<date>.*?</date>', '', text, flags=re.DOTALL)
    text = re.sub(r'<[^>]+>', '', text, flags=re.DOTALL)
    text = re.sub(r'urlLink', '', text, flags=re.DOTALL) # Remove links

    text = text.encode('ascii', 'ignore').decode('ascii') # Remove non ASCII characters
    text = re.sub(r'[^\w\s]', ' ', text) # Remove punctuation


    text = text.lower() # Lowercasing to make it case-insensitive
    tokens = nltk.word_tokenize(text)

    tagged_tokens = nltk.pos_tag(tokens)

    # Remove stopwords and perform lemmatization
    lemmatizer = WordNetLemmatizer()

    # Map POS tag to first character lemmatize() accepts
    def get_wordnet_pos(tag):
        if tag.startswith('J'):
            return wordnet.ADJ  # adjective
        elif tag.startswith('V'):
            return wordnet.VERB  # verb
        elif tag.startswith('N'):
            return wordnet.NOUN  # noun
        elif tag.startswith('R'):
            return wordnet.ADV  # adverb
        else:
            return wordnet.NOUN  # default to noun

    cleaned_tokens = [
        (lemmatizer.lemmatize(token, get_wordnet_pos(tag)), tag)
        for token, tag in tagged_tokens
        if token not in stopwords.words('english')
    ]

    return cleaned_tokens

# Pre-Process pipeline using spacy for GPU
def preprocess_text_spacy(text):
    # 1. Remove all XML tags from the document (along with the date)
    text = re.sub(r'<date>.*?</date>', '', text, flags=re.DOTALL)
    text = re.sub(r'<[^>]+>', '', text, flags=re.DOTALL)
    text = re.sub(r'urlLink', '', text, flags=re.DOTALL) # Remove links

    # Convert to ASCII and lowercasing to make it case-insensitive
    text = text.encode('ascii', 'ignore').decode('ascii')
    text = text.lower()

    # Process the text with SpaCy
    doc = nlp(text)

    # Remove stopwords and perform lemmatization
    cleaned_tokens = [
        (token.lemma_, token.pos_)
        for token in doc
        if not token.is_stop and token.is_alpha
    ]

    return cleaned_tokens

def process_file(filepath):
    try:
        with open(filepath, 'rb') as f:
            raw_data = f.read()
            result = chardet.detect(raw_data)
            encoding = result['encoding']
            text = raw_data.decode(encoding)
            cleaned_text = preprocess_text_spacy(text)
            return cleaned_text, None
    except Exception as e:
        return None, (filepath, str(e))

def extract_and_preprocess_text_from_directory(directory_path, filter_func=None):
    text_data = []
    failed_files = []
    filepaths = [os.path.join(directory_path, filename) for filename in os.listdir(directory_path)]

    if filter_func:
        filepaths = [fp for fp in filepaths if filter_func(fp)]

    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        futures = {executor.submit(process_file, filepath): filepath for filepath in filepaths}
        with tqdm(total=len(filepaths), desc="Processing files") as pbar:
            for future in concurrent.futures.as_completed(futures):
                cleaned_text, error = future.result()
                if cleaned_text:
                    text_data.append(cleaned_text)
                else:
                    failed_files.append(error)
                pbar.update(1)
    return text_data, failed_files

# Helper functions
def get_tokens_without_pos(doc):
    """
    Extracts tokens without POS tags from the document.

    :param doc: List of tuples (token, pos_tag)
    :return: List of tokens
    """
    return [token for token, _ in doc]

def get_text_from_tokens(doc):
    """
    Constructs a string from tokens without POS tags.

    :param doc: List of tuples (token, pos_tag)
    :return: String of concatenated tokens
    """
    tokens_only = get_tokens_without_pos(doc)
    return ' '.join(tokens_only)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [13]:
# Test of the pre-processor on one file
test_dir = 'TestDir'
text_data_test, failed_data_test = extract_and_preprocess_text_from_directory(test_dir)

print('\nTEST RESULTS (23676, 5114)')
for doc in text_data_test:
    print("-> ", get_text_from_tokens(doc[:20]))

print('WITH POS TAGS')
for doc in text_data_test:
    print("-> ", doc[:10])

Processing files: 100%|██████████| 2/2 [00:01<00:00,  1.50it/s]


TEST RESULTS (23676, 5114)
->  hello run finally end smooth win static congrat gil box quick sell ni wednesday hold xping goodness pm edt usual
->  slashdot raise lot interesting thought banner ad idea let user control ad delivery allow user comment ad merchant cool frontline
WITH POS TAGS
->  [('hello', 'INTJ'), ('run', 'NOUN'), ('finally', 'ADV'), ('end', 'VERB'), ('smooth', 'ADJ'), ('win', 'NOUN'), ('static', 'NOUN'), ('congrat', 'NOUN'), ('gil', 'NOUN'), ('box', 'NOUN')]
->  [('slashdot', 'NOUN'), ('raise', 'VERB'), ('lot', 'NOUN'), ('interesting', 'ADJ'), ('thought', 'NOUN'), ('banner', 'NOUN'), ('ad', 'NOUN'), ('idea', 'NOUN'), ('let', 'VERB'), ('user', 'NOUN')]





In [5]:
# Filters for the pre-processor

import gzip
import pickle

def filter_everyone(filepath):
    return True

def filter_student(filepath):
    filename = os.path.basename(filepath)
    return '.Student.' in filename

def filter_female(filepath):
    filename = os.path.basename(filepath)
    return '.female.' in filename

def filter_male(filepath):
    filename = os.path.basename(filepath)
    return '.male.' in filename

def filter_age_over_20(filepath):
    filename = os.path.basename(filepath)
    try:
        age = int(filename.split('.')[2])
        return age > 20
    except ValueError:
        return False

def filter_age_under_20(filepath):
    filename = os.path.basename(filepath)
    try:
        age = int(filename.split('.')[2])
        return age <= 20
    except ValueError:
        return False

#  Helper function to work with the pre-processed data
#  (they would be much bigger)
def save_compressed_pickle(data, filename):
    with gzip.open(filename, 'wb') as f:
        pickle.dump(data, f)

def load_compressed_pickle(filename):
    with gzip.open(filename, 'rb') as f:
        return pickle.load(f)

In [None]:
# Pre-Processing block, everything commented to avoid accidental run

# Process "Students"
#text_data_students, failed_files_students = extract_and_preprocess_text_from_directory(directory_path, filter_student)
#save_compressed_pickle(text_data_students, 'students_preprocessed.pkl.gz')

# Process "under 20s"
#text_data_under_20s, failed_files_under_20s = extract_and_preprocess_text_from_directory(directory_path, filter_age_under_20)
#save_compressed_pickle(text_data_under_20s, 'under_20s_preprocessed.pkl.gz')

# Process "males"
#text_data_males, failed_files_males = extract_and_preprocess_text_from_directory(directory_path, filter_male)
#save_compressed_pickle(text_data_males, 'males_preprocessed.pkl.gz')

# Process "females"
#text_data_females, failed_files_females = extract_and_preprocess_text_from_directory(directory_path, filter_female)
#save_compressed_pickle(text_data_females, 'females_preprocessed.pkl.gz')

# Process "Over 20s"
#text_data_over_20s, failed_files_over_20s = extract_and_preprocess_text_from_directory(directory_path, filter_age_over_20)
#save_compressed_pickle(text_data_over_20s, 'over_20s_preprocessed.pkl.gz')

# Process "Everyone" - If RAM >= 30GB && CPU_Cores > 10 --> approx 2h (colab not recommended (cpu_cores = 2))
#text_data_everyone, failed_files_everyone = extract_and_preprocess_text_from_directory(directory_path, filter_everyone)
#save_compressed_pickle(text_data_everyone, 'everyone_preprocessed.pkl.gz')

In [6]:
# Load of all pre-processed files
# WARNING: Uses lot of RAM, High-RAM or only one file at a time recommended.
students_data = load_compressed_pickle('PreProcessed/students_preprocessed.pkl.gz')
under_20s_data = load_compressed_pickle('PreProcessed/under_20s_preprocessed.pkl.gz')
females_data = load_compressed_pickle('PreProcessed/females_preprocessed.pkl.gz')
males_data = load_compressed_pickle('PreProcessed/males_preprocessed.pkl.gz')
over_20s_data = load_compressed_pickle('PreProcessed/over_20s_preprocessed.pkl.gz')
everyone_data = load_compressed_pickle('PreProcessed/everyone_preprocessed.pkl.gz')

# Dict containin
all_preprocessed = {
    "students": students_data,
    "under_20s": under_20s_data,
    "females": females_data,
    "males": males_data,
    "over_20s": over_20s_data,
    "everyone": everyone_data
}

## Topic modeling by counting all types of nouns

The first strategy to extract the 2 most common topics will be by the most prevalent nouns.
During the Pre-Processing we assign to each token a POS tag, the TAG that we use is [Universal POS tag](https://universaldependencies.org/u/pos/).

These tags mark the core part-of-speech categories, by filtering for the `NOUN` tag we capture all nouns types.

In [7]:
from collections import Counter

def get_nouns(doc):
    """
    Extracts nouns from a document.

    :param doc: List of tuples (token, pos_tag)
    :return: List of nouns
    """
    return [word for word, pos in doc if pos.startswith('NOUN')]

noun_counters = {}

# noun counting pipeline
for key, data in all_preprocessed.items():
    nouns = [get_nouns(doc) for doc in data]
    all_nouns = [noun for sublist in nouns for noun in sublist]
    noun_counter = Counter(all_nouns)
    noun_counters[key] = noun_counter

    # Print number of nouns and the most common nouns
    print(f"[{key.capitalize()}] Number of nouns: {len(all_nouns)}")
    most_common_nouns = noun_counter.most_common(10)
    print(f"[{key.capitalize()}] Most Prevalent Topics (Nouns):", most_common_nouns)

# Store the result
save_compressed_pickle(noun_counters, 'noun_counters.pkl.gz')

[Students] Number of nouns: 4892782
[Students] Most Prevalent Topics (Nouns): [('time', 90879), ('day', 74564), ('thing', 72508), ('people', 61680), ('today', 52208), ('friend', 42080), ('life', 41866), ('way', 39479), ('school', 35590), ('year', 34219)]
[Under_20s] Number of nouns: 7130208
[Under_20s] Most Prevalent Topics (Nouns): [('time', 131476), ('day', 112528), ('thing', 108051), ('people', 92919), ('today', 82407), ('friend', 63230), ('life', 60220), ('way', 57889), ('school', 56089), ('guy', 47444)]
[Females] Number of nouns: 11176372
[Females] Most Prevalent Topics (Nouns): [('time', 207850), ('day', 167650), ('thing', 161760), ('people', 131221), ('today', 100886), ('life', 95444), ('friend', 93739), ('way', 92701), ('year', 85481), ('night', 84913)]
[Males] Number of nouns: 11360881
[Males] Most Prevalent Topics (Nouns): [('time', 189011), ('day', 140226), ('people', 134905), ('thing', 133016), ('way', 85280), ('year', 85054), ('today', 82984), ('life', 80592), ('friend', 6

In [8]:
# Noun counting - Part (2) - Clause extraction

# load noun counter
noun_counters = load_compressed_pickle('noun_counters.pkl.gz')

# Function to extract clauses containing the top topics
def extract_clauses(doc, topics, max_clause_len=50):
    """
    Extracts clauses containing the specified topics from a document.

    :param doc: List of tuples (token, pos_tag)
    :param topics: List of top topics (nouns)
    :param max_clause_len: Maximum length of a clause
    :return: List of clauses containing the topics
    """
    clauses = []
    current_clause = []

    for token, pos in doc:
        current_clause.append(token)
        if len(current_clause) >= max_clause_len or pos.startswith('VERB'):
            if any(topic in current_clause for topic in topics):
                clauses.append(' '.join(current_clause))
            current_clause = []

    # Check last clause if it contains any topics
    if any(topic in current_clause for topic in topics):
        clauses.append(' '.join(current_clause))

    return clauses

# Iterate through each demographic to extract clauses with top topics
max_clauses_to_print = 5

for demographic, data in all_preprocessed.items():
    top_topics = [noun for noun, count in noun_counters[demographic].most_common(2)] # Get the two top topics

    clauses_with_topics = [extract_clauses(doc, top_topics) for doc in data]
    all_clauses_with_topics = [clause for sublist in clauses_with_topics for clause in sublist]

    # Extraction
    for topic in top_topics:
        print(f"Clauses containing the topic '{topic}' for {demographic}:")
        count = 0
        for clause in all_clauses_with_topics:
            if topic in clause:
                print(f"- {clause}")
                count += 1
            if count >= max_clauses_to_print:
                break
        print("\n")


Clauses containing the topic 'time' for students:
- precious time beautiful art theatre quit
- time work
- long time page get
- time wxii say
- work chemistry paper time watch


Clauses containing the topic 'day' for students:
- jeep inherently house arrest oh lovely day outside dog walk bike ride read
- meal day go
- physical exertion long strenuous day err stuff kill
- computer game day collect
- violent weapon customary gift valentine day complain


Clauses containing the topic 'time' for under_20s:
- precious time beautiful art theatre quit
- time work
- long time page get
- time wxii say
- work chemistry paper time watch


Clauses containing the topic 'day' for under_20s:
- jeep inherently house arrest oh lovely day outside dog walk bike ride read
- meal day go
- physical exertion long strenuous day err stuff kill
- computer game day collect
- violent weapon customary gift valentine day complain


Clauses containing the topic 'time' for females:
- good time lot name past funny bar

## Topic modeling with Latent Dirichlet Allocation (LDA)

In [None]:
import gensim
from gensim import corpora

# Topic modeling with LDA - Students
students_tokens = [get_tokens_without_pos(doc) for doc in students_data]

# Dictionary representation of all docs
id2word = corpora.Dictionary(students_tokens)

# Corpus representation of all docs
corpus = [id2word.doc2bow(text) for text in students_tokens]

lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=2,
    random_state=100,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True
)

print(lda_model.print_topics())

[(0, '0.014*"like" + 0.012*"go" + 0.009*"get" + 0.009*"know" + 0.008*"think" + 0.008*"good" + 0.007*"time" + 0.006*"want" + 0.006*"day" + 0.006*"thing"'), (1, '0.004*"n" + 0.004*"time" + 0.004*"war" + 0.003*"haha" + 0.003*"u" + 0.003*"bush" + 0.003*"man" + 0.002*"den" + 0.002*"study" + 0.002*"e"')]


In [None]:
import spacy

# Load SpaCy model
nlp = spacy.load('en_core_web_sm')

# Increase the max_length limit of SpaCy
nlp.max_length = 1500000

# Function to split text into chunks
def split_text_into_chunks(text, max_chunk_size=1000000):
    return [text[i:i + max_chunk_size] for i in range(0, len(text), max_chunk_size)]

# Extract tokens from pre-processed students_data
def extract_tokens(documents):
    return [[token for token, _ in doc] for doc in documents]

documents = extract_tokens(students_data)

# Create a dictionary and corpus
id2word = corpora.Dictionary(documents)
corpus = [id2word.doc2bow(text) for text in documents]

# Build the LDA model using multicore processing
lda_model = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=2,
    random_state=100,
    chunksize=100,
    passes=10,
    per_word_topics=True
)

# Get the top terms for each topic
top_terms = lda_model.show_topics(num_words=5)
topics = [term for topic in top_terms for term, _ in lda_model.show_topic(topic[0], topn=5)]
print("Top Terms:", topics)

def extract_sentences_with_topics(text, topics):
    doc = nlp(text)
    sentences = [sent.text for sent in doc.sents if any(topic in sent.text for topic in topics)]
    return sentences

# Combine all documents into a single text for sentence extraction
combined_text = " ".join([" ".join(doc) for doc in documents])

# Split the combined text into smaller chunks
text_chunks = split_text_into_chunks(combined_text)

# Extract sentences with topics from each chunk
all_sentences_with_topics = []
for chunk in text_chunks:
    sentences_with_topics = extract_sentences_with_topics(chunk, topics)
    all_sentences_with_topics.extend(sentences_with_topics)

print("Total sentences with topics:", len(all_sentences_with_topics))
for sentence in all_sentences_with_topics:
    print(sentence)

Top Terms: ['like', 'go', 'love', 'know', 'get', 'like', 'go', 'think', 'time', 'get']


IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)

