# Lab 2 - Text Mining - Pre Processing 

In [23]:
# Libraries import
import os
import string
import nltk
from nltk.util import bigrams
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import trigrams
from collections import Counter

In [11]:
# Required NTLK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tommc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tommc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\tommc\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Task 1-6

In [7]:
# Read of all files inside the directory
directory_path = './terrorism_articles'

articles = []

for filename in os.listdir(directory_path):
    if filename.endswith('.txt'):
        file_path = os.path.join(directory_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            articles.append(file.read())

In [10]:
# Find all of the bi-grams
all_bigrams = []

for article in articles:
    tokens = word_tokenize(article)
    article_bigrams = list(bigrams(tokens))

    all_bigrams.extend(article_bigrams)

## Task 7

In [18]:
# (a) - Text cleaning
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english')) # Retrieve english stopwords

## article word-cleaning function
def clean_text(c_article):
    # Tokenize the article
    c_tokens = word_tokenize(c_article)
    # Remove punctuation and convert to lowercase
    c_tokens = [word.lower() for word in c_tokens if word.isalnum()]
    # Remove stop-words
    c_tokens = [word for word in c_tokens if word not in stop_words]
    # Lemmatize the words 
    c_tokens = [lemmatizer.lemmatize(word) for word in c_tokens]
    return c_tokens

cleaned_articles = [clean_text(article) for article in articles]

In [22]:
# (b) - Find trigrams 
all_trigrams = []

for c_article in cleaned_articles:
    article_trigrams = list(trigrams(c_article))
    all_trigrams.extend(article_trigrams)
    
all_trigrams

[('celebrating', 'last', 'week'),
 ('last', 'week', 'victory'),
 ('week', 'victory', 'iraqi'),
 ('victory', 'iraqi', 'government'),
 ('iraqi', 'government', 'force'),
 ('government', 'force', 'islamic'),
 ('force', 'islamic', 'state'),
 ('islamic', 'state', 'terrorist'),
 ('state', 'terrorist', 'group'),
 ('terrorist', 'group', 'still'),
 ('group', 'still', 'feel'),
 ('still', 'feel', 'little'),
 ('feel', 'little', 'premature'),
 ('little', 'premature', 'yes'),
 ('premature', 'yes', 'retaking'),
 ('yes', 'retaking', 'provincial'),
 ('retaking', 'provincial', 'capital'),
 ('provincial', 'capital', 'ramadi'),
 ('capital', 'ramadi', 'largest'),
 ('ramadi', 'largest', 'city'),
 ('largest', 'city', 'recaptured'),
 ('city', 'recaptured', 'far'),
 ('recaptured', 'far', 'best'),
 ('far', 'best', 'news'),
 ('best', 'news', 'yet'),
 ('news', 'yet', 'region'),
 ('yet', 'region', 'news'),
 ('region', 'news', 'awful'),
 ('news', 'awful', 'tempting'),
 ('awful', 'tempting', 'conclude'),
 ('tempting'

In [24]:
# (c) - most common trigram
trigram_counts = Counter(all_trigrams)
most_common_trigram = trigram_counts.most_common(1)
print(f"The most common trigram is: {most_common_trigram[0][0]} with {most_common_trigram[0][1]} occurrences")

The most common trigram is: ('new', 'york', 'time') with 2042 occurrences


The most common trigram suggests that many articles present in the dataset referenced the NYT (New York Times), a strategy to extract the most common topic could be using the LDA model.

The LDA model find for each document a various number of topics, the topic is characterized by a distribution over words.

In [25]:
# (d) - extract most common topic from corpus
from gensim import corpora, models
import gensim

dictionary = corpora.Dictionary(cleaned_articles)
corpus = [dictionary.doc2bow(text) for text in cleaned_articles]

# Apply LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus, num_topics=5, id2word=dictionary, passes=15)

topics = lda_model.print_topics(num_words=4)
for topic in topics:
    print(topic)


(0, '0.020*"said" + 0.014*"attack" + 0.012*"police" + 0.007*"people"')
(1, '0.016*"said" + 0.013*"state" + 0.009*"force" + 0.008*"military"')
(2, '0.017*"state" + 0.015*"said" + 0.013*"islamic" + 0.010*"group"')
(3, '0.015*"said" + 0.006*"people" + 0.006*"muslim" + 0.006*"attack"')
(4, '0.014*"said" + 0.007*"apple" + 0.007*"government" + 0.006*"company"')
