<a href="https://colab.research.google.com/github/xSupernovaa/NLP-Topic-Modelling/blob/main/Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Data loading
from google.colab import drive
import zipfile
import pandas as pd
# Preprocessing
import re
import string
import nltk
# Fetch the NLTK data packages at import time: 'stopwords' backs the stop-word
# list built below and 'wordnet' is the database WordNetLemmatizer looks words
# up in. 'punkt' is the tokenizer model for word_tokenize, which is imported
# below but not called in the visible cells.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Built once as a set so the per-token `not in stop_words` test in
# preprocess_articles is a hash lookup rather than a list scan.
stop_words = set(stopwords.words('english'))
# Feature extraction
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim.corpora import Dictionary
from gensim.models import TfidfModel, LdaModel


# Util
from tqdm import tqdm
from pprint import pprint



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [2]:
# Mount Google Drive at /content/drive (Colab only); the dataset archive is read from it below.
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Absolute path to the zipped dataset under the Drive mount point
# (/content/drive). The previous relative form only resolved while the
# kernel's working directory happened to be /content.
DATA_PATH = "/content/drive/MyDrive/articles1.csv.zip"

In [4]:
# Unpack every member of the archive into the current working directory
with zipfile.ZipFile(DATA_PATH, mode="r") as archive:
    archive.extractall()

# Read the extracted CSV into a DataFrame
df = pd.read_csv("articles1.csv")

In [5]:
# Preview the first rows to sanity-check the loaded columns
df.head()

Unnamed: 0.1,Unnamed: 0,id,title,publication,author,date,year,month,url,content
0,0,17283,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,2016-12-31,2016.0,12.0,,WASHINGTON — Congressional Republicans have...
1,1,17284,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017.0,6.0,,"After the bullet shells get counted, the blood..."
2,2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,2017-01-06,2017.0,1.0,,"When Walt Disney’s “Bambi” opened in 1942, cri..."
3,3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,2017-04-10,2017.0,4.0,,"Death may be the great equalizer, but it isn’t..."
4,4,17287,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2017-01-02,2017.0,1.0,,"SEOUL, South Korea — North Korea’s leader, ..."


In [6]:
# Build one text field per article: the title followed by the content. These
# two columns are preprocessed as text, unlike the other features.
# Missing values are replaced with '' first; otherwise a NaN in either column
# would turn the whole combined text into NaN and discard the half that exists.
articles = df['title'].fillna('') + ' ' + df['content'].fillna('')

In [7]:
# Inspect the first combined article (title + ' ' + content)
articles[0]

'House Republicans Fret About Winning Their Health Care Suit - The New York Times WASHINGTON  —   Congressional Republicans have a new fear when it comes to their    health care lawsuit against the Obama administration: They might win. The incoming Trump administration could choose to no longer defend the executive branch against the suit, which challenges the administration’s authority to spend billions of dollars on health insurance subsidies for   and   Americans, handing House Republicans a big victory on    issues. But a sudden loss of the disputed subsidies could conceivably cause the health care program to implode, leaving millions of people without access to health insurance before Republicans have prepared a replacement. That could lead to chaos in the insurance market and spur a political backlash just as Republicans gain full control of the government. To stave off that outcome, Republicans could find themselves in the awkward position of appropriating huge sums to temporari

In [8]:
def preprocess_articles(articles):
    """Turn one article's raw text into a list of cleaned, lemmatized tokens.

    Despite the plural name, this is applied per document (it is mapped over
    the ``articles`` Series), so ``articles`` is the text of a single article.

    Steps:
      1. ``gensim.utils.simple_preprocess(..., deacc=True)`` tokenizes,
         lowercases and strips accents. Its output contains no punctuation
         tokens, so no separate punctuation filter is applied.
      2. Tokens present in the module-level ``stop_words`` set are dropped.
      3. Each remaining token is lemmatized with ``WordNetLemmatizer``
         (no part-of-speech tag is passed).

    Args:
        articles: Raw text of one article. Anything that is not ``str`` or
            ``bytes`` (e.g. the NaN pandas produces for a missing value, or
            ``None``) is treated as an empty document.

    Returns:
        list[str]: Processed tokens in document order; ``[]`` for
        missing / non-text input.
    """
    # Guard first: simple_preprocess cannot decode a float NaN, and one bad
    # row would otherwise abort the whole Series.map call.
    if not isinstance(articles, (str, bytes)):
        return []

    lemmatizer = WordNetLemmatizer()

    tokens = gensim.utils.simple_preprocess(articles, deacc=True)

    # Stop words are filtered before lemmatizing, i.e. against surface forms.
    # NOTE(review): the recorded topic output contains tokens such as 'u' and
    # 'isi', which look like over-lemmatized 'us' / 'isis' -- consider
    # whether such tokens should bypass lemmatization.
    return [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]


In [9]:
# Run the per-article cleaner over every combined text; each element becomes a list of tokens
preprocessed_articles = articles.map(preprocess_articles)

In [10]:
# Inspect the token list produced for the first article
preprocessed_articles[0]

['house',
 'republican',
 'fret',
 'winning',
 'health',
 'care',
 'suit',
 'new',
 'york',
 'time',
 'washington',
 'congressional',
 'republican',
 'new',
 'fear',
 'come',
 'health',
 'care',
 'lawsuit',
 'obama',
 'administration',
 'might',
 'win',
 'incoming',
 'trump',
 'administration',
 'could',
 'choose',
 'longer',
 'defend',
 'executive',
 'branch',
 'suit',
 'challenge',
 'administration',
 'authority',
 'spend',
 'billion',
 'dollar',
 'health',
 'insurance',
 'subsidy',
 'american',
 'handing',
 'house',
 'republican',
 'big',
 'victory',
 'issue',
 'sudden',
 'loss',
 'disputed',
 'subsidy',
 'could',
 'conceivably',
 'cause',
 'health',
 'care',
 'program',
 'implode',
 'leaving',
 'million',
 'people',
 'without',
 'access',
 'health',
 'insurance',
 'republican',
 'prepared',
 'replacement',
 'could',
 'lead',
 'chaos',
 'insurance',
 'market',
 'spur',
 'political',
 'backlash',
 'republican',
 'gain',
 'full',
 'control',
 'government',
 'stave',
 'outcome',
 'repu

In [11]:
def extract_features(articles, method='tf-idf'):
    """Vectorize documents with a scikit-learn text vectorizer.

    Args:
        articles: Collection of documents handed to ``fit_transform``.
            NOTE(review): the vectorizers are built with default settings,
            which presumably expect raw strings rather than the token lists
            in ``preprocessed_articles`` -- confirm before wiring this in.
        method: ``'tf-idf'`` (default) selects ``TfidfVectorizer``;
            ``'countvectorizer'`` selects ``CountVectorizer``.

    Returns:
        tuple: ``(feature_vectors, feature_names)`` -- the result of
        ``fit_transform(articles)`` and of ``get_feature_names_out()`` on
        the fitted vectorizer.

    Raises:
        ValueError: If ``method`` is not one of the two supported names.
    """
    # TODO: add word2vec as a further method
    if method not in ('tf-idf', 'countvectorizer'):
        raise ValueError('Invalid method specified.')

    vectorizer_cls = TfidfVectorizer if method == 'tf-idf' else CountVectorizer
    vectorizer = vectorizer_cls()
    feature_vectors = vectorizer.fit_transform(articles)
    return feature_vectors, vectorizer.get_feature_names_out()

In [23]:
# Vocabulary size: flatten the per-article token lists into one Series and
# count the distinct values (missing entries are not counted)
total_tokens = preprocessed_articles.explode().nunique(dropna=True)
print(total_tokens)

152705


In [24]:
# Build the gensim vocabulary from the tokenized articles
dictionary = Dictionary(preprocessed_articles)

# Prune the vocabulary: drop tokens seen in fewer than 10 documents or in
# more than 50% of them, then keep at most the 150000 most frequent tokens
dictionary.filter_extremes(no_below=10, no_above=0.5, keep_n=150000)

# Convert every article to its bag-of-words form using the pruned vocabulary
corpus_bow = list(map(dictionary.doc2bow, preprocessed_articles))

In [25]:
# Fit the TF-IDF weighting on the bag-of-words corpus
tfidf = TfidfModel(corpus=corpus_bow)

# Apply the fitted weighting to the same corpus
corpus_tfidf = tfidf[corpus_bow]

In [26]:
# Fit a 12-topic LDA model on the bag-of-words corpus.
# LDA training is stochastic: random_state pins the initialization so that
# re-running the notebook reproduces the same topics.
lda_model = LdaModel(corpus=corpus_bow,
                     id2word=dictionary,
                     num_topics=12,
                     passes=10,
                     iterations=100,
                     random_state=42)

In [27]:
# Print each of the 12 topics with its 12 highest-weighted words
topics = lda_model.show_topics(num_topics=12, num_words=12)
for topic_id, weighted_terms in topics:
    print((topic_id, weighted_terms))

(0, '0.021*"u" + 0.015*"russia" + 0.014*"president" + 0.013*"russian" + 0.012*"official" + 0.010*"china" + 0.009*"intelligence" + 0.009*"house" + 0.009*"north" + 0.008*"cnn" + 0.008*"security" + 0.008*"administration"')
(1, '0.013*"show" + 0.007*"star" + 0.007*"film" + 0.006*"movie" + 0.005*"first" + 0.005*"photo" + 0.004*"cnn" + 0.004*"music" + 0.004*"series" + 0.004*"best" + 0.004*"video" + 0.004*"tv"')
(2, '0.013*"court" + 0.012*"state" + 0.011*"law" + 0.010*"case" + 0.007*"attorney" + 0.007*"justice" + 0.007*"department" + 0.007*"fbi" + 0.007*"investigation" + 0.007*"federal" + 0.005*"judge" + 0.004*"legal"')
(3, '0.015*"isi" + 0.013*"attack" + 0.010*"group" + 0.009*"state" + 0.008*"muslim" + 0.008*"syria" + 0.007*"country" + 0.007*"terrorist" + 0.006*"war" + 0.006*"force" + 0.006*"iraq" + 0.006*"government"')
(4, '0.011*"like" + 0.010*"say" + 0.007*"woman" + 0.006*"know" + 0.006*"get" + 0.006*"think" + 0.006*"thing" + 0.006*"way" + 0.006*"life" + 0.005*"want" + 0.005*"even" + 0.00

In [31]:
# Fit a 20-topic LDA model on the bag-of-words corpus.
# LDA training is stochastic: random_state pins the initialization so that
# re-running the notebook reproduces the same topics.
lda_model_20 = LdaModel(corpus=corpus_bow,
                        id2word=dictionary,
                        num_topics=20,
                        passes=15,
                        iterations=100,
                        random_state=42)

In [34]:
# Print each of the 20 topics with its 15 highest-weighted words
topics = lda_model_20.show_topics(num_topics=20, num_words=15)
for topic_id, weighted_terms in topics:
    print((topic_id, weighted_terms))

(0, '0.029*"police" + 0.016*"officer" + 0.008*"gun" + 0.007*"told" + 0.007*"man" + 0.006*"shooting" + 0.006*"two" + 0.006*"death" + 0.006*"according" + 0.005*"crime" + 0.005*"shot" + 0.005*"prison" + 0.005*"victim" + 0.005*"city" + 0.005*"authority"')
(1, '0.012*"court" + 0.010*"investigation" + 0.009*"president" + 0.009*"fbi" + 0.009*"house" + 0.008*"official" + 0.008*"department" + 0.007*"comey" + 0.007*"email" + 0.007*"former" + 0.007*"case" + 0.007*"justice" + 0.007*"white" + 0.007*"state" + 0.006*"committee"')
(2, '0.028*"trump" + 0.016*"country" + 0.016*"state" + 0.015*"president" + 0.014*"obama" + 0.013*"immigration" + 0.011*"united" + 0.011*"u" + 0.011*"policy" + 0.011*"american" + 0.011*"border" + 0.009*"order" + 0.009*"immigrant" + 0.009*"administration" + 0.008*"trade"')
(3, '0.086*"trump" + 0.034*"clinton" + 0.018*"campaign" + 0.016*"donald" + 0.014*"republican" + 0.012*"hillary" + 0.011*"election" + 0.010*"presidential" + 0.010*"president" + 0.009*"candidate" + 0.009*"stat

In [29]:
# Fit a 12-topic LDA model on the TF-IDF-weighted corpus.
# NOTE(review): LDA is usually formulated over term counts; training on
# TF-IDF weights is a heuristic -- compare against the bag-of-words model.
# random_state pins the stochastic initialization so reruns are reproducible.
lda_model_tfidf = LdaModel(corpus_tfidf,
                           id2word=dictionary,
                           num_topics=12,
                           passes=10,
                           iterations=100,
                           random_state=42)

In [30]:
# Print each of the 12 TF-IDF-based topics with its 15 highest-weighted words
topics = lda_model_tfidf.show_topics(num_topics=12, num_words=15)
for topic_id, weighted_terms in topics:
    print((topic_id, weighted_terms))

(0, '0.008*"police" + 0.005*"isi" + 0.004*"officer" + 0.004*"attack" + 0.003*"syria" + 0.003*"killed" + 0.003*"shooting" + 0.003*"gun" + 0.003*"military" + 0.002*"city" + 0.002*"force" + 0.002*"terrorist" + 0.002*"syrian" + 0.002*"iran" + 0.002*"authority"')
(1, '0.018*"nfl" + 0.008*"quarterback" + 0.008*"brady" + 0.008*"bowl" + 0.007*"game" + 0.006*"patriot" + 0.006*"castile" + 0.005*"player" + 0.005*"football" + 0.005*"season" + 0.005*"touchdown" + 0.005*"bile" + 0.005*"super" + 0.004*"bangladesh" + 0.004*"playoff"')
(2, '0.012*"film" + 0.011*"movie" + 0.008*"star" + 0.007*"music" + 0.007*"microsoft" + 0.006*"song" + 0.006*"netflix" + 0.006*"actor" + 0.006*"award" + 0.005*"hollywood" + 0.004*"season" + 0.004*"comedy" + 0.004*"fisher" + 0.004*"singer" + 0.004*"album"')
(3, '0.006*"startup" + 0.005*"iphone" + 0.004*"water" + 0.004*"food" + 0.003*"storm" + 0.003*"photo" + 0.003*"animal" + 0.002*"space" + 0.002*"car" + 0.002*"hurricane" + 0.002*"duterte" + 0.002*"weather" + 0.002*"micros