## Read Data

In [1]:
import pandas as pd
import datatable as dt
import numpy as np
import re

import nltk
import nltk.corpus  
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.text import Text

import gensim
from gensim import corpora, models
from gensim.models.ldamulticore import LdaMulticore
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import spacy
import pickle

from joblib import dump, load

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import SGDClassifier
from sklearn import metrics

import matplotlib.pyplot as plt

pd.set_option('display.float_format', lambda x: '%.6f' % x)

In [33]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

In [34]:
import multiprocessing
from pandarallel import pandarallel

num_processors = multiprocessing.cpu_count()
print(f'Available CPUs: {num_processors}')

workers = num_processors-1

Available CPUs: 8


In [35]:
pandarallel.initialize(nb_workers=num_processors-1, progress_bar=True)

INFO: Pandarallel will run on 7 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


In [8]:
%%time
df = pd.read_parquet("news_final_project.parquet")

CPU times: total: 5.25 s
Wall time: 8.29 s


## Data Cleaning Process

I will check the data to see what needs to be done

In [9]:
df.head()

Unnamed: 0,url,date,language,title,text
0,http://galusaustralis.com/2020/02/486473/legal...,2020-02-26,en,LegalTech Artificial Intelligence Market 2019 ...,LegalTech Artificial Intelligence Market 2019 ...
1,http://spaceref.com/astronomy/observation-simu...,2021-07-05,en,"Observation, Simulation, And AI Join Forces To...","\n\nObservation, Simulation, And AI Join Force..."
2,http://usweekly.com/news/17/40964/Artificial-i...,2020-02-23,en,Artificial intelligence yields new antibiotic ...,\n\n\nArtificial intelligence yields new antib...
3,http://www.dataweek.co.za/12835r,2021-03-26,en,"Forget ML, AI and Industry 4.0 – obsolescence ...","\n\nForget ML, AI and Industry 4.0 – obsolesce..."
4,http://www.huewire.com/how-you-should-validate...,2023-07-21,en,How You Should Validate Machine Learning Model...,\n\nHow You Should Validate Machine Learning M...


In [None]:
df.text[0]

I definitely need to get rid of new lines, tabs and links or websites. The code below does that

In [None]:
def clean_text(text):
    import re
    # Remove URLs
    cleaned_text = re.sub(r'(?:\|http?\://|https?\://|www)\S+', '', str(text))
    
    # Remove newline and tab characters
    cleaned_text = re.sub(r'(?:\n|\t)', '', cleaned_text)
    
    return cleaned_text

In [None]:
df['cleaned_text'] = df['text'].parallel_apply(clean_text)

In [None]:
def clean_and_tokenize(text):
    import nltk
    import nltk.corpus  
    from nltk.tokenize import word_tokenize
    from nltk.corpus import stopwords
    
    stop_words = set(stopwords.words("english"))
    
    tokens = nltk.word_tokenize(text)
    cleaned_toks = [token.lower() for token in tokens if token not in stop_words and len(token) > 1 and token.isalpha()]
    return cleaned_toks

In [None]:
df['tokens'] = df['cleaned_text'].parallel_apply(clean_and_tokenize)

I don't want to have to read the data in and deal with the cleanup each time because it's been taking a while, so I will save this cleaned data and read it in using the datatable library

In [None]:
%%time
df.to_csv('clean_df.csv', index=False)

### Reading clean data in

In [23]:
%%time
data1 = dt.fread('clean_df.csv')

CPU times: total: 1min 25s
Wall time: 29.9 s


In [24]:
%%time
data = data1.to_pandas()

CPU times: total: 8.69 s
Wall time: 35.1 s


In [25]:
data.language.value_counts()

language
en    199677
Name: count, dtype: int64

Check to make sure it looks good

In [7]:
data.shape

(199677, 7)

In [None]:
data.head()

## Lemmatizing

In [8]:
data['tokens'] = data['tokens'].parallel_apply(eval)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=28526), Label(value='0 / 28526')))…

In [9]:
def lemmatization_on_row(row, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    import spacy
    nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])    
    return [token.lemma_ for token in nlp(" ".join(row)) if token.pos_ in allowed_postags]



In [10]:
data['lemmatized_text'] = data['tokens'].parallel_apply(lemmatization_on_row)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=28526), Label(value='0 / 28526')))…

In [11]:
data.head()

Unnamed: 0,url,date,language,title,text,cleaned_text,tokens,lemmatized_text
0,http://galusaustralis.com/2020/02/486473/legal...,2020-02-26,en,LegalTech Artificial Intelligence Market 2019 ...,LegalTech Artificial Intelligence Market 2019 ...,LegalTech Artificial Intelligence Market 2019 ...,"[legaltech, artificial, intelligence, market, ...","[legaltech, artificial, intelligence, market, ..."
1,http://spaceref.com/astronomy/observation-simu...,2021-07-05,en,"Observation, Simulation, And AI Join Forces To...","\n\nObservation, Simulation, And AI Join Force...","Observation, Simulation, And AI Join Forces To...","[observation, simulation, and, ai, join, force...","[observation, simulation, ai, join, force, rev..."
2,http://usweekly.com/news/17/40964/Artificial-i...,2020-02-23,en,Artificial intelligence yields new antibiotic ...,\n\n\nArtificial intelligence yields new antib...,Artificial intelligence yields new antibiotic ...,"[artificial, intelligence, yields, new, antibi...","[artificial, intelligence, yield, new, antibio..."
3,http://www.dataweek.co.za/12835r,2021-03-26,en,"Forget ML, AI and Industry 4.0 – obsolescence ...","\n\nForget ML, AI and Industry 4.0 – obsolesce...","Forget ML, AI and Industry 4.0 – obsolescence ...","[forget, ml, ai, industry, obsolescence, focus...","[forget, ai, industry, obsolescence, focus, so..."
4,http://www.huewire.com/how-you-should-validate...,2023-07-21,en,How You Should Validate Machine Learning Model...,\n\nHow You Should Validate Machine Learning M...,How You Should Validate Machine Learning Model...,"[how, you, should, validate, machine, learning...","[validate, machine, learning, model, source, p..."


In [12]:
data.to_csv("lem_and_clean.csv", index=False)

## Topic Modeling

In [42]:
%%time
df1 = dt.fread('lem_and_clean.csv')

CPU times: total: 1min 35s
Wall time: 30.4 s


In [43]:
%%time
df = df1.to_pandas()

CPU times: total: 18.5 s
Wall time: 1min 12s


In [7]:
df.head()

Unnamed: 0,url,date,language,title,text,cleaned_text,tokens,lemmatized_text
0,http://galusaustralis.com/2020/02/486473/legal...,2020-02-26,en,LegalTech Artificial Intelligence Market 2019 ...,LegalTech Artificial Intelligence Market 2019 ...,LegalTech Artificial Intelligence Market 2019 ...,"['legaltech', 'artificial', 'intelligence', 'm...","['legaltech', 'artificial', 'intelligence', 'm..."
1,http://spaceref.com/astronomy/observation-simu...,2021-07-05,en,"Observation, Simulation, And AI Join Forces To...","\n\nObservation, Simulation, And AI Join Force...","Observation, Simulation, And AI Join Forces To...","['observation', 'simulation', 'and', 'ai', 'jo...","['observation', 'simulation', 'ai', 'join', 'f..."
2,http://usweekly.com/news/17/40964/Artificial-i...,2020-02-23,en,Artificial intelligence yields new antibiotic ...,\n\n\nArtificial intelligence yields new antib...,Artificial intelligence yields new antibiotic ...,"['artificial', 'intelligence', 'yields', 'new'...","['artificial', 'intelligence', 'yield', 'new',..."
3,http://www.dataweek.co.za/12835r,2021-03-26,en,"Forget ML, AI and Industry 4.0 – obsolescence ...","\n\nForget ML, AI and Industry 4.0 – obsolesce...","Forget ML, AI and Industry 4.0 – obsolescence ...","['forget', 'ml', 'ai', 'industry', 'obsolescen...","['forget', 'ai', 'industry', 'obsolescence', '..."
4,http://www.huewire.com/how-you-should-validate...,2023-07-21,en,How You Should Validate Machine Learning Model...,\n\nHow You Should Validate Machine Learning M...,How You Should Validate Machine Learning Model...,"['how', 'you', 'should', 'validate', 'machine'...","['validate', 'machine', 'learning', 'model', '..."


In [9]:
df['lemmatized_text'] = df['lemmatized_text'].parallel_apply(eval)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=28526), Label(value='0 / 28526')))…

In [10]:
news_list = df['lemmatized_text'].tolist()

In [11]:
nw_dictionary = corpora.Dictionary(news_list)

In [12]:
nw_doc_term_matrix = [nw_dictionary.doc2bow(doc) for doc in news_list]

In [17]:
#with open('news_list.pkl', 'wb') as f:
 #   pickle.dump(news_list, f)

#with open('nw_dictionary.pkl', 'wb') as f:
 #   pickle.dump(nw_dictionary, f)

#with open('nw_doc_term_matrix.pkl', 'wb') as f:
 #   pickle.dump(nw_doc_term_matrix, f)

In [31]:
with open('news_list.pkl', 'rb') as f:
    news_list = pickle.load(f)

with open('nw_dictionary.pkl', 'rb') as f:
    nw_dictionary = pickle.load(f)

with open('nw_doc_term_matrix.pkl', 'rb') as f:
    nw_doc_term_matrix = pickle.load(f)

## 5 topics model

In [23]:
%%time

tw_lda_model = LdaMulticore(corpus=nw_doc_term_matrix,
                   id2word=nw_dictionary,
                   num_topics=5,
                   random_state=100,
                   passes=10,
                   eta='auto',
                   workers=workers)

CPU times: total: 9min 41s
Wall time: 12min 4s


In [30]:
coherence_model_tw_lda = CoherenceModel(model = tw_lda_model, texts=news_list, dictionary=nw_dictionary, coherence='c_v')

In [25]:
print(coherence_model_tw_lda.get_coherence())

0.4183267595940038


In [41]:
tw_lda_model.save("tw_lda_model5")

In [26]:
%%time

lda_display_tw = gensimvis.prepare(tw_lda_model, nw_doc_term_matrix, nw_dictionary, sort_topics=False, mds='mmds')
pyLDAvis.display(lda_display_tw)

CPU times: total: 1min 33s
Wall time: 2min 42s


## 10 topic model

In [27]:
%%time

tw_lda_model10 = LdaMulticore(corpus=nw_doc_term_matrix,
                   id2word=nw_dictionary,
                   num_topics=10,
                   random_state=100,
                   passes=10,
                   eta='auto',
                   workers=workers)

CPU times: total: 12min 46s
Wall time: 16min 35s


In [42]:
tw_lda_model10.save("tw_lda_model10")

In [31]:
coherence_model_tw_lda10 = CoherenceModel(model = tw_lda_model10, texts=news_list, dictionary=nw_dictionary, coherence='c_v')

In [32]:
print(coherence_model_tw_lda10.get_coherence())

0.45172129248538956


In [33]:
%%time

lda_display_tw10 = gensimvis.prepare(tw_lda_model10, nw_doc_term_matrix, nw_dictionary, sort_topics=False, mds='mmds')
pyLDAvis.display(lda_display_tw10)

CPU times: total: 1min 47s
Wall time: 3min 28s


## 20 topic model

In [34]:
%%time

tw_lda_model20 = LdaMulticore(corpus=nw_doc_term_matrix,
                   id2word=nw_dictionary,
                   num_topics=20,
                   random_state=100,
                   passes=10,
                   eta='auto',
                   workers=workers)

CPU times: total: 15min 52s
Wall time: 29min 58s


In [38]:
tw_lda_model20.save("tw_lda_model20")

In [35]:
coherence_model_tw_lda20 = CoherenceModel(model = tw_lda_model20, texts=news_list, dictionary=nw_dictionary, coherence='c_v')

In [36]:
print(coherence_model_tw_lda20.get_coherence())

0.4686319033531078


In [37]:
%%time

lda_display_tw20 = gensimvis.prepare(tw_lda_model20, nw_doc_term_matrix, nw_dictionary, sort_topics=False, mds='mmds')
pyLDAvis.display(lda_display_tw20)

CPU times: total: 1min 52s
Wall time: 6min 6s


## 30 topic model

In [19]:
%%time

tw_lda_model30 = LdaMulticore(corpus=nw_doc_term_matrix,
                   id2word=nw_dictionary,
                   num_topics=30,
                   random_state=100,
                   passes=10,
                   eta='auto',
                   workers=workers)

CPU times: total: 24min 56s
Wall time: 46min 14s


In [21]:
coherence_model_tw_lda30 = CoherenceModel(model = tw_lda_model30, texts=news_list, dictionary=nw_dictionary, coherence='c_v')

In [24]:
tw_lda_model30.save("tw_lda_model30")

In [22]:
score_30 = coherence_model_tw_lda30.get_coherence()

In [23]:
print(score_30)

0.44961567872115077


In [20]:
%%time

lda_display_tw30 = gensimvis.prepare(tw_lda_model30, nw_doc_term_matrix, nw_dictionary, sort_topics=False, mds='mmds')
pyLDAvis.display(lda_display_tw30)

CPU times: total: 2min 23s
Wall time: 12min 38s


## Reloading a model

In [30]:
tw_lda_model20 = LdaMulticore.load("Models/tw_lda_model20")

In [36]:
%%time

lda_display_tw20 = gensimvis.prepare(tw_lda_model20, nw_doc_term_matrix, nw_dictionary, sort_topics=False, mds='mmds')
pyLDAvis.display(lda_display_tw20)

CPU times: total: 4min 27s
Wall time: 14min 58s


In [37]:
tw_lda_model10 = LdaMulticore.load("Models/tw_lda_model10")

In [38]:
%%time

lda_display_tw10 = gensimvis.prepare(tw_lda_model10, nw_doc_term_matrix, nw_dictionary, sort_topics=False, mds='mmds')
pyLDAvis.display(lda_display_tw10)

CPU times: total: 2min 56s
Wall time: 7min 18s


10 topic model worked really well

In [40]:
topic_assignments = tw_lda_model10.get_document_topics(nw_doc_term_matrix)

# Extract the dominant topic for each document
dominant_topics = [max(topics, key=lambda x: x[1])[0] for topics in topic_assignments]


In [18]:
#with open('dominant_topics.pkl', 'wb') as f:
    #pickle.dump(dominant_topics, f)

In [22]:
with open('dominant_topics.pkl', 'rb') as f:
    dominant_topics = pickle.load(f)

In [44]:
df['dominant_topic'] = dominant_topics

In [52]:
df.head()

Unnamed: 0,url,date,language,title,text,cleaned_text,tokens,lemmatized_text,dominant_topic
0,http://galusaustralis.com/2020/02/486473/legal...,2020-02-26,en,LegalTech Artificial Intelligence Market 2019 ...,LegalTech Artificial Intelligence Market 2019 ...,LegalTech Artificial Intelligence Market 2019 ...,"['legaltech', 'artificial', 'intelligence', 'm...","['legaltech', 'artificial', 'intelligence', 'm...",8
1,http://spaceref.com/astronomy/observation-simu...,2021-07-05,en,"Observation, Simulation, And AI Join Forces To...","\n\nObservation, Simulation, And AI Join Force...","Observation, Simulation, And AI Join Forces To...","['observation', 'simulation', 'and', 'ai', 'jo...","['observation', 'simulation', 'ai', 'join', 'f...",9
2,http://usweekly.com/news/17/40964/Artificial-i...,2020-02-23,en,Artificial intelligence yields new antibiotic ...,\n\n\nArtificial intelligence yields new antib...,Artificial intelligence yields new antibiotic ...,"['artificial', 'intelligence', 'yields', 'new'...","['artificial', 'intelligence', 'yield', 'new',...",1
3,http://www.dataweek.co.za/12835r,2021-03-26,en,"Forget ML, AI and Industry 4.0 – obsolescence ...","\n\nForget ML, AI and Industry 4.0 – obsolesce...","Forget ML, AI and Industry 4.0 – obsolescence ...","['forget', 'ml', 'ai', 'industry', 'obsolescen...","['forget', 'ai', 'industry', 'obsolescence', '...",9
4,http://www.huewire.com/how-you-should-validate...,2023-07-21,en,How You Should Validate Machine Learning Model...,\n\nHow You Should Validate Machine Learning M...,How You Should Validate Machine Learning Model...,"['how', 'you', 'should', 'validate', 'machine'...","['validate', 'machine', 'learning', 'model', '...",9


In [51]:
with open('full_df.pkl', 'wb') as f:
    pickle.dump(df, f)