In this assignment - we're going to run an NLP pipeline from end to end:

* Either use the csv file provided for all wikipedia pages under the category "natural language processing" or generate your own here: https://petscan.wmflabs.org/ (non-comprehensive list of settings below)
  * Page properties - set re-directs to 'No'
  * Output - set to 'csv' 
* Download all the pages - extract the paragraph text into a pandas dataframe
* Run LSA analysis to get main topics
* Repeat with LDA
* Finally - generate & refine embeddings

---

*Fill in the missing code from the relevant sections below, missing code is indicated by \<FILL_CODE>*

*IMPORTANT: Make sure you include the outputs or printouts for every cell in the .ipynb file that you upload.*

## Load / Import data

### 1. Get the text data using either the csv file provided or the one you generated (2 points)

For the rest of this assignment - continue to use the dataset you selected here

In [16]:
import pandas as pd

wiki_page_corpus = pd.read_csv('./wikipedia-nlp-pages.csv')
print(wiki_page_corpus[:5])

   number                  title  pageid  namespace  length         touched
0       1                  ELIZA   10235        NaN   29186  20220910110802
1       2   Finite-state_machine   10931        NaN   41876  20220918155749
2       3               HAL_9000   14384        NaN   35794  20220727001747
3       4  Information_retrieval   15271        NaN   27625  20220904163734
4       5            Kleene_star   16750        NaN    7210  20220913212929


In [17]:
wiki_page_corpus.shape

(866, 6)

### 2. Preprocess text into tokens (1 point)

In [18]:
import re
from gensim.utils import tokenize
from gensim.parsing.preprocessing import remove_stopwords, strip_punctuation
custom_stopwords = ['displaystyle']

def preprocess_tokenize_with_gensim(text):
  # convert to lowercase, remove extra whitespace
  text = text.lower().strip()
  # remove \n, strip symbols and punctuation 
  text = strip_punctuation(text)
  # remove stopwords
  text = remove_stopwords(text)
  # tokenize
  tokens = list(tokenize(text))
  # remove any additional stopwords if needed (this is a custom extra step if 
  # you see words in the topics that don't belong)
  # remove any words shorter than 2 characters
  tokens = [token for token in tokens if token not in custom_stopwords and len(token) > 2]
  return tokens

# create tokens column
wiki_page_corpus['paragraph_tokens'] = wiki_page_corpus['title'].apply(preprocess_tokenize_with_gensim)
wiki_page_corpus.head()

Unnamed: 0,number,title,pageid,namespace,length,touched,paragraph_tokens
0,1,ELIZA,10235,,29186,20220910110802,[eliza]
1,2,Finite-state_machine,10931,,41876,20220918155749,"[finite, state, machine]"
2,3,HAL_9000,14384,,35794,20220727001747,[hal]
3,4,Information_retrieval,15271,,27625,20220904163734,"[information, retrieval]"
4,5,Kleene_star,16750,,7210,20220913212929,"[kleene, star]"


## Run LSA with TFIDF

In [19]:
import os.path
from gensim.corpora import Dictionary
from gensim.models import LsiModel, TfidfModel
import re
import warnings
warnings.filterwarnings('ignore')

### 3. Generate dictionary and document-term-matrix needed for computing TFIDF(1 point) 

In [20]:
dictionary = Dictionary(wiki_page_corpus['paragraph_tokens'])
corpus = [dictionary.doc2bow(text) for text in wiki_page_corpus['paragraph_tokens']]
tfidf = TfidfModel(corpus)


### 4. Specify n_topics (at least 7), compute TFIDF matrix and train LSA model (1 point)

In [21]:
n_topics = 10
tfidf_corpus = tfidf[corpus]
lsi = LsiModel(tfidf_corpus, id2word=dictionary, num_topics=n_topics)

### 5. Print top 7 topics and 7 words for every topic (1 point)

In [22]:
# display top 7 topics and top 7 words for each topic
for topic in lsi.show_topics(num_topics=7, num_words=7, formatted=False):
  print(topic)



(0, [('translation', -0.5971142607272544), ('machine', -0.5970148111237235), ('language', -0.3295565899869501), ('corpus', -0.18451525138709726), ('natural', -0.1820524406121053), ('based', -0.11277980081190185), ('neural', -0.08727640624331864)])
(1, [('language', -0.6154347835966626), ('corpus', -0.3943217959966098), ('natural', -0.3531237440857559), ('machine', 0.3204085783387835), ('translation', 0.3021986232959472), ('processing', -0.1567719226393752), ('model', -0.10782592299555536)])
(2, [('corpus', 0.7256188652252339), ('language', -0.3634611821492455), ('natural', -0.28776859553718237), ('national', 0.2109349721338355), ('linguistics', 0.1733937823381026), ('text', 0.1669061683014083), ('speech', 0.1402589227933959)])
(3, [('automaton', -0.7860931324335875), ('finite', -0.4889248734250003), ('deterministic', -0.16666458387232327), ('state', -0.15557112962482988), ('büchi', -0.12409637952299955), ('nondeterministic', -0.11592631693405205), ('probabilistic', -0.0681508145438186)

## Run LDA with the count vectorizer

### 6. Train an LDA model using word counts (1 points)

In [23]:

from gensim.models import LdaModel
lda = LdaModel(corpus, id2word=dictionary, num_topics=n_topics)

### 7. Compare at the top 7 words for the top 7 topics from the LDA model (1 point)

In [24]:
for topic in lda.show_topics(num_topics=7, num_words=7, formatted=False):
    print(topic)
    

(1, [('corpus', 0.022556918), ('information', 0.015159833), ('language', 0.011463579), ('project', 0.011463547), ('retrieval', 0.011462624), ('word', 0.007764954), ('translator', 0.007764952)])
(2, [('corpus', 0.025613066), ('software', 0.014344012), ('finite', 0.012965225), ('linguistic', 0.012963871), ('automaton', 0.012963666), ('english', 0.009802865), ('processing', 0.009802757)])
(3, [('language', 0.025923941), ('engineering', 0.013122629), ('natural', 0.009922601), ('semantic', 0.009922087), ('automaton', 0.009921661), ('corpus', 0.0067222547), ('theory', 0.0067213327)])
(7, [('automaton', 0.030969607), ('language', 0.030969474), ('natural', 0.019810943), ('finite', 0.01981091), ('assistant', 0.019808916), ('virtual', 0.01701942), ('machine', 0.017019374)])
(4, [('text', 0.034302272), ('corpus', 0.034301773), ('inscriptionum', 0.010529217), ('analysis', 0.010528217), ('latent', 0.010527699), ('document', 0.007132592), ('semantic', 0.0071323877)])
(8, [('translation', 0.028667431

### 7. Why are the topics and words different between the LDA and LSA models? (1 point)

Answer: LDA is probabilistic, while LSA is deterministic. They use different methodologies and assumptions to get the topics.

### 8. Visualize the topics in the LDA model you trained using pyLDAvis(1 point)

In [25]:
# Install the Python LDA Visualization package
!pip install pyldavis



In [26]:
import pyLDAvis.gensim_models
pyLDAvis.enable_notebook()
pyLDAvis.gensim_models.prepare(lda, corpus, dictionary)


### 10. Re-train the LDA model with just 9 topics & visualize with PyLDAVis again (1 point)

In [27]:
#Re-train the LDA model with just 9 topics & visualize with PyLDAVis again
lda_9 = LdaModel(corpus, id2word=dictionary, num_topics=9)
pyLDAvis.gensim_models.prepare(lda_9, corpus, dictionary)

### 11. Looking at the topic visualization above - should we have used a larger or smaller number of topics? Explain... (1 point)

Answer: We should have used a larger number of topics, so that there are fewer overlaps.

## Train embeddings using word2vec

### 12. Train a word2vec model using the common_texts variable from gensim first (1 point)

In [30]:
from gensim.models import Word2Vec
from gensim.test.utils import common_texts

# Initialize the Word2Vec model
new_model = Word2Vec(vector_size=100, window=5, min_count=1, workers=4)

# Build the vocabulary from the common_texts corpus
new_model.build_vocab(common_texts)

# Train the model on the common_texts corpus
new_model.train(common_texts, total_examples=model.corpus_count, epochs=model.epochs)


(17, 145)

In [31]:
# assuming your w2v model is called new_model
new_model.wv['computer']

array([-0.00515774, -0.00667028, -0.0077791 ,  0.00831315, -0.00198292,
       -0.00685696, -0.0041556 ,  0.00514562, -0.00286997, -0.00375075,
        0.0016219 , -0.0027771 , -0.00158482,  0.0010748 , -0.00297881,
        0.00852176,  0.00391207, -0.00996176,  0.00626142, -0.00675622,
        0.00076966,  0.00440552, -0.00510486, -0.00211128,  0.00809783,
       -0.00424503, -0.00763848,  0.00926061, -0.00215612, -0.00472081,
        0.00857329,  0.00428458,  0.0043261 ,  0.00928722, -0.00845554,
        0.00525685,  0.00203994,  0.0041895 ,  0.00169839,  0.00446543,
        0.00448759,  0.0061063 , -0.00320303, -0.00457706, -0.00042664,
        0.00253447, -0.00326412,  0.00605948,  0.00415534,  0.00776685,
        0.00257002,  0.00811904, -0.00138761,  0.00808028,  0.0037181 ,
       -0.00804967, -0.00393476, -0.0024726 ,  0.00489447, -0.00087241,
       -0.00283173,  0.00783599,  0.00932561, -0.0016154 , -0.00516075,
       -0.00470313, -0.00484746, -0.00960562,  0.00137242, -0.00

In [32]:
new_model.wv.most_similar('computer', topn=10)

[('system', 0.21617142856121063),
 ('survey', 0.044689200818538666),
 ('interface', 0.01520337350666523),
 ('time', 0.0019510575802996755),
 ('trees', -0.03284314647316933),
 ('human', -0.0742427185177803),
 ('response', -0.09317588806152344),
 ('graph', -0.09575346857309341),
 ('eps', -0.10513805598020554),
 ('user', -0.16911622881889343)]

### 11. Re-train the word2vec model using our wikipedia paragraph tokens - remember you have to update your model vocabulary first (1 point)

In [33]:
# Update the model's vocabulary with the new sentences
new_model.build_vocab(wiki_page_corpus, update=True)

# Continue training the model with the new sentences
new_model.train(wiki_page_corpus, total_examples=len(wiki_page_corpus), epochs=new_model.epochs)


(40, 275)

In [34]:
print(new_model.wv['computer'])
new_model.wv.most_similar('computer', topn=10)

[-0.00515774 -0.00667028 -0.0077791   0.00831315 -0.00198292 -0.00685696
 -0.0041556   0.00514562 -0.00286997 -0.00375075  0.0016219  -0.0027771
 -0.00158482  0.0010748  -0.00297881  0.00852176  0.00391207 -0.00996176
  0.00626142 -0.00675622  0.00076966  0.00440552 -0.00510486 -0.00211128
  0.00809783 -0.00424503 -0.00763848  0.00926061 -0.00215612 -0.00472081
  0.00857329  0.00428458  0.0043261   0.00928722 -0.00845554  0.00525685
  0.00203994  0.0041895   0.00169839  0.00446543  0.00448759  0.0061063
 -0.00320303 -0.00457706 -0.00042664  0.00253447 -0.00326412  0.00605948
  0.00415534  0.00776685  0.00257002  0.00811904 -0.00138761  0.00808028
  0.0037181  -0.00804967 -0.00393476 -0.0024726   0.00489447 -0.00087241
 -0.00283173  0.00783599  0.00932561 -0.0016154  -0.00516075 -0.00470313
 -0.00484746 -0.00960562  0.00137242 -0.00422615  0.00252744  0.00561612
 -0.00406709 -0.00959937  0.00154715 -0.00670207  0.0024959  -0.00378173
  0.00708048  0.00064041  0.00356198 -0.00273993 -0.0

[('system', 0.21617142856121063),
 ('o', 0.1265621930360794),
 ('_', 0.10314639657735825),
 ('c', 0.08376444876194),
 ('a', 0.077409528195858),
 ('survey', 0.044689200818538666),
 ('k', 0.029901370406150818),
 ('g', 0.029808782041072845),
 ('t', 0.020089803263545036),
 ('interface', 0.01520337350666523)]