<a href="https://colab.research.google.com/github/y4c6/master_thesis/blob/main/EJMR_LDA_W2V_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import packages

In [None]:
import numpy as np
import pandas as pd
import json

In [None]:
from google.colab import drive
# Mount the user's Google Drive into the Colab filesystem at /content/gdrive
# so that files stored on Drive can be read by the cells that follow.
drive.mount('/content/gdrive')

Read the pickle file

In [None]:
# Load the pre-tokenized dataset from Drive and preview the first rows.
# NOTE(review): Drive is mounted at /content/gdrive by the earlier
# drive.mount() call, but this path is relative to the current working
# directory -- confirm the working directory (or switch to an absolute path).
# pd.read_pickle can execute arbitrary code while unpickling, so only load
# files from a trusted source.
df = pd.read_pickle("./MyDrive/Thesis_Data&Result/EJMR_ao2_tokens_a0613.pkl")
df.head(3)

In [None]:
## Training corpora: one array of per-document token lists for each
## normalisation variant stored in the dataframe.
corpus_lemmatized, corpus_stemmed = (
    df[column].values
    for column in ('lemmatized_tokenized', 'stemmed_tokenized')
)

## Word-embedding (Word2vec) & Topic Modelling (LDA)

It is worth noting that this method is not widely used and there are other ways to combine LDA and word2vec such as using word2vec vectors as priors for LDA or using LDA topics as features for word2vec.
Also, both LDA and word2vec are unsupervised methods (word2vec is trained in a self-supervised way on raw text, without labels), so the way you combine them is important and depends on your application.
It is recommended to consult with experts in this field and validate the results to see whether this approach is beneficial for your task or not.

In [None]:
# No earlier cell imports the top-level gensim package, so import it here;
# otherwise this cell raises NameError on a fresh kernel.
import gensim

# Record the installed gensim version (the keyword arguments used below
# depend on it).
gensim.__version__

'3.6.0'

In [None]:
# NOTE(review): keep this disabled unless the Word2Vec call below is updated
# as well -- it passes the gensim 3.x keyword `size=`, which gensim 4.x
# renamed to `vector_size=`.
# !pip install --upgrade gensim

### Word2Vec

In [None]:
from gensim.models import Word2Vec

## setting
vector_dim = 100
window_size = 5
min_count = 1
training_epochs = 20

## model
# Train on the lemmatized token lists built in the "training corpus" cell.
# The previous `sentences=corpus` referred to a name that does not exist yet
# at this point in the notebook; it is only bound later, in the gensim LDA
# section, to bag-of-words (id, count) tuples, which are not valid Word2Vec
# sentences.
# NOTE(review): window_size, min_count and training_epochs are still NOT
# passed to the model (library defaults apply). To use them with gensim 3.x,
# add: window=window_size, min_count=min_count, iter=training_epochs
# (the epoch keyword is `iter` in gensim 3.x, `epochs` in gensim 4.x).
word2vec_model = Word2Vec(sentences=list(corpus_lemmatized), size=vector_dim)

In [None]:
# Nearest neighbours of the query word in the trained embedding space,
# collected into a two-column table (neighbour word, similarity score).
word, topn = 'china', 30
w2v_china = pd.DataFrame(
    word2vec_model.wv.most_similar(word, topn=topn),
    columns=['Word', 'Sims'],
)

w2v_china.head(3)

Unnamed: 0,Word,Sims
0,territory,0.808221
1,invade,0.79898
2,tibet,0.798434
3,island,0.798132
4,facto,0.779776
5,nukes,0.775288
6,roc,0.772613
7,belongs,0.77211
8,invasion,0.771001
9,lost,0.763442


In [None]:
# Nearest neighbours of the query word in the trained embedding space,
# collected into a two-column table (neighbour word, similarity score).
word, topn = 'chinese', 30
w2v_asia = pd.DataFrame(
    word2vec_model.wv.most_similar(word, topn=topn),
    columns=['Word', 'Sims'],
)

w2v_asia.head(3)

Unnamed: 0,Word,Sims
0,sympathizers,0.712218
1,taiwanese,0.669642
2,han,0.666017
3,korean,0.660809
4,hold,0.637837
5,twers,0.635374
6,distrust,0.624216
7,brainwashed,0.622997
8,nationality,0.622318
9,japanese,0.62193


### LDA

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer whose vocabulary is capped at 10,000 terms.
# NOTE(review): preprocess_text and tokenize_and_remove_stopwords are not
# defined in the cells visible here -- confirm they are defined before this
# cell runs on a fresh kernel.
cv = CountVectorizer(
    max_features=10000,
    tokenizer=tokenize_and_remove_stopwords,
    preprocessor=preprocess_text,
)

In [None]:
# Join each entry of df['posts'] into a single space-separated string
# (presumably each entry is a list of strings -- verify) and build the
# document-term count matrix.
data_matrix = cv.fit_transform(df['posts'].apply(' '.join))

# Print the matrix
# fit_transform returns a sparse matrix; calling .toarray() on all of it
# allocates a dense n_documents x n_features array just to print a truncated
# view. Report the shape and densify only a small preview instead.
print(data_matrix.shape)
print(data_matrix[:5].toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


Use LDA from scikit-learn

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

# Fit a 5-topic LDA model on the document-term matrix with a fixed seed.
# fit() returns the estimator itself, so the chained call binds the fitted
# model; the trailing expression displays it as the cell output.
LDA = LatentDirichletAllocation(random_state=42, n_components=5).fit(data_matrix)
LDA

LatentDirichletAllocation(n_components=5, random_state=42)

In [None]:
# Print the 10 highest-weighted vocabulary terms of every fitted topic.
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; prefer get_feature_names_out() and fall back only on old
# installations.
_get_feature_names = getattr(cv, "get_feature_names_out", None) or cv.get_feature_names
# Resolve the vocabulary once instead of rebuilding it for every printed word.
feature_names = _get_feature_names()

for i, topic in enumerate(LDA.components_):
    print(f"TOP 10 WORDS PER TOPIC #{i}")
    # argsort is ascending, so the last 10 indices are the largest weights.
    # str() keeps the printed list free of NumPy scalar reprs.
    print([str(feature_names[index]) for index in topic.argsort()[-10:]])

TOP 10 WORDS PER TOPIC #0




['people', 'like', '’', 'culture', 'japan', 'japanese', 'korean', 'china', 'korea', 'chinese']
TOP 10 WORDS PER TOPIC #1
['know', 'email', 'would', 'anyone', 'test', 'got', 'one', 'offer', 'u', 'w']
TOP 10 WORDS PER TOPIC #2
['americans', 'get', 'would', 'dont', 'us', 'asians', 'like', 'people', '’', 'asian']
TOP 10 WORDS PER TOPIC #3
['’', 'dont', 'get', 'like', 'china', 'people', 'good', 'us', 'students', 'chinese']
TOP 10 WORDS PER TOPIC #4
['world', 'like', 'ccp', 'taiwan', 'would', 'people', '’', 'chinese', 'us', 'china']


Use LDA from gensim

In [None]:
import gensim.corpora as corpora

# Tokenized documents used for the gensim topic model.
texts = df['text_tokenized']

# Token <-> integer-id mapping built from those documents.
id2word = corpora.Dictionary(texts)

# Bag-of-words representation: one list of (token_id, count) pairs per document.
corpus = list(map(id2word.doc2bow, texts))

In [None]:
# Peek at the first 15 (token_id, count) pairs of the first document.
corpus[0][:15]

[(0, 1),
 (1, 1),
 (2, 1),
 (3, 1),
 (4, 3),
 (5, 3),
 (6, 2),
 (7, 1),
 (8, 2),
 (9, 1),
 (10, 3),
 (11, 2),
 (12, 1),
 (13, 2),
 (14, 1)]

In [None]:
# Same preview as above, with token ids translated back to their words:
# the first 15 (word, count) pairs of the first document.
[(id2word[token_id], count) for token_id, count in corpus[0][:15]]

[('action', 1),
 ('actually', 1),
 ('agree', 1),
 ('aid', 1),
 ('al', 3),
 ('allies', 3),
 ('ally', 2),
 ('alone', 1),
 ('already', 2),
 ('also', 1),
 ('alternatives', 3),
 ('always', 2),
 ('anything', 1),
 ('arent', 2),
 ('assumed', 1)]

In [None]:
from gensim.models import LdaModel

# Fit a 10-topic gensim LDA model on the bag-of-words corpus; the fixed
# random_state seeds the model's random number generator.
lda = LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=10,
    random_state=42,
)



In [None]:
# Show 5 of the model's topics, each as its 10 highest-weighted words.
lda.print_topics(num_topics=5, num_words=10)

[(0,
  '0.011*"chinese" + 0.010*"people" + 0.007*"china" + 0.007*"’" + 0.006*"like" + 0.005*"dont" + 0.005*"would" + 0.004*"us" + 0.004*"even" + 0.004*"korea"'),
 (1,
  '0.026*"china" + 0.017*"us" + 0.007*"people" + 0.007*"chinese" + 0.007*"would" + 0.006*"country" + 0.006*"’" + 0.006*"world" + 0.005*"like" + 0.005*"ccp"'),
 (2,
  '0.009*"people" + 0.008*"chinese" + 0.007*"’" + 0.007*"asian" + 0.006*"dont" + 0.005*"china" + 0.005*"women" + 0.005*"one" + 0.005*"even" + 0.004*"also"'),
 (3,
  '0.017*"china" + 0.010*"chinese" + 0.006*"’" + 0.005*"people" + 0.005*"good" + 0.004*"like" + 0.004*"also" + 0.004*"covid" + 0.004*"one" + 0.004*"know"'),
 (4,
  '0.010*"’" + 0.008*"people" + 0.007*"china" + 0.006*"chinese" + 0.005*"like" + 0.004*"dont" + 0.004*"country" + 0.003*"”" + 0.003*"asian" + 0.003*"“"'),
 (5,
  '0.010*"japan" + 0.008*"like" + 0.008*"people" + 0.007*"’" + 0.005*"asian" + 0.005*"get" + 0.004*"think" + 0.004*"japanese" + 0.004*"china" + 0.004*"chinese"'),
 (6,
  '0.020*"china"