<a href="https://colab.research.google.com/github/y4c6/master_thesis/blob/main/EJMR_LDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import pandas as pd
import json

In [3]:
from google.colab import drive
# directory
drive.mount('/content/gdrive')

Mounted at /content/gdrive


## preparing data

In [4]:
import glob
import json

def list_json_files(path, start_with): 
    # get a list of json files that starts with specific word 
    json_files = glob.glob(path + f'/{start_with}*.json')
    return json_files

def concat_json_files(file_paths):
    # concatenate the content of all the files in the list
    data = {'topic':[], 'posts':[]}
    for file_path in file_paths:
        with open(file_path, 'r') as f:
            file_data = json.load(f)
            data['topic'].extend(file_data['topic'])
            data['posts'].extend(file_data['posts'])
    return data

def concat_json_files_with_start(path, start_with):
    json_files = list_json_files(path, start_with)
    data = concat_json_files(json_files)
    return data


In [5]:
path = '/content/gdrive/MyDrive/論文相關材料/'
start_with = 'EJMRpost_'
data = concat_json_files_with_start(path, start_with)

In [15]:
len(data['topic'])

150000

In [1]:
## construct the dictionary to dataframe

In [6]:
df = pd.DataFrame(data)
df.head(3)

Unnamed: 0,topic,posts
0,https://www.econjobrumors.com/topic/rbb-or-kari,"[Whose more of a dishonest cringe dooshbag?, I..."
1,https://www.econjobrumors.com/topic/as-ian-ame...,[https://www.nytimes.com/2022/03/06/nyregion/a...
2,https://www.econjobrumors.com/topic/russia-chi...,[Is this the genius outcome the West was looki...


In [7]:
df.to_pickle("./ejmr_20.pkl")

In [9]:
unpickled_df = pd.read_pickle("./ejmr_20.pkl")  
unpickled_df.head(3)

Unnamed: 0,topic,posts
0,https://www.econjobrumors.com/topic/rbb-or-kari,"[Whose more of a dishonest cringe dooshbag?, I..."
1,https://www.econjobrumors.com/topic/as-ian-ame...,[https://www.nytimes.com/2022/03/06/nyregion/a...
2,https://www.econjobrumors.com/topic/russia-chi...,[Is this the genius outcome the West was looki...


## select out the asian-related posts

In [16]:
asian_target = ['asian', 'asia', 'korea', 'korean', 'japan', 'japanese', 'taiwan', 'taiwanese', 'east', 'hongkong']
china_target = ['china', 'chinese']

In [17]:
# initialize an empty list to store the indexes
asian_topic_indices = []

# iterate over the topics list
for i, topic in enumerate(data['topic']):
  # check if any of the target words are in the current topic
  if any(word in topic for word in asian_target):
    # if the target word is found, append the index to the indexes list
    asian_topic_indices.append(i)

print(asian_topic_indices[:15])
len(asian_topic_indices)

[2, 23, 44, 69, 79, 110, 129, 154, 158, 215, 232, 243, 265, 283, 288]


4830

In [20]:
asian_topic_indicator_array = np.zeros(len(data['topic']))
asian_topic_indicator_array[asian_topic_idices] = 1
asian_topic_indicator_array

array([0., 0., 1., ..., 0., 0., 0.])

In [21]:
len(asian_topic_idices) / len( data['posts']) # to see the ratio 

0.0322

In [22]:
asian_posts = [data['posts'][i] for i in asian_topic_idices]

In [None]:
flatten_asian_posts = [' '.join(i).lower() for i in asian_posts]

## Word-embedding (Word2vec) & Topic Modelling (LDA)

It is worth noting that this method is not widely used and there are other ways to combine LDA and word2vec such as using word2vec vectors as priors for LDA or using LDA topics as features for word2vec.
Also, LDA is unsupervised method while word2vec is supervised method, so the way you combine them is important and it depends on your application.
It is recommended to consult with experts in this field and validate the results to see whether this approach is beneficial for your task or not.

ref hw2

In [None]:
## check library
import gensim

## ignore warnings
import warnings
warnings.filterwarnings('ignore')

# # if you want to see the training messages, you can use it
# import logging
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

## the input type
train_df['text_tokenized'] = train_df['text'].apply(lambda x: nltk.word_tokenize(x))
train_df[['id', 'text', 'text_tokenized']].head()

In [None]:
## create the training corpus
training_corpus = train_df['text_tokenized'].values
training_corpus[:3]

In [None]:
#from gensim.models import Word2Vec

## setting
vector_dim = 100
window_size = 5
min_count = 1
training_epochs = 20

## model
word2vec_model = Word2Vec(sentences=training_corpus, 
                          vector_size=vector_dim, window=window_size, 
                          min_count=min_count, epochs=training_epochs)

In [None]:
# Get the most similar words
word = 'happy'
topn = 10
word2vec_model.wv.most_similar(word, topn=topn)

^^^^^^^

In [25]:
from gensim.models import Word2Vec

# Train the Word2Vec model on the sentences
w2v_model = Word2Vec(asian_posts, min_count=1)

In [27]:
word_vectors = w2v_model.wv

In [28]:
type(word_vectors)

gensim.models.keyedvectors.Word2VecKeyedVectors

In [31]:
for item in word_vectors.most_similar('china'):
    print(item)

("Princeton I remember had one of the lowest percentages of Asians in the late 90s, something like 11%. They've doubled that since.", 0.4637858271598816)
('Strong able bodied slàves dvmb enough not to rebel > strong able bodied slàves with iq high enough to rebel', 0.3971552848815918)
('Okay, Igor.', 0.39317411184310913)
('A testament to those who take genuine interest in Taiwan.', 0.3869967460632324)
('if you teach only one course', 0.38309216499328613)
('PIC: https://imgur.com/a/gG1zdXW', 0.37933504581451416)
('global economic boom\nboom in culture\nboom in peace', 0.377951979637146)
('Racial inequity is real.', 0.36936408281326294)
('yes.  Masking outside in open air when you are by yourself is certainly dangerous and you are better to stay at home in china', 0.36899322271347046)
('Their employees would be Chinese.', 0.3626231551170349)
