<a href="https://colab.research.google.com/github/y4c6/master_thesis/blob/main/EJMR_LDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
#!pip install gensim==4.3.0

In [3]:
#!pip install --upgrade numpy

In [4]:
import numpy as np
import pandas as pd
import json

In [5]:
from google.colab import drive
# directory
drive.mount('/content/gdrive')

Mounted at /content/gdrive


## preparing data

In [6]:
import glob
import json

def list_json_files(path, start_with): 
    # get a list of json files that starts with specific word 
    json_files = glob.glob(path + f'/{start_with}*.json')
    return json_files

def concat_json_files(file_paths):
    # concatenate the content of all the files in the list
    data = {'topic':[], 'posts':[]}
    for file_path in file_paths:
        with open(file_path, 'r') as f:
            file_data = json.load(f)
            data['topic'].extend(file_data['topic'])
            data['posts'].extend(file_data['posts'])
    return data

def concat_json_files_with_start(path, start_with):
    json_files = list_json_files(path, start_with)
    data = concat_json_files(json_files)
    return data


In [7]:
path = '/content/gdrive/MyDrive/論文相關材料/'
start_with = 'EJMRpost_'
data = concat_json_files_with_start(path, start_with)

In [8]:
len(data['topic'])

150000

In [9]:
## construct the dictionary to dataframe

In [10]:
df = pd.DataFrame(data)
df.head(3)

Unnamed: 0,topic,posts
0,https://www.econjobrumors.com/topic/rbb-or-kari,"[Whose more of a dishonest cringe dooshbag?, I..."
1,https://www.econjobrumors.com/topic/as-ian-ame...,[https://www.nytimes.com/2022/03/06/nyregion/a...
2,https://www.econjobrumors.com/topic/russia-chi...,[Is this the genius outcome the West was looki...


In [11]:
df.to_pickle("./ejmr_20.pkl")

In [12]:
unpickled_df = pd.read_pickle("./ejmr_20.pkl")  
unpickled_df.head(3)

Unnamed: 0,topic,posts
0,https://www.econjobrumors.com/topic/rbb-or-kari,"[Whose more of a dishonest cringe dooshbag?, I..."
1,https://www.econjobrumors.com/topic/as-ian-ame...,[https://www.nytimes.com/2022/03/06/nyregion/a...
2,https://www.econjobrumors.com/topic/russia-chi...,[Is this the genius outcome the West was looki...


## select out the asian-related posts

In [13]:
asia_target = ['asian', 'asia', 'korea', 'korean', 'japan', 'japanese', 'taiwan', 'taiwanese', 'east', 'hongkong']
china_target = ['china', 'chinese']

In [15]:
# Defining all the conditions inside a function
def condition(x):
    if any( word in x for word in asia_target): #series.str.contains('Mel').any()
        return "asia"
    elif any( word in x for word in china_target):
        return "china"
    else:
        return "other"
 
# Applying the conditions
df['Target'] = df['topic'].apply(condition)
df.head(3)

Unnamed: 0,topic,posts,Target
0,https://www.econjobrumors.com/topic/rbb-or-kari,"[Whose more of a dishonest cringe dooshbag?, I...",other
1,https://www.econjobrumors.com/topic/as-ian-ame...,[https://www.nytimes.com/2022/03/06/nyregion/a...,other
2,https://www.econjobrumors.com/topic/russia-chi...,[Is this the genius outcome the West was looki...,china


In [16]:
( len(df[df['Target']=='china']) + len(df[df['Target']=='asia']) )/ len(df[df['Target']=='other'])

0.033271337053110146

In [17]:
len(df[df['Target']=='china']) / (len(df[df['Target']=='china']) + len(df[df['Target']=='asia']))

0.5534161490683229

In [18]:
len(df[df['Target']=='china']) / len(df[df['Target']=='asia'])

1.239221140472879

## Prepare y

In [19]:
df = df.drop(df[df['Target'] == 'other'].index)
df.head(3)

Unnamed: 0,topic,posts,Target
2,https://www.econjobrumors.com/topic/russia-chi...,[Is this the genius outcome the West was looki...,china
23,https://www.econjobrumors.com/topic/are-more-t...,"[Not even accounting for Indians., All the qua...",asia
44,https://www.econjobrumors.com/topic/reminder-c...,"[Never forget. Never forgive., .]",china


In [20]:
df['y'] = df['Target'].apply(lambda x: 1 if x == 'china' else 0)
df.head(3)

Unnamed: 0,topic,posts,Target,y
2,https://www.econjobrumors.com/topic/russia-chi...,[Is this the genius outcome the West was looki...,china,1
23,https://www.econjobrumors.com/topic/are-more-t...,"[Not even accounting for Indians., All the qua...",asia,0
44,https://www.econjobrumors.com/topic/reminder-c...,"[Never forget. Never forgive., .]",china,1


## Prepare X

In [21]:
import nltk
nltk.download('punkt') #this is download for tonkenizer
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [22]:
import string
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Define a function to preprocess text
def preprocess_text(text):
  # Lowercase all characters
  text = text.lower()

  # Remove URLs
  text = re.sub(r'https?://\S+', '', text)

  # Remove digits
  text = text.translate(str.maketrans('', '', string.digits))

  # Remove punctuation
  text = text.translate(str.maketrans('', '', string.punctuation))

  return text

# Define a function to tokenize and remove stopwords from text
def tokenize_and_remove_stopwords(text):
  # Tokenize the article
  text_tokens = word_tokenize(text)

  # Load English stopwords
  stop_words = set(stopwords.words('english'))

  # Remove stopwords
  tokens = [token for token in text_tokens if token not in stop_words]

  return tokens

In [25]:
## check library
import gensim

## ignore warnings
import warnings
warnings.filterwarnings('ignore')

# # if you want to see the training messages, you can use it
# import logging
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

## the input type
df['text_tokenized'] = df['posts'].apply(lambda x: tokenize_and_remove_stopwords(preprocess_text( ' '.join(x) ))) 
df[['y', 'text_tokenized']].head(3)

Unnamed: 0,y,text_tokenized
2,1,"[genius, outcome, west, looking, russian, fina..."
23,0,"[even, accounting, indians, quant, clubs, univ..."
44,1,"[never, forget, never, forgive]"


In [26]:
## create the training corpus
corpus = df['text_tokenized'].values
corpus[0][:5]

['genius',
 'outcome',
 'west',
 'looking',
 'russian',
 'finances',
 'trade',
 'china',
 'rather',
 'west',
 'brandon',
 'et',
 'al',
 'true',
 'failures',
 'regimechange',
 'genius',
 'outcome',
 'west',
 'looking',
 'russian',
 'finances',
 'trade',
 'china',
 'rather',
 'west',
 'brandon',
 'et',
 'al',
 'true',
 'failures',
 'ljl',
 'would',
 'never',
 'coordinated',
 'action',
 'well',
 'allies',
 'whole',
 'presidency',
 'marked',
 'chaos',
 'op',
 'tribal',
 'idt',
 'china',
 'russia',
 'already',
 'allies',
 'isnt',
 'new',
 'russia',
 'chinese',
 'vassal',
 'state',
 'instead',
 'ally',
 'russia',
 'depends',
 'oil',
 'gas',
 'mineral',
 'exports',
 'go',
 'china',
 'available',
 'alternatives',
 'china',
 'giving',
 'barrel',
 'oil',
 'russia',
 'grateful',
 'bravo',
 'putin',
 'youve',
 'really',
 'raised',
 'russias',
 'standing',
 'world',
 'good',
 'xi',
 'putin',
 'shoots',
 'foot',
 'putin',
 'may',
 'even',
 'offer',
 'vladivostok',
 'xi',
 'penance',
 'however',
 'go

## Word-embedding (Word2vec) & Topic Modelling (LDA)

It is worth noting that this method is not widely used and there are other ways to combine LDA and word2vec such as using word2vec vectors as priors for LDA or using LDA topics as features for word2vec.
Also, LDA is unsupervised method while word2vec is supervised method, so the way you combine them is important and it depends on your application.
It is recommended to consult with experts in this field and validate the results to see whether this approach is beneficial for your task or not.

In [28]:
gensim.__version__

'3.6.0'

In [None]:
# !pip install --upgrade gensim

### Word2Vec

In [32]:
from gensim.models import Word2Vec

## setting
vector_dim = 100
window_size = 5
min_count = 1
training_epochs = 20

## model
word2vec_model = Word2Vec(sentences=corpus, size=vector_dim) #, window=window_size, min_count=min_count, epochs=training_epochs)

In [56]:
# Get the most similar words
word = 'china'
topn = 30
w2v_china = pd.DataFrame(word2vec_model.wv.most_similar(word, topn=topn))
w2v_china.columns = ['Word', 'Sims']

w2v_china.head(30)

Unnamed: 0,Word,Sims
0,territory,0.808221
1,invade,0.79898
2,tibet,0.798434
3,island,0.798132
4,facto,0.779776
5,nukes,0.775288
6,roc,0.772613
7,belongs,0.77211
8,invasion,0.771001
9,lost,0.763442


In [57]:
# Get the most similar words
word = 'chinese'
topn = 30
w2v_asia = pd.DataFrame(word2vec_model.wv.most_similar(word, topn=topn))
w2v_asia.columns = ['Word', 'Sims']

w2v_asia.head(30)

Unnamed: 0,Word,Sims
0,sympathizers,0.712218
1,taiwanese,0.669642
2,han,0.666017
3,korean,0.660809
4,hold,0.637837
5,twers,0.635374
6,distrust,0.624216
7,brainwashed,0.622997
8,nationality,0.622318
9,japanese,0.62193


### LDA

In [51]:
from sklearn.feature_extraction.text import CountVectorizer

# Create a matrix using CountVectorizer
vectorizer = CountVectorizer(preprocessor = preprocess_text, 
                tokenizer = tokenize_and_remove_stopwords, 
                max_features = 10000)

In [53]:
vec = df['posts'].apply(lambda x: vectorizer.fit_transform(x))

# Print the matrix
#print(posts_matrix.toarray())

ValueError: ignored

In [None]:
from gensim.test.utils import common_corpus
>>>
lda = LdaModel(common_corpus, num_topics=10)

In [48]:
from gensim.models import LdaModel

# Train the model on the corpus.
lda = LdaModel((df.y, vectorizer))



TypeError: ignored