# Text Mining Project on Korea Herald

# Load Data

In [21]:
import json
import pandas
import gzip

data_path = '../data/koreaherald_1517_#.json.gz'

for i in range(8):
  p = data_path.replace('#',str(i))
  with gzip.open(p,'rb') as f:
    data=json.load(f)
  if i == 0:
    df_data = pandas.DataFrame.from_dict(data)
  else:
    df_data.append(data,ignore_index=True)

# clean up column names
df_data = df_data.rename(columns={" author": "author",
                        " time": "time",
                        " description": "description",
                        " body": "body",
                        " section": "section",
                       })
# preview data
print('Number of docs: {}'.format(df_data.shape[0]))
df_data.dtypes

Number of docs: 3000


title          object
author         object
time           object
description    object
body           object
section        object
dtype: object

# Pre-Processing

Here we apply:
- tokenisation
- lemmatisation
- normalisation

(optional) Bigrams

In [22]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords

lemmatizer = WordNetLemmatizer()
tokenizer = RegexpTokenizer(r'\w+')
stop_words = set(stopwords.words('english'))

def tokenise_pipeline(doc):
  doc = doc.lower()  # Convert to lowercase.
  tokens = tokenizer.tokenize(doc) # split into words
  # TODO: remove stopwords
  tokens = [token for token in tokens if not token.isnumeric()] # remove numbers
  tokens = [token for token in tokens if token not in stop_words]
  tokens = [token for token in tokens if len(token) > 2] # remove words of only 1 letter
  tokens = [lemmatizer.lemmatize(token) for token in tokens] # lemmatisation
  return tokens

In [23]:
df_data['body_tokenised'] = df_data['body'].apply(tokenise_pipeline)

In [24]:
from gensim.models import Phrases

docs = [doc for doc in df_data['body_tokenised']]
bigrams = Phrases(docs, min_count=20) # keeps only phrases that appear >= 20 times in corpus.

def add_bigrams(doc):
  for token in bigrams[doc]:
    if '_' in token:
      doc.append(token)
  return doc

df_data['body_tokenised'] = df_data['body_tokenised'].apply(add_bigrams)

In [25]:
from gensim.corpora import Dictionary

dictionary = Dictionary(docs)
dictionary.filter_extremes(no_below=20, no_above=0.5) # filter words that occur in less than 20 docs or more than 50% docs

print('Number of unique tokens: {}'.format(len(dictionary)))
print('Number of documents: {}'.format(len(docs)))

Number of unique tokens: 4015
Number of documents: 3000


# Vectorization

In [26]:
df_data['body_vector'] = df_data['body_tokenised'].apply(dictionary.doc2bow)

corpus = [vector for vector in df_data['body_vector']]

# LDA

In [30]:
from gensim.models import LdaModel

num_topics = 20
chunksize = 2000
passes = 20
iterations = 400
eval_every = None

temp = dictionary[0] # to 'load' dictionary
id2word = dictionary.id2token

model = LdaModel(
  corpus=corpus,
  id2word=id2word,
  chunksize=chunksize,
  alpha='auto',
  eta='auto',
  iterations=iterations,
  num_topics=num_topics,
  passes=passes,
  eval_every=eval_every
        )

In [31]:
top_topics = model.top_topics(corpus)
from pprint import pprint
pprint(top_topics)

[([(0.03338774, 'north_korea'),
   (0.025606295, 'nuclear'),
   (0.011662616, 'missile'),
   (0.009213249, 'pyongyang'),
   (0.009123816, 'state'),
   (0.007556709, 'would'),
   (0.0074498854, 'talk'),
   (0.007373694, 'washington'),
   (0.0067089563, 'weapon'),
   (0.006587418, 'security'),
   (0.006412249, 'north_korean'),
   (0.0058833007, 'trump'),
   (0.005568597, 'foreign'),
   (0.0054894323, 'peninsula'),
   (0.0054238955, 'united'),
   (0.005115163, 'regime'),
   (0.0050715143, 'president'),
   (0.0050171153, 'test'),
   (0.004976555, 'secretary'),
   (0.004960945, 'program')],
  -0.9057546920059091),
 ([(0.07198935, 'party'),
   (0.0209875, 'opposition'),
   (0.016195806, 'ruling'),
   (0.011002585, 'lawmaker'),
   (0.0101263765, 'moon'),
   (0.009617866, 'national'),
   (0.00953391, 'democratic'),
   (0.009348606, 'liberty'),
   (0.008912594, 'rep'),
   (0.00891053, 'liberty_korea'),
   (0.008736632, 'parliamentary'),
   (0.008622215, 'main'),
   (0.00860216, 'government'),
 