# Text Mining Project on Korea Herald

# Data Preparation

## Load Data

In [1]:
import json
import pandas as pd
import gzip
import numpy as np

data_path = '../data/koreaherald_1517_#.json.gz'

df_data = pd.DataFrame()

for i in range(8):
  p = data_path.replace('#',str(i))
  with gzip.open(p,'rb') as f:
    data = pd.DataFrame.from_dict(json.load(f))
  df_data = df_data.append(data,ignore_index=True)

# clean up column names
df_data = df_data.rename(columns={" author": "author",
                        " time": "time",
                        " description": "description",
                        " body": "body",
                        " section": "section",
                       })
# preview data
print('Number of docs: {}'.format(df_data.shape[0]))
df_data.dtypes

Number of docs: 23769


title          object
author         object
time           object
description    object
body           object
section        object
dtype: object

## Pre-Processing

Here we apply:
- tokenisation
- lemmatisation
- normalisation

(optional) Bigrams

In [2]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords

lemmatizer = WordNetLemmatizer()
tokenizer = RegexpTokenizer(r'\w+')
stop_words = set(stopwords.words('english'))
whitelist = {'US':'United States'}

def tokenise_pipeline(doc):
  for k,v in whitelist:
    doc = doc.replace(k,v)
  doc = doc.lower()  # Convert to lowercase.
  tokens = tokenizer.tokenize(doc) # split into words
  # TODO: remove stopwords
  tokens = [token for token in tokens if not token.isnumeric()] # remove numbers
  tokens = [token for token in tokens if token not in stop_words]
  tokens = [token for token in tokens if len(token) > 2] # remove words of only 1 letter
  tokens = [lemmatizer.lemmatize(token) for token in tokens] # lemmatisation
  return tokens

In [3]:
df_data['body_tokenised'] = df_data['body'].apply(tokenise_pipeline)

In [4]:
from gensim.models import Phrases

bigrams = Phrases(df_data['body_tokenised'].values, min_count=20) # keeps only phrases that appear >= 20 times in corpus.

def add_bigrams(doc):
  for token in bigrams[doc]:
    if '_' in token:
      doc.append(token)
  return doc

df_data['body_tokenised'] = df_data['body_tokenised'].apply(add_bigrams)

In [5]:
from gensim.corpora import Dictionary

dictionary = Dictionary(df_data['body_tokenised'].values)
dictionary.filter_extremes(no_below=20, no_above=0.5) # filter words that occur in less than 20 docs or more than 50% docs

print('Number of unique tokens: {}'.format(len(dictionary)))
print('Number of documents: {}'.format(len(df_data['body_tokenised'].values)))

Number of unique tokens: 16614
Number of documents: 23769


## Vectorization

In [6]:
# BOW vectorisation
# TFIDF vectorisation
from gensim.models import TfidfModel

corpus = [dictionary.doc2bow(doc) for doc in df_data['body_tokenised']]
tfidf = TfidfModel(corpus)
df_data['body_bow'] = df_data['body_tokenised'].apply(dictionary.doc2bow)
df_data['body_tfidf'] = df_data['body_bow'].apply(lambda doc : tfidf[doc])

## Group By Year

In [7]:
import copy

df_2015 = copy.deepcopy(df_data[df_data['time'].str.contains('2015')])
df_2016 = copy.deepcopy(df_data[df_data['time'].str.contains('2016')])
df_2017 = copy.deepcopy(df_data[df_data['time'].str.contains('2017')])

# 2015 Analysis

## Get n_topics by evaluating Coherence

In [8]:
from gensim.models.coherencemodel import CoherenceModel
from gensim.models import LdaModel
import seaborn as sns
from tqdm import tqdm

temp = dictionary[0] # to 'load' dictionary
id2word = dictionary.id2token

# parameters
feature = 'body_bow' # lda uses bow
corpus = df_2015[feature].values

results = []

for i in tqdm(range(10,31)):
  model = LdaModel(
    corpus = corpus,
    id2word = id2word,
    chunksize = 2000,
    alpha = 'auto',
    eta = 'auto',
    iterations = 400,
    num_topics = i,
    passes = 20,
    eval_every = None
    )
  cm = CoherenceModel(model=model, corpus=corpus, coherence='u_mass')
  coherence = cm.get_coherence()
  results.append([i,coherence])
  
  
best_score = max([i[1] for i in results])
best_num_topics = [i[0] for i in results if i[1] == best_score][0]
sns.lineplot(x=[i[0] for i in results], y=[i[1] for i in results])
print('Best Coherence Score: {}, Number of Topics: {}'.format(best_score, best_num_topics))

100%|██████████| 21/21 [56:17<00:00, 160.85s/it] 

Best Coherence Score: -1.3123611286143622, Number of Topics: 11





## LDA

In [9]:
from gensim.models import LdaModel

temp = dictionary[0] # to 'load' dictionary
id2word = dictionary.id2token

# parameters
feature = 'body_bow' # lda uses bow
corpus = df_2015[feature].values
num_topics = best_num_topics

model_2015 = LdaModel(
  corpus = corpus,
  id2word = id2word,
  chunksize = 2000,
  alpha = 'auto',
  eta = 'auto',
  iterations = 400,
  num_topics = num_topics,
  passes = 20,
  eval_every = None
  )

### LDA Visualisation

In [10]:
import pyLDAvis
from pyLDAvis import gensim

pyLDAvis.enable_notebook()

pyLDAvis.gensim.prepare(model_2015, corpus, dictionary)

# The larger the bubble, the more prevalent is that topic.
# A good topic model will have fairly big, non-overlapping bubbles scattered throughout the chart instead of being clustered in one quadrant.
# A model with too many topics, will typically have many overlaps, small sized bubbles clustered in one region of the chart.

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


### Assign Topics to Docs

In [11]:
def argmax(ls):
  return max(ls, key = lambda item: item[1])


df_2015['topic'] = df_2015[feature].apply(lambda x : argmax(model_2015.get_document_topics(x))[0])
df_2015['topic_confidence'] = df_2015[feature].apply(lambda x : argmax(model_2015.get_document_topics(x))[1])

## Cluster Analysis

### get only documents with topic_confidence >= t = 0.85

In [12]:
df_anal = df_2015[df_2015['topic_confidence'] >= 0.85]

## Metric 1: Number of Documents

In [13]:
df_anal.topic.value_counts()

7     245
10    129
4      99
0      32
2      24
1      16
6      15
9      13
8      13
5      12
3       6
Name: topic, dtype: int64

## Metric 2: Duration of Reporting

In [14]:
from datetime import datetime

report_duration = pd.DataFrame(columns=['topic','duration'])

for i in range(df_anal.topic.nunique()):
  df = df_anal[df_anal['topic'] == i]
  max_t = datetime.strptime(df.time.max(), "%Y-%m-%d %H:%M:%S").date()
  min_t = datetime.strptime(df.time.min(), "%Y-%m-%d %H:%M:%S").date()
  report_duration = report_duration.append(pd.DataFrame({'topic': i, 'duration': (max_t - min_t)}, index=[i]))
  
report_duration.sort_values('duration',ascending=False)

TypeError: object of type 'int' has no len()

## Metric 3: Entity Relations

In [None]:
import spacy
nlp = spacy.load("en")

# Labels: ORG, LOC, PERSON
def get_entities(txt, label):
  #the NER model only takes in unicode so convert str to unicode
  doc = nlp(txt)
  
  #doc now contains all the entities identified
  NER_list = [((ent.text).encode("utf-8"),ent.label_) for ent in doc.ents]

  #this line is to remove duplicate entities
  NER_list = list(dict.fromkeys(NER_list))

  #converting list of tuples to list of lists for easier manipulation later
  NER_list_final = [list(entity) for entity in NER_list]

  #Filtering for the entity labels that we are interested in.
  entities = [entity[0] for entity in NER_list_final if entity[1] == label]
  
  return entities

df_anal['ORGS'] = df_anal['body_tokenised'].apply(lambda x: get_entities(' '.join(x), 'ORG'))
df_anal['LOC'] = df_anal['body_tokenised'].apply(lambda x: get_entities(' '.join(x), 'LOC'))
df_anal['PERSON'] = df_anal['body_tokenised'].apply(lambda x: get_entities(' '.join(x), 'PERSON'))

In [None]:
entity_counts = pd.DataFrame(columns=['topic','n_org','n_loc','n_person'])

for i in range(df_anal.topic.nunique()):
  df = df_anal[df_anal['topic'] == i]
  n_orgs = len(set([x for ls in df['ORGS'] for x in ls]))
  n_loc = len(set([x for ls in df['LOC'] for x in ls]))        
  n_person = len(set([x for ls in df['PERSON'] for x in ls]))
  entity_counts = entity_counts.append(pd.DataFrame({'topic': i,'n_org': n_orgs,'n_loc': n_loc,'n_person': n_person,'n_total': n_orgs + n_loc + n_person}, index=[i]))
  
entity_counts.sort_values('n_total',ascending=False)

# 2016 Analysis

## Get n_topics by evaluating Coherence

In [None]:
from gensim.models.coherencemodel import CoherenceModel
from gensim.models import LdaModel
import seaborn as sns
from tqdm import tqdm

temp = dictionary[0] # to 'load' dictionary
id2word = dictionary.id2token

# parameters
feature = 'body_bow' # lda uses bow
corpus = df_2016[feature].values

results = []

for i in tqdm(range(10,31)):
  model = LdaModel(
    corpus = corpus,
    id2word = id2word,
    chunksize = 2000,
    alpha = 'auto',
    eta = 'auto',
    iterations = 400,
    num_topics = i,
    passes = 20,
    eval_every = None
    )
  cm = CoherenceModel(model=model, corpus=corpus, coherence='u_mass')
  coherence = cm.get_coherence()
  results.append([i,coherence])
  
best_score = max([i[1] for i in results])
best_num_topics = [i[0] for i in results if i[1] == best_score][0]
sns.lineplot(x=[i[0] for i in results], y=[i[1] for i in results])
print('Best Coherence Score: {}, Number of Topics: {}'.format(best_score, best_num_topics))

## LDA

In [None]:
from gensim.models import LdaModel

temp = dictionary[0] # to 'load' dictionary
id2word = dictionary.id2token

# parameters
feature = 'body_bow' # lda uses bow
corpus = df_2016[feature].values
num_topics = best_num_topics

model_2016 = LdaModel(
  corpus = corpus,
  id2word = id2word,
  chunksize = 2000,
  alpha = 'auto',
  eta = 'auto',
  iterations = 400,
  num_topics = num_topics,
  passes = 20,
  eval_every = None
  )

### LDA Visualisation

In [None]:
import pyLDAvis
from pyLDAvis import gensim

pyLDAvis.enable_notebook()

pyLDAvis.gensim.prepare(model_2016, corpus, dictionary)

# The larger the bubble, the more prevalent is that topic.
# A good topic model will have fairly big, non-overlapping bubbles scattered throughout the chart instead of being clustered in one quadrant.
# A model with too many topics, will typically have many overlaps, small sized bubbles clustered in one region of the chart.

### Assign Topics to Docs

In [None]:
def argmax(ls):
  return max(ls, key = lambda item: item[1])


df_2016['topic'] = df_2016[feature].apply(lambda x : argmax(model_2016.get_document_topics(x))[0])
df_2016['topic_confidence'] = df_2016[feature].apply(lambda x : argmax(model_2016.get_document_topics(x))[1])

## Cluster Analysis

### get only documents with topic_confidence >= t = 0.85

In [None]:
df_anal = df_2016[df_2016['topic_confidence'] >= 0.85]

## Metric 1: Number of Documents

In [None]:
df_anal.topic.value_counts()

## Metric 2: Duration of Reporting

In [None]:
from datetime import datetime

report_duration = pd.DataFrame(columns=['topic','duration'])

for i in range(df_anal.topic.nunique()):
  df = df_anal[df_anal['topic'] == i]
  max_t = datetime.strptime(df.time.max(), "%Y-%m-%d %H:%M:%S").date()
  min_t = datetime.strptime(df.time.min(), "%Y-%m-%d %H:%M:%S").date()
  report_duration = report_duration.append(pd.DataFrame({'topic': i, 'duration': (max_t - min_t)}, index=[i]))
  
report_duration.sort_values('duration',ascending=False)

## Metric 3: Entity Relations

In [None]:
import spacy
nlp = spacy.load("en")

# Labels: ORG, LOC, PERSON
def get_entities(txt, label):
  #the NER model only takes in unicode so convert str to unicode
  doc = nlp(txt)
  
  #doc now contains all the entities identified
  NER_list = [((ent.text).encode("utf-8"),ent.label_) for ent in doc.ents]

  #this line is to remove duplicate entities
  NER_list = list(dict.fromkeys(NER_list))

  #converting list of tuples to list of lists for easier manipulation later
  NER_list_final = [list(entity) for entity in NER_list]

  #Filtering for the entity labels that we are interested in.
  entities = [entity[0] for entity in NER_list_final if entity[1] == label]
  
  return entities

df_anal['ORGS'] = df_anal['body_tokenised'].apply(lambda x: get_entities(' '.join(x), 'ORG'))
df_anal['LOC'] = df_anal['body_tokenised'].apply(lambda x: get_entities(' '.join(x), 'LOC'))
df_anal['PERSON'] = df_anal['body_tokenised'].apply(lambda x: get_entities(' '.join(x), 'PERSON'))

In [None]:
entity_counts = pd.DataFrame(columns=['topic','n_org','n_loc','n_person'])

for i in range(df_anal.topic.nunique()):
  df = df_anal[df_anal['topic'] == i]
  n_orgs = len(set([x for ls in df['ORGS'] for x in ls]))
  n_loc = len(set([x for ls in df['LOC'] for x in ls]))        
  n_person = len(set([x for ls in df['PERSON'] for x in ls]))
  entity_counts = entity_counts.append(pd.DataFrame({'topic': i,'n_org': n_orgs,'n_loc': n_loc,'n_person': n_person,'n_total': n_orgs + n_loc + n_person}, index=[i]))
  
entity_counts.sort_values('n_total',ascending=False)

# 2017 Analysis

## Get n_topics by evaluating Coherence

In [None]:
from gensim.models.coherencemodel import CoherenceModel
from gensim.models import LdaModel
import seaborn as sns
from tqdm import tqdm

temp = dictionary[0] # to 'load' dictionary
id2word = dictionary.id2token

# parameters
feature = 'body_bow' # lda uses bow
corpus = df_2017[feature].values

results = []

for i in tqdm(range(10,31)):
  model = LdaModel(
    corpus = corpus,
    id2word = id2word,
    chunksize = 2000,
    alpha = 'auto',
    eta = 'auto',
    iterations = 400,
    num_topics = i,
    passes = 20,
    eval_every = None
    )
  cm = CoherenceModel(model=model, corpus=corpus, coherence='u_mass')
  coherence = cm.get_coherence()
  results.append([i,coherence])
  
best_score = max([i[1] for i in results])
best_num_topics = [i[0] for i in results if i[1] == best_score][0]
sns.lineplot(x=[i[0] for i in results], y=[i[1] for i in results])
print('Best Coherence Score: {}, Number of Topics: {}'.format(best_score, best_num_topics))

## LDA

In [None]:
from gensim.models import LdaModel

temp = dictionary[0] # to 'load' dictionary
id2word = dictionary.id2token

# parameters
feature = 'body_bow' # lda uses bow
corpus = df_2017[feature].values
num_topics = best_num_topics

model_2017 = LdaModel(
  corpus = corpus,
  id2word = id2word,
  chunksize = 2000,
  alpha = 'auto',
  eta = 'auto',
  iterations = 400,
  num_topics = num_topics,
  passes = 20,
  eval_every = None
  )

### LDA Visualisation

In [None]:
import pyLDAvis
from pyLDAvis import gensim

pyLDAvis.enable_notebook()

pyLDAvis.gensim.prepare(model_2017, corpus, dictionary)

# The larger the bubble, the more prevalent is that topic.
# A good topic model will have fairly big, non-overlapping bubbles scattered throughout the chart instead of being clustered in one quadrant.
# A model with too many topics, will typically have many overlaps, small sized bubbles clustered in one region of the chart.

### Assign Topics to Docs

In [None]:
def argmax(ls):
  return max(ls, key = lambda item: item[1])


df_2017['topic'] = df_2017[feature].apply(lambda x : argmax(model_2017.get_document_topics(x))[0])
df_2017['topic_confidence'] = df_2017[feature].apply(lambda x : argmax(model_2017.get_document_topics(x))[1])

## Cluster Analysis

### get only documents with topic_confidence >= t = 0.85

In [None]:
df_anal = df_2017[df_2017['topic_confidence'] >= 0.85]

## Metric 1: Number of Documents

In [None]:
df_anal.topic.value_counts()

## Metric 2: Duration of Reporting

In [None]:
from datetime import datetime

report_duration = pd.DataFrame(columns=['topic','duration'])

for i in range(df_anal.topic.nunique()):
  df = df_anal[df_anal['topic'] == i]
  max_t = datetime.strptime(df.time.max(), "%Y-%m-%d %H:%M:%S").date()
  min_t = datetime.strptime(df.time.min(), "%Y-%m-%d %H:%M:%S").date()
  report_duration = report_duration.append(pd.DataFrame({'topic': i, 'duration': (max_t - min_t)}, index=[i]))
  
report_duration.sort_values('duration',ascending=False)

## Metric 3: Entity Relations

In [None]:
import spacy
nlp = spacy.load("en")

# Labels: ORG, LOC, PERSON
def get_entities(txt, label):
  #the NER model only takes in unicode so convert str to unicode
  doc = nlp(txt)
  
  #doc now contains all the entities identified
  NER_list = [((ent.text).encode("utf-8"),ent.label_) for ent in doc.ents]

  #this line is to remove duplicate entities
  NER_list = list(dict.fromkeys(NER_list))

  #converting list of tuples to list of lists for easier manipulation later
  NER_list_final = [list(entity) for entity in NER_list]

  #Filtering for the entity labels that we are interested in.
  entities = [entity[0] for entity in NER_list_final if entity[1] == label]
  
  return entities

df_anal['ORGS'] = df_anal['body_tokenised'].apply(lambda x: get_entities(' '.join(x), 'ORG'))
df_anal['LOC'] = df_anal['body_tokenised'].apply(lambda x: get_entities(' '.join(x), 'LOC'))
df_anal['PERSON'] = df_anal['body_tokenised'].apply(lambda x: get_entities(' '.join(x), 'PERSON'))

In [None]:
entity_counts = pd.DataFrame(columns=['topic','n_org','n_loc','n_person'])

for i in range(df_anal.topic.nunique()):
  df = df_anal[df_anal['topic'] == i]
  n_orgs = len(set([x for ls in df['ORGS'] for x in ls]))
  n_loc = len(set([x for ls in df['LOC'] for x in ls]))        
  n_person = len(set([x for ls in df['PERSON'] for x in ls]))
  entity_counts = entity_counts.append(pd.DataFrame({'topic': i,'n_org': n_orgs,'n_loc': n_loc,'n_person': n_person,'n_total': n_orgs + n_loc + n_person}, index=[i]))
  
entity_counts.sort_values('n_total',ascending=False)