# Load Data

In [12]:
import json
import pandas
import gzip

data_path = '../data/koreaherald_1517_#.json.gz'

# Load each of the 8 gzipped JSON shards into its own frame, then concatenate
# once. DataFrame.append never extended a frame in place (it returned a new
# one) and no longer exists in pandas 2.x, so the shards are collected in a
# list and joined with pandas.concat instead.
shard_frames = []
for i in range(8):
  p = data_path.replace('#',str(i))
  with gzip.open(p,'rb') as f:
    data=json.load(f)
  shard_frames.append(pandas.DataFrame.from_dict(data))

# ignore_index=True gives the combined frame a fresh 0..n-1 row index.
df_data = pandas.concat(shard_frames, ignore_index=True)

# Rename the space-prefixed column names to their trimmed form.
column_fixes = {
  " author": "author",
  " time": "time",
  " description": "description",
  " body": "body",
  " section": "section",
}
df_data = df_data.rename(columns=column_fixes)

# Show the column dtypes as a quick sanity check of the loaded frame.
df_data.dtypes

title          object
author         object
time           object
description    object
body           object
section        object
dtype: object

# Pre-Processing

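Here we apply:
- tokenisation (splitting each article body into word tokens)
- lemmatisation (reducing each token to its dictionary form)
- normalisation (lowercasing, and dropping numeric and very short tokens)

(optional) Bigrams: frequently co-occurring word pairs are appended to each document as extra tokens.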

In [27]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer

# Module-level NLP helpers shared by tokenise_pipeline.
tokenizer = RegexpTokenizer(r'\w+')  # a token is a run of word characters
lemmatizer = WordNetLemmatizer()

def tokenise_pipeline(doc):
  """Convert one raw document string into a list of lemmatised tokens.

  Steps, in order:
    1. lowercase the text;
    2. split it with the module-level `tokenizer`;
    3. drop purely numeric tokens and tokens shorter than three characters;
    4. lemmatise the surviving tokens with the module-level `lemmatizer`.

  The length filter runs before lemmatisation, so a lemma may itself end up
  shorter than three characters.
  """
  # TODO: remove stopwords
  words = tokenizer.tokenize(doc.lower())
  return [
    lemmatizer.lemmatize(word)
    for word in words
    if not word.isnumeric() and len(word) > 2
  ]

In [24]:
# Tokenise every article body; each row of the new column holds a token list.
body_text = df_data['body']
df_data['body_tokenised'] = body_text.apply(tokenise_pipeline)

In [25]:
from gensim.models import Phrases

# Fit the phrase model on all tokenised documents.
# min_count=20: only phrases that appear >= 20 times in the corpus are kept.
bigrams = Phrases(list(df_data['body_tokenised']), min_count=20)

def add_bigrams(doc):
  """Return the document's tokens followed by the phrases detected in it.

  `doc` is a list of tokens. It is run through the module-level `bigrams`
  phrase model, and every resulting token that contains an underscore is
  added after the original tokens.

  A new list is returned and `doc` itself is left unmodified, so calling this
  function does not alter token lists held elsewhere (e.g. in a DataFrame
  column) as a side effect.
  """
  # Any token containing '_' counts as a phrase here, which includes tokens
  # that already had an underscore before the phrase model was applied.
  phrase_tokens = [token for token in bigrams[doc] if '_' in token]
  return list(doc) + phrase_tokens

# Extend each document's tokens with its detected bigrams.
# NOTE: this reads and overwrites the same column, so running the cell again
# appends the bigrams a second time.
tokenised_docs = df_data['body_tokenised']
df_data['body_tokenised'] = tokenised_docs.apply(add_bigrams)