# Dataset preparation


- Extract from database

- Accent removal [TODO]
- Lowercase
- Simple tokenization (word spliting)
- Lemmatization (before phrasing, to enhance phrasing)
- Bigram/trigram phrase replacement
- Remove too-short documents (low 10%)
- Stop word removal (after phrasing, to include phrases with stopwords) 
- Corpus frequency filtering (remove terms present in >50% documents or only present in <0.1% documents)

- Generate loadable datasets for `sklearn` and `tomotopy`

In [1]:
%load_ext autotime

import psycopg2
import gensim
import sklearn
import nltk
import re
import tomotopy as tp
import numpy as np
import multiprocessing
from tqdm import tqdm
from time import time

DATASET_NAME = '400k-lemma-nophrase-v2'
N_SAMPLES = 400e3
DO_LEMMATIZE = True
DO_PHRASING = False
DO_PHRASE_INCLUDE_SPLIT = True
TEST_RATIO = 0.2

# DATASET_NAME = '200k-lemma-nophrase-v2'
# N_SAMPLES = 200e3
# DO_LEMMATIZE = True
# DO_PHRASING = False
# DO_PHRASE_INCLUDE_SPLIT = True
# TEST_RATIO = 0.2

# DATASET_NAME = '20k-lemma-nophrase-v2'
# N_SAMPLES = 20e3
# DO_LEMMATIZE = True
# DO_PHRASING = False
# DO_PHRASE_INCLUDE_SPLIT = True
# TEST_RATIO = 0.2

# DATASET_NAME = 'small'
# N_SAMPLES = 5e3
# DO_LEMMATIZE = True
# DO_PHRASING = False
# DO_PHRASE_INCLUDE_SPLIT = True
# TEST_RATIO = 0.2

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /home/mezis/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/mezis/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/mezis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/mezis/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
import collections
Doc = collections.namedtuple('Doc', ['id', 'raw', 'tokens'], defaults=(None,))

time: 937 µs


In [3]:
print("Connecting to DB...")
conn = psycopg2.connect("dbname='peanut_prod' user='mezis'")
cur = conn.cursor()

print("Loading dataset...")
cur.execute(f"""
    SELECT p.id, CONCAT_WS(' ', title, body) AS text
    FROM post p
    JOIN users u ON u.id = p.author_id
    WHERE TRUE
      AND u.account_status = 'active'
      AND p.status = 'active'
    ORDER BY p.id DESC
    LIMIT {int(N_SAMPLES)}
""")
documents_raw = [Doc(id=row[0], raw=row[1]) for row in cur.fetchall()]

print("%d samples" % len(documents_raw))
documents = documents_raw

Connecting to DB...
Loading dataset...
400000 samples
time: 1.57 s


In [4]:
STOPWORDS = nltk.corpus.stopwords.words('english') + [
    'na' # from tokenizing 'wanna', 'gonna'
]

time: 2.13 ms


In [5]:
class MyTokenizer:
    def __init__(self):
        self.wnl = nltk.stem.WordNetLemmatizer()
        self.re = re.compile(r"[a-z0-9']")
        self.map = {
            "'m":     "am",
            "n't":    "not",
            "'d":     "would",
            "'ll":    "will",
            "'ve":    "have"
        }
        
    def __call__(self, doc):
        tokens = []
        for token in nltk.word_tokenize(doc.raw.lower()):
            token = self.map.get(token, token)
            # FIXME: Lemmatize using spaCy, only for adv/adv/noun POS?
            if DO_LEMMATIZE:
                token = self.wnl.lemmatize(token)
            if not self.re.match(token): continue
            tokens.append(token)
        return Doc(raw=doc.raw, id=doc.id, tokens=tokens)

##################################

documents_preprocessed = []
tokenizer = MyTokenizer()
with multiprocessing.Pool(processes=16) as pool:
    for doc in pool.map(tokenizer, tqdm(documents)):
        documents_preprocessed.append(doc)
documents = documents_preprocessed

100%|██████████| 400000/400000 [00:19<00:00, 20915.59it/s]


time: 24.3 s


In [6]:
phrases = gensim.models.phrases.Phrases(
    sentences=map(lambda d: d.tokens, tqdm(documents)),
    min_count=200, # only bigrams with this corpus frequency
    threshold=10,  # opaque scoring threshold
    common_terms=STOPWORDS,
    scoring='default'
)

x = list(phrases.export_phrases(map(lambda d: d.tokens, tqdm(documents))))
print('Top bigrams')
print(list(enumerate(sorted(set(x), key=lambda t: -t[1])))[:10])

if DO_PHRASING:
    import re
    re_phrase = re.compile('_')
    documents_phrased = []
    for doc in tqdm(documents):
        tokens = []
        for tok in phrases[doc.tokens]:
            tokens.append(tok)
            if DO_PHRASE_INCLUDE_SPLIT and tok.find('_') >= 0:
                tokens.extend(tok.split('_'))
        documents_phrased.append(Doc(raw=doc.raw, id=doc.id, tokens=tokens))
else:
    documents_phrased = documents

documents = documents_phrased

100%|██████████| 400000/400000 [00:26<00:00, 15286.33it/s]
100%|██████████| 400000/400000 [00:44<00:00, 8970.17it/s] 


Top bigrams
[(0, (b'braxton hick', 4898.825918457243)), (1, (b'tommee tippee', 4611.2339690879935)), (2, (b'cradle cap', 4522.206263802449)), (3, (b'moses basket', 3054.1706013986013)), (4, (b'gestational diabetes', 3033.9288217783755)), (5, (b'san diego', 2201.790349936201)), (6, (b'growth spurt', 2152.6255924916936)), (7, (b'universal credit', 2081.0362438220754)), (8, (b'mucus plug', 1862.695465378392)), (9, (b'pro and con', 1693.8517189758413))]
time: 1min 10s


In [7]:
# remove stopwords
documents_destopped = []
sw = set(STOPWORDS)
for doc in tqdm(documents):
    tokens = [tok for tok in doc.tokens if not tok in sw]
    documents_destopped.append(Doc(tokens=tokens, raw=doc.raw, id=doc.id))

documents = documents_destopped

100%|██████████| 400000/400000 [00:05<00:00, 68282.95it/s] 

time: 5.86 s





In [8]:
# filter terms by CF (>0.1%, <50%)

import collections
terms = collections.defaultdict(lambda: 0)
for doc in tqdm(documents):
    for token in set(doc.tokens):
        terms[token] += 1

print('%d terms' % len(terms))

thr_min = len(documents) * 0.001
thr_max = len(documents) * 0.50

terms_low  = [tok for tok, freq in terms.items() if (freq < thr_min)]
terms_high = [tok for tok, freq in terms.items() if (freq > thr_max)]

print('Removing %d low-frequency terms' % len(terms_low))
print('Removing %d high-frequency terms:' % len(terms_high))

vocabulary = dict([(tok,freq / len(documents)) for tok, freq in terms.items() if (thr_min < freq < thr_max)])
print('%d terms in vocabulary' % len(vocabulary))

documents_filtered_cf = []
vocab_set = set(vocabulary.keys())
for doc in documents:
    tokens = [term for term in doc.tokens if term in vocab_set]
    documents_filtered_cf.append(Doc(tokens=tokens, id=doc.id, raw=doc.raw))

documents = documents_filtered_cf

100%|██████████| 400000/400000 [00:02<00:00, 144249.94it/s]


142233 terms
Removing 139798 low-frequency terms
Removing 0 high-frequency terms:
2430 terms in vocabulary
time: 6.68 s


In [9]:
# lengths = np.array([len(doc) for doc in documents])
# np.percentile(lengths, 10)
# => 4.0

documents_filtered = []
for doc in tqdm(documents):
    if len(doc.tokens) < 4: continue
    documents_filtered.append(doc)
print(f'Retaining {len(documents_filtered)} documents')
documents = documents_filtered


100%|██████████| 400000/400000 [00:00<00:00, 1667867.84it/s]

Retaining 366823 documents
time: 244 ms





In [10]:
# save dataset

import pickle

cutoff = int(len(documents) * (1-TEST_RATIO))
to_tokens = lambda d: d.tokens
to_ids    = lambda d: d.id
to_raw    = lambda d: d.raw

pickle.dump({
    'raw': {
      'train': list(map(to_raw, documents[:cutoff])),
      'test':  list(map(to_raw, documents[cutoff:])),
    },
    'ids': {
      'train': list(map(to_ids, documents[:cutoff])),
      'test':  list(map(to_ids, documents[cutoff:])),
    },
    'tokenised': {
      'train': list(map(to_tokens, documents[:cutoff])),
      'test':  list(map(to_tokens, documents[cutoff:])),
    },
    'vocabulary': vocabulary,
}, open(f'dataset.{DATASET_NAME}.pkl', 'wb'))

time: 4.09 s
