# Dataset preparation


- Extract from database

- Accent removal [TODO]
- Lowercase
- Simple tokenization (word spliting)
- Lemmatization (before phrasing, to enhance phrasing)
- Bigram/trigram phrase replacement
- Remove too-short documents (low 10%)
- Stop word removal (after phrasing, to include phrases with stopwords) 
- Corpus frequency filtering (remove terms present in >50% documents or only present in <0.1% documents)

- Generate loadable datasets for `sklearn` and `tomotopy`

In [1]:
%load_ext autotime

import psycopg2
import gensim
import sklearn
import nltk
import re
import tomotopy as tp
import numpy as np
import multiprocessing
from tqdm import tqdm
from time import time

DATASET_NAME = '200k-lemma-nophrase'
N_SAMPLES = 200e3
DO_LEMMATIZE = True
DO_PHRASING = False
DO_PHRASE_INCLUDE_SPLIT = True
TEST_RATIO = 0.2

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /home/mezis/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/mezis/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/mezis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/mezis/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
print("Connecting to DB...")
conn = psycopg2.connect("dbname='peanut_prod' user='mezis'")
cur = conn.cursor()

print("Loading dataset...")
cur.execute(f"""
    SELECT CONCAT_WS(' ', title, body) AS text
    FROM post p
    JOIN users u ON u.id = p.author_id
    WHERE TRUE
      AND u.account_status = 'active'
      AND p.status = 'active'
    ORDER BY p.id DESC
    LIMIT {int(N_SAMPLES)}
""")
documents_raw = [row[0] for row in cur.fetchall()]

print("%d samples" % len(documents_raw))

Connecting to DB...
Loading dataset...
200000 samples
time: 787 ms


In [3]:
STOPWORDS = nltk.corpus.stopwords.words('english') + [
    'na' # from tokenizing 'wanna', 'gonna'
]

time: 1.88 ms


In [4]:
class MyTokenizer:
    def __init__(self):
        self.wnl = nltk.stem.WordNetLemmatizer()
        self.re = re.compile(r"[a-z0-9']")
        self.map = {
            "'m":     "am",
            "n't":    "not",
            "'d":     "would",
            "'ll":    "will",
            "'ve":    "have"
        }
        
    def __call__(self, doc):
        tokens = []
        for token in nltk.word_tokenize(doc.lower()):
            token = self.map.get(token, token)
            if DO_LEMMATIZE:
                token = self.wnl.lemmatize(token)
            if not self.re.match(token): continue
            tokens.append(token)
        return tokens

##################################

documents_preprocessed = []
tokenizer = MyTokenizer()
with multiprocessing.Pool(processes=16) as pool:
    for tokenized in pool.map(tokenizer, tqdm(documents_raw)):
        documents_preprocessed.append(tokenized)

100%|██████████| 200000/200000 [00:08<00:00, 22974.04it/s]


time: 11.2 s


In [5]:
# lengths = np.array([len(doc) for doc in documents_preprocessed])
# np.percentile(lengths, 10)
# => 8.0

documents_filtered = []
for doc in tqdm(documents_preprocessed):
    if len(doc) < 8: continue
    documents_filtered.append(doc)

100%|██████████| 200000/200000 [00:00<00:00, 2264327.28it/s]

time: 90.5 ms





In [6]:
phrases = gensim.models.phrases.Phrases(
    sentences=tqdm(documents_filtered),
    min_count=200, # only bigrams with this corpus frequency
    threshold=10,  # opaque scoring threshold
    common_terms=STOPWORDS,
    scoring='default'
)

100%|██████████| 182080/182080 [00:12<00:00, 14408.02it/s]

time: 12.6 s





In [7]:
x = list(phrases.export_phrases(tqdm(documents_filtered)))
print('Top bigrams')
list(enumerate(sorted(set(x), key=lambda t: -t[1])))[:100]

100%|██████████| 182080/182080 [00:20<00:00, 8984.70it/s]

Top bigrams





[(0, (b'braxton hick', 3358.4204775272415)),
 (1, (b'moses basket', 2086.01979639733)),
 (2, (b'gestational diabetes', 1532.6169853648114)),
 (3, (b'universal credit', 1382.1513037461982)),
 (4, (b'mucus plug', 1199.4010637552137)),
 (5, (b'stretch mark', 1065.1318046748154)),
 (6, (b'c section', 991.7321090774194)),
 (7, (b'sippy cup', 962.5809561152163)),
 (8, (b'cradle cap', 865.609460458241)),
 (9, (b'social distancing', 814.8740641785644)),
 (10, (b'social medium', 687.6258867840921)),
 (11, (b'potty training', 673.3941719474727)),
 (12, (b'greatly appreciated', 617.7489274913847)),
 (13, (b'mental health', 531.368829573375)),
 (14, (b'growth spurt', 516.793169257245)),
 (15, (b'tommee tippee', 515.0080911841272)),
 (16, (b'car seat', 513.0795906653941)),
 (17, (b'gender reveal', 451.201849235906)),
 (18, (b'fall asleep', 410.78082391087463)),
 (19, (b'health visitor', 409.9537516447388)),
 (20, (b'clear blue', 350.2920426891441)),
 (21, (b'fertile window', 340.905770690175)),
 (2

time: 20.3 s


In [8]:
if DO_PHRASING:
    import re
    re_phrase = re.compile('_')
    documents_phrased = []
    for doc in tqdm(documents_filtered):
        tokens = []
        for tok in phrases[doc]:
            tokens.append(tok)
            if DO_PHRASE_INCLUDE_SPLIT and tok.find('_') >= 0:
                tokens.extend(tok.split('_'))
        documents_phrased.append(tokens)
else:
    documents_phrased = documents_filtered

time: 955 µs


In [9]:
# remove stopwords
documents_destopped = []
for doc in tqdm(documents_phrased):
    doc = [tok for tok in doc if not tok in STOPWORDS]
    documents_destopped.append(doc)

100%|██████████| 182080/182080 [00:15<00:00, 11934.28it/s]

time: 15.3 s





In [10]:
# filter terms by CF (>0.1%, <50%)

import collections
terms = collections.defaultdict(lambda: 0)
for doc in tqdm(documents_destopped):
    for token in set(doc):
        terms[token] += 1

print('%d terms' % len(terms))

thr_min = len(documents_destopped) * 0.001
thr_max = len(documents_destopped) * 0.50

terms_low  = [tok for tok, freq in terms.items() if (freq < thr_min)]
terms_high = [tok for tok, freq in terms.items() if (freq > thr_max)]

print('Removing %d low-frequency terms' % len(terms_low))
print('Removing %d high-frequency terms:' % len(terms_high))

vocabulary = dict([(tok,freq / len(documents_destopped)) for tok, freq in terms.items() if (thr_min < freq < thr_max)])
print('%d terms in vocabulary' % len(vocabulary))

100%|██████████| 182080/182080 [00:01<00:00, 143022.97it/s]


87660 terms
Removing 85057 low-frequency terms
Removing 0 high-frequency terms:
2603 terms in vocabulary
time: 1.32 s


In [11]:
# save dataset (pickle format, good for SkLearn)

import pickle

cutoff = int(len(documents_destopped) * (1-TEST_RATIO))

pickle.dump({
    'train':      documents_destopped[:cutoff],
    'test':       documents_destopped[cutoff:],
    'vocabulary': vocabulary,
}, open(f'dataset.{DATASET_NAME}.pkl', 'wb'))

time: 1.5 s


In [12]:
# top terms
sorted(terms.items(), key=lambda t: -t[1])[:20]

[('anyone', 48391),
 ('baby', 45502),
 ('like', 37060),
 ('ha', 36683),
 ('get', 36406),
 ('wa', 35714),
 ('week', 35463),
 ('know', 33799),
 ('month', 32421),
 ('day', 31349),
 ('time', 30948),
 ('one', 26871),
 ('want', 25867),
 ('old', 25341),
 ('feel', 25146),
 ('would', 24194),
 ('go', 22281),
 ('help', 21307),
 ('really', 21061),
 ('need', 20699)]

time: 582 ms
