# Topic extraction

In [1]:
import psycopg2
from time import time
import nltk
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# Upper bound on the number of posts pulled from the database (used as the SQL LIMIT).
n_samples = 200_000
# Cap on the vocabulary size kept by each CountVectorizer (passed as max_features).
n_features = 500

In [2]:
# Load the most recent active posts (title + body joined into one string per post).
print("Connecting to DB...")
# SECURITY: no password in the notebook. libpq resolves it from the PGPASSWORD
# environment variable or from ~/.pgpass, so set one of those before running this cell.
conn = psycopg2.connect(dbname='peanut_prod', user='mezis', host='frolic.local')
cur = conn.cursor()

print("Loading dataset...")
t0 = time()
# LIMIT is bound as a query parameter instead of being spliced into the SQL text.
# (With parameters in use, any literal '%' added to this query must be written '%%'.)
cur.execute(
    """
    SELECT CONCAT_WS(' ', title, body) AS text
    FROM post p
    JOIN users u ON u.id = p.author_id
    --JOIN post_stats ps ON ps.id = p.id AND ps.view_count > 50
    WHERE TRUE
      AND u.account_status = 'active'
      AND p.status = 'active'
    ORDER BY p.id DESC
    LIMIT %s
    """,
    (n_samples,),
)
# Each row is a 1-tuple holding the concatenated text.
data_samples = [row[0] for row in cur.fetchall()]

# NOTE(review): `conn` and `cur` are intentionally left open as notebook globals in
# case later cells use them; call cur.close() / conn.close() once they are not needed.
print("done in %0.3fs." % (time() - t0))
print("%d samples" % len(data_samples))

Connecting to DB...
Loading dataset...
done in 0.989s.
200000 samples


In [3]:
print("Preparing text preprocessing...")
import re

t0 = time()
# NLTK resources needed below: tokenizer model, WordNet (lemmatizer),
# English stopword list, and the POS tagger used in the sanity check.
for _nltk_resource in ('punkt', 'wordnet', 'stopwords', 'averaged_perceptron_tagger'):
    nltk.download(_nltk_resource)

class MyPreprocessor:
    """Callable document preprocessor that simply lowercases its input.

    Note: the vectorizer cells visible in this notebook do not pass it as
    `preprocessor=`; MyTokenizer lowercases on its own.
    """

    def __call__(self, doc):
        """Return `doc` converted to lowercase."""
        lowered = doc.lower()
        return lowered

class MyTokenizer:
    """Callable tokenizer for CountVectorizer.

    Pipeline per document: lowercase -> NLTK word tokenization -> drop stopwords
    and tokens that do not start with a lowercase letter or digit -> WordNet
    lemmatization. Calling the instance returns the list of surviving lemmas.
    """

    def __init__(self):
        """Build the lemmatizer, the stopword set and the token-shape regex."""
        self.wnl = nltk.stem.WordNetLemmatizer()
        # Extra stopwords on top of NLTK's English list ("n't" is what
        # word_tokenize splits off contractions such as "isn't").
        extra_stopwords = """
                n't
                anyone
                everyone
                feel like want get
            """.split()
        self.stopwords = frozenset(nltk.corpus.stopwords.words('english') + extra_stopwords)
        # Used with .match(), so only the first character of a token is tested.
        self.re = re.compile(r"[a-z0-9]")

    def __call__(self, doc):
        """Return the filtered, lemmatized tokens of `doc` in document order."""
        kept = []
        for raw_token in self.tokenize(doc):
            lemma = self.filter(raw_token)
            # filter() yields None for rejected tokens; falsy lemmas are skipped too.
            if lemma:
                kept.append(lemma)
        return kept

    def tokenize(self, doc):
        """Lowercase `doc` and split it into tokens with nltk.word_tokenize."""
        lowered = doc.lower()
        return nltk.word_tokenize(lowered)

    def filter(self, token):
        """Return the lemma of `token`, or None if it is a stopword or has the wrong shape."""
        if token in self.stopwords or not self.re.match(token):
            return None
        return self.wnl.lemmatize(token)
        

print("done in %0.3fs." % (time() - t0))

# Sanity check: run the tokenizer on a sentence with a hyphenated word and contractions.
test_string = "My mother-in-law isn't, really, as lovely as I'd like her to be."
_demo_tokens = MyTokenizer()(test_string)
print(_demo_tokens)

# Quick look at the POS tags NLTK assigns to a short phrase.
_demo_tags = nltk.pos_tag(nltk.word_tokenize('let me know'))
print(_demo_tags)
      
# for doc in data_samples[-3:]:
#     print(MyTokenizer()(doc))


Preparing text preprocessing...
done in 0.146s.


[nltk_data] Downloading package punkt to /home/mezis/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/mezis/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/mezis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/mezis/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


['mother-in-law', 'really', 'lovely']
[('let', 'VB'), ('me', 'PRP'), ('know', 'VB')]


In [4]:
print("Extracting 2-grams...")
gram2 = CountVectorizer(max_df=0.90, min_df=0.001,
                                max_features=n_features,
                                stop_words=None,
                                ngram_range=(2,2),
                                tokenizer=MyTokenizer())
t0 = time()
%time tf = gram2.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

Extracting 2-grams...
CPU times: user 1min 40s, sys: 272 ms, total: 1min 40s
Wall time: 1min 40s
done in 100.775s.


In [5]:
# Corpus-wide occurrence count of every 2-gram, most frequent first.
# NOTE: `tf` is rebound by the 3-gram extraction cell, so run this cell right
# after the 2-gram extraction; the size check below catches a stale `tf`.
counts = np.sum(tf, axis=0)  # column sums: one total per vocabulary term

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when present.
if hasattr(gram2, "get_feature_names_out"):
    terms = gram2.get_feature_names_out()
else:
    terms = gram2.get_feature_names()

flat_counts = np.asarray(counts).ravel()
if flat_counts.size != len(terms):
    raise ValueError(
        "tf has %d columns but gram2 has %d terms; re-run the 2-gram extraction cell"
        % (flat_counts.size, len(terms))
    )

# Iterate over the actual vocabulary rather than range(n_features): max_features is
# only an upper bound, and a smaller vocabulary used to raise IndexError here.
terms_counts = sorted(
    ((str(term), int(count)) for term, count in zip(terms, flat_counts)),
    key=lambda pair: pair[1],
    reverse=True,  # stable: ties keep vocabulary order, as with the original sort
)
terms_counts

[('month old', 13725),
 ('year old', 6994),
 ('little one', 5601),
 ('week pregnant', 5135),
 ('first time', 4587),
 ('6 month', 3957),
 ('week old', 3766),
 ('4 month', 3251),
 ('3 month', 3071),
 ('wan na', 2899),
 ('go back', 2715),
 ('would love', 2646),
 ('last night', 2597),
 ('baby girl', 2534),
 ('5 month', 2437),
 ('2 week', 2417),
 ('let know', 2393),
 ('6 week', 2377),
 ('stay home', 2328),
 ('hi lady', 2316),
 ('little girl', 2231),
 ('2 year', 2113),
 ('back work', 2080),
 ('2 month', 2057),
 ('little boy', 1983),
 ('gon na', 1954),
 ('7 month', 1951),
 ('week ago', 1932),
 ('8 month', 1927),
 ('9 month', 1865),
 ('play date', 1851),
 ('last week', 1844),
 ('12 week', 1820),
 ('hey lady', 1801),
 ('thank much', 1801),
 ('need help', 1799),
 ('3 day', 1742),
 ('please help', 1735),
 ('baby boy', 1725),
 ('every time', 1718),
 ('2 day', 1712),
 ('old baby', 1697),
 ('3 week', 1639),
 ('every day', 1628),
 ('even though', 1613),
 ('c section', 1612),
 ('8 week', 1559),
 ('nex

In [6]:
print("Extracting 3-grams...")
gram3 = CountVectorizer(max_df=0.90, min_df=0.001,
                                max_features=n_features,
                                stop_words=None,
                                ngram_range=(3,3),
                                tokenizer=MyTokenizer())
t0 = time()
%time tf = gram3.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

Extracting 3-grams...
CPU times: user 1min 42s, sys: 360 ms, total: 1min 42s
Wall time: 1min 42s
done in 102.369s.


In [10]:
# Corpus-wide occurrence count of every 3-gram, most frequent first.
counts = np.sum(tf, axis=0)  # column sums: one total per vocabulary term

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when present.
if hasattr(gram3, "get_feature_names_out"):
    terms = gram3.get_feature_names_out()
else:
    terms = gram3.get_feature_names()

# The vocabulary can be much smaller than n_features (min_df prunes rare 3-grams),
# so report the real sizes.
print(len(terms))
print(counts.shape)

flat_counts = np.asarray(counts).ravel()
if flat_counts.size != len(terms):
    # `tf` is shared with the 2-gram cells; a mismatch means it came from another run.
    raise ValueError(
        "tf has %d columns but gram3 has %d terms; re-run the 3-gram extraction cell"
        % (flat_counts.size, len(terms))
    )

terms_counts = sorted(
    ((str(term), int(count)) for term, count in zip(terms, flat_counts)),
    key=lambda pair: pair[1],
    reverse=True,  # stable: ties keep vocabulary order, as with the original sort
)
terms_counts

72
(1, 72)


[('6 month old', 1420),
 ('4 month old', 1327),
 ('3 month old', 1245),
 ('2 year old', 1205),
 ('5 month old', 1119),
 ('1 year old', 1104),
 ('8 month old', 974),
 ('7 month old', 935),
 ('stay home mom', 931),
 ('9 month old', 918),
 ('month old baby', 893),
 ('first time mom', 889),
 ('go back work', 795),
 ('3 year old', 779),
 ('2 month old', 769),
 ('10 month old', 716),
 ('first time mum', 677),
 ('please let know', 661),
 ('going back work', 539),
 ('one year old', 516),
 ('12 week scan', 495),
 ('4 year old', 487),
 ('20 week scan', 480),
 ('11 month old', 466),
 ('week old baby', 441),
 ('month old son', 437),
 ('month old daughter', 433),
 ('3 week old', 430),
 ('6 week old', 429),
 ('happy mother day', 423),
 ('work full time', 422),
 ('year old son', 416),
 ('year old daughter', 410),
 ('go back sleep', 368),
 ('would greatly appreciated', 359),
 ('long story short', 349),
 ('7 week old', 342),
 ('2 week ago', 328),
 ('month old sleep', 328),
 ('two year old', 321),
 ('wo