In [1]:
import sys
import warnings

if not sys.warnoptions:
    warnings.simplefilter("ignore")

In [2]:
import pyLDAvis, pandas as pd, numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

  def _formatters_default(self):
  def _deferred_printers_default(self):
  def _singleton_printers_default(self):
  def _type_printers_default(self):


In [11]:
frames = []
for i in range(9):
    newf = pd.read_json("Data/2016/opcomments2016-0" + str(1+i), encoding="utf-8")
    frames.append(newf)
for i in range(3):
    newf = pd.read_json("Data/2016/opcomments2016-" + str(10+i), encoding="utf-8")
    frames.append(newf)

df = pd.concat(frames)
df = df.reset_index(drop=True)
df["date"]= pd.to_datetime(df['created_utc'],unit='s')

In [41]:
df.columns

Index(['author', 'author_flair_css_class', 'author_flair_text', 'body',
       'controversiality', 'created_utc', 'distinguished', 'edited', 'gilded',
       'id', 'link_id', 'parent_id', 'retrieved_on', 'score', 'stickied',
       'subreddit', 'subreddit_id', 'ups', 'date'],
      dtype='object')

In [19]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=2000, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(df["body"])

In [23]:
%%time
from empath import Empath
lexicon = Empath()
empath_feat = [lexicon.analyze(post) for post in df['body'].loc[:1000]]

CPU times: user 13.6 s, sys: 267 ms, total: 13.9 s
Wall time: 14.3 s


In [24]:
%%time
lda = LatentDirichletAllocation(n_components=10, random_state=0)
lda.fit(tfidf)



CPU times: user 4min 50s, sys: 1.44 s, total: 4min 52s
Wall time: 4min 58s


In [38]:
#'opioid' in 'opioidssssdfasdfas'
'opioidssssdfasdfas'.find('opioid')

0

In [34]:
opioid, fentanyl, fent, heroin, cocaine, codeine, morphine, methadone, opium, heroin, suboxone

{0: ['fentanyl',
  'fent',
  'heroin',
  'people',
  'drug',
  'drugs',
  'cut',
  'dealers',
  'potent',
  'like',
  'od',
  'cocaine',
  'gt',
  'just',
  'laced',
  'legal',
  'analogues',
  'don',
  'https',
  'dangerous'],
 1: ['percs',
  'codeine',
  'crazy',
  'piece',
  'like',
  'love',
  'nice',
  'pst',
  'reddit',
  'oxy',
  'song',
  'looks',
  'im',
  'fent',
  'future',
  'really',
  'watch',
  'glass',
  'mixing',
  'good'],
 2: ['oxy',
  'lol',
  'tho',
  'pressed',
  'xanax',
  'pills',
  '30mg',
  '10mg',
  'lean',
  'like',
  '30',
  'pill',
  'bars',
  'hydro',
  'fake',
  'dillies',
  'morphine',
  'just',
  'hydros',
  'codeine'],
 3: ['morphine',
  'methadone',
  'clinic',
  'poppy',
  'seeds',
  'tea',
  'test',
  'clinics',
  'juice',
  'nah',
  'opium',
  'heroin',
  'does',
  'seed',
  'fent',
  'drip',
  'suboxone',
  'fuck',
  'yeah',
  'positive'],
 4: ['pain',
  'morphine',
  'just',
  'like',
  'got',
  'time',
  'fentanyl',
  'hospital',
  'didn',
  'g

In [33]:
vocab = tfidf_vectorizer.get_feature_names()

n_top_words = 20
k = 2
topic_words = {}

for topic, comp in enumerate(lda.components_):
    # for the n-dimensional array "arr":
    # argsort() returns a ranked n-dimensional array of arr, call it "ranked_array"
    # which contains the indices that would sort arr in a descending fashion
    # for the ith element in ranked_array, ranked_array[i] represents the index of the
    # element in arr that should be at the ith index in ranked_array
    # ex. arr = [3,7,1,0,3,6]
    # np.argsort(arr) -> [3, 2, 0, 4, 5, 1]
    # word_idx contains the indices in "topic" of the top num_top_words most relevant
    # to a given topic ... it is sorted ascending to begin with and then reversed (desc. now)    
    word_idx = np.argsort(comp)[::-1][:n_top_words]

    # store the words most relevant to the topic
    topic_words[topic] = [vocab[i] for i in word_idx]

for topic, words in topic_words.items():
    print('Topic: %d' % topic)
    print('  %s' % ', '.join(words))


Topic: 0
  fentanyl, fent, heroin, people, drug, drugs, cut, dealers, potent, like, od, cocaine, gt, just, laced, legal, analogues, don, https, dangerous
Topic: 1
  percs, codeine, crazy, piece, like, love, nice, pst, reddit, oxy, song, looks, im, fent, future, really, watch, glass, mixing, good
Topic: 2
  oxy, lol, tho, pressed, xanax, pills, 30mg, 10mg, lean, like, 30, pill, bars, hydro, fake, dillies, morphine, just, hydros, codeine
Topic: 3
  morphine, methadone, clinic, poppy, seeds, tea, test, clinics, juice, nah, opium, heroin, does, seed, fent, drip, suboxone, fuck, yeah, positive
Topic: 4
  pain, morphine, just, like, got, time, fentanyl, hospital, didn, gt, surgery, don, doctor, did, ve, said, people, gave, really, know
Topic: 5
  just, like, methadone, oxy, don, ve, people, day, high, really, know, time, good, fent, feel, kratom, heroin, opiates, think, use
Topic: 6
  morphine, hydromorphone, opioid, heroin, schedule, gt, synthetic, kratom, opium, maintenance, opioids, canna

In [26]:
pyLDAvis.sklearn.prepare(lda, tfidf, tfidf_vectorizer)

In [28]:
%%time
lda = LatentDirichletAllocation(n_components=20, random_state=0)
lda.fit(tfidf)

CPU times: user 4min 24s, sys: 1.06 s, total: 4min 25s
Wall time: 4min 11s


In [29]:
pyLDAvis.sklearn.prepare(lda, tfidf, tfidf_vectorizer)