In [17]:
# for text preprocessing
import re
import spacy

from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string

# import vectorizers
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# import numpy for matrix operation
import numpy as np

# import LDA from sklearn
from sklearn.decomposition import LatentDirichletAllocation

import connect_to_db as cn

In [18]:
# to suppress warnings
from warnings import filterwarnings
filterwarnings('ignore')

In [19]:
nlp = spacy.load('en_core_web_sm')

In [21]:
sql = 'select comment_key, body from comments c, nodes n where c.author = n.node_id and c.is_valid=1 and n.community_id_fastgreedy_is=0 and c.link_key = c.parent_key;'
result_df = cn.select_query_result_to_df(sql)
corpus = list(np.array(result_df['body'].astype(str).values.tolist()))

In [6]:
# the complete corpus as below:

corpus

["Instead of coming to a reasonable solution he's decided to use your relationship as a weapon for his decision. Its reasonable to ask that if you do smoke that its kept out of the house, not mentioned and done so safely.  Its not reasonable to threaten leaving you if you smoke weed. You got to consider whats the next thing he will use?",
 'My cat always comes to me when Iâ€™m sad. My dog cannot recognize my emotions nearly as much. Cats are great. Animals in general are a godsend',
 'Do you think itâ€™s worth mentioning to my psych?? or just ignoring it since it doesnâ€™t happen too often',
 'I just read your user lol, thank you',
 "Oh...I'm sorry. What are your ideas?",
 'great thing that you realized what was happening. my father ended up in psychosis and later with schizophrenia after he used weed daily. everyone that has psychosis in the family should be very careful.',
 'Iâ€™m trying  Iâ€™m just not sure how much longer I can take the pain.',
 'Dm me?',
 'Hey! I live in Washingto

In [7]:
# Apply Preprocessing on the Corpus

# stop loss words 
stop = set(stopwords.words('english'))

# punctuation, 구두점 제거.
exclude = set(string.punctuation) 

# lemmatization, 표제어 추출. (am, are, is -> be, ed, s 등 제거.)
lemma = WordNetLemmatizer() 

# One function for all the steps:
def clean(doc):
    
    # convert text into lower case + split into words
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    
    # remove any stop words present
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)  
    
    # remove punctuations + normalize the text
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())  
    return normalized

In [8]:
# clean data stored in a new list
clean_corpus = [clean(doc).split() for doc in corpus]

clean_corpus

[['instead',
  'coming',
  'reasonable',
  'solution',
  'he',
  'decided',
  'use',
  'relationship',
  'weapon',
  'decision',
  'reasonable',
  'ask',
  'smoke',
  'kept',
  'house',
  'mentioned',
  'done',
  'safely',
  'reasonable',
  'threaten',
  'leaving',
  'smoke',
  'weed',
  'got',
  'consider',
  'whats',
  'next',
  'thing',
  'use'],
 ['cat',
  'always',
  'come',
  'iâ€™m',
  'sad',
  'dog',
  'cannot',
  'recognize',
  'emotion',
  'nearly',
  'much',
  'cat',
  'great',
  'animal',
  'general',
  'godsend'],
 ['think',
  'itâ€™s',
  'worth',
  'mentioning',
  'psych',
  'ignoring',
  'since',
  'doesnâ€™t',
  'happen',
  'often'],
 ['read', 'user', 'lol', 'thank'],
 ['ohim', 'sorry', 'idea'],
 ['great',
  'thing',
  'realized',
  'happening',
  'father',
  'ended',
  'psychosis',
  'later',
  'schizophrenia',
  'used',
  'weed',
  'daily',
  'everyone',
  'psychosis',
  'family',
  'careful'],
 ['iâ€™m', 'trying', 'iâ€™m', 'sure', 'much', 'longer', 'take', 'pain'],
 

In [9]:
# Converting text into numerical representation
tf_idf_vectorizer = TfidfVectorizer(tokenizer=lambda doc: doc, lowercase=False)

# Converting text into numerical representation
cv_vectorizer = CountVectorizer(tokenizer=lambda doc: doc, lowercase=False)

In [10]:
# Array from TF-IDF Vectorizer 
tf_idf_arr = tf_idf_vectorizer.fit_transform(clean_corpus)

# Array from Count Vectorizer 
cv_arr = cv_vectorizer.fit_transform(clean_corpus)

In [11]:
# this is our converted text to numerical representation from the Tf-IDF vectorizer

tf_idf_arr

<688779x191183 sparse matrix of type '<class 'numpy.float64'>'
	with 16287336 stored elements in Compressed Sparse Row format>

In [12]:
# this is our converted text to numerical representation from the Count vectorizer
cv_arr

<688779x191183 sparse matrix of type '<class 'numpy.int64'>'
	with 16287336 stored elements in Compressed Sparse Row format>

In [13]:
# Creating vocabulary array which will represent all the corpus 
vocab_tf_idf = tf_idf_vectorizer.get_feature_names()

# get the vocb list
vocab_tf_idf

['\x08getting',
 '\x10calmly',
 '\x10i',
 '0',
 '00',
 '000',
 '0000',
 '00000',
 '00000000000000000000000000000000006',
 '000000000000000001',
 '0000000000000000057',
 '000000000000001',
 '00000000000001',
 '0000000000001',
 '00000000001',
 '000000001',
 '00000001',
 '0000001',
 '0000006',
 '000001',
 '0000017',
 '000005',
 '00001',
 '000014',
 '000014285714',
 '000015',
 '000035',
 '00004',
 '00005',
 '0001',
 '0002',
 '000225',
 '00025',
 '0003',
 '00048',
 '0006',
 '0008',
 '00082',
 '0009gal',
 '001',
 '0011',
 '00142',
 '0016',
 '00182548ish',
 '0019',
 '002',
 '0022',
 '0029',
 '003',
 '0030',
 '003125',
 '0035',
 '0037',
 '00375',
 '004',
 '005',
 '005x010x005ã—030x030',
 '006',
 '007',
 '0078202577577',
 '008',
 '009',
 '00923443200080',
 '0093',
 '00989127181037',
 '00s',
 '00â€™s',
 '01',
 '010',
 '0100',
 '01000',
 '0101',
 '0102',
 '0103',
 '011010',
 '0111',
 '0112',
 '012',
 '0125',
 '0125mg',
 '013',
 '0130',
 '014',
 '0142',
 '0145',
 '0145394000',
 '014833888',
 '015'

In [14]:
# Creating vocabulary array which will represent all the corpus 
vocab_cv = cv_vectorizer.get_feature_names()

# get the vocb list
vocab_cv

['\x08getting',
 '\x10calmly',
 '\x10i',
 '0',
 '00',
 '000',
 '0000',
 '00000',
 '00000000000000000000000000000000006',
 '000000000000000001',
 '0000000000000000057',
 '000000000000001',
 '00000000000001',
 '0000000000001',
 '00000000001',
 '000000001',
 '00000001',
 '0000001',
 '0000006',
 '000001',
 '0000017',
 '000005',
 '00001',
 '000014',
 '000014285714',
 '000015',
 '000035',
 '00004',
 '00005',
 '0001',
 '0002',
 '000225',
 '00025',
 '0003',
 '00048',
 '0006',
 '0008',
 '00082',
 '0009gal',
 '001',
 '0011',
 '00142',
 '0016',
 '00182548ish',
 '0019',
 '002',
 '0022',
 '0029',
 '003',
 '0030',
 '003125',
 '0035',
 '0037',
 '00375',
 '004',
 '005',
 '005x010x005ã—030x030',
 '006',
 '007',
 '0078202577577',
 '008',
 '009',
 '00923443200080',
 '0093',
 '00989127181037',
 '00s',
 '00â€™s',
 '01',
 '010',
 '0100',
 '01000',
 '0101',
 '0102',
 '0103',
 '011010',
 '0111',
 '0112',
 '012',
 '0125',
 '0125mg',
 '013',
 '0130',
 '014',
 '0142',
 '0145',
 '0145394000',
 '014833888',
 '015'

In [15]:
display(len(vocab_tf_idf))
display(len(vocab_cv))

191183

191183

In [16]:
# Implementation of LDA:
    
# Create object for the LDA class 
# Inside this class LDA: define the components:
lda_model = LatentDirichletAllocation(n_components = 10, max_iter = 20, random_state = 20)

# fit transform on model on our count_vectorizer : running this will return our topics 
X_topics = lda_model.fit_transform(tf_idf_arr)

# .components_ gives us our topic distribution 
topic_words = lda_model.components_

In [119]:
#  Define the number of Words that we want to print in every topic : n_top_words
n_top_words = 10

for i, topic_dist in enumerate(topic_words):
    
    # np.argsort to sorting an array or a list or the matrix acc to their values
    sorted_topic_dist = np.argsort(topic_dist)
    
    # Next, to view the actual words present in those indexes we can make the use of the vocab created earlier
    topic_words = np.array(vocab_tf_idf)[sorted_topic_dist]
    
    # so using the sorted_topic_indexes we ar extracting the words from the vocabulary
    # obtaining topics + words
    # this topic_words variable contains the Topics  as well as the respective words present in those Topics
    topic_words = topic_words[:-n_top_words:-1]
    print ("Topic", str(i+1), topic_words)

Topic 1 ['best' 'journey' 'wish' 'person' 'aircon' 'climate' 'lol' 'hot' 'whole']
Topic 2 ['im' 'glad' 'would' 'make' 'thats' 'come' 'happy' 'face' 'someone']
Topic 3 ['thanks' 'advice' 'yeah' 'feel' 'really' 'too' 'trying' 'still' 'feeling']
Topic 4 ['people' 'happy' 'it' 'friend' 'climb' 'today' 'something' 'know'
 'needed']
Topic 5 ['appreciate' 'thanks' 'hey' 'support' 'yea' 'well' 'thing' 'always'
 'better']
Topic 6 ['time' 'champ' 'sometimes' 'tell' 'take' 'thing' 'got' 'sun' 'happened']
Topic 7 ['step' 'huge' 'weed' 'luck' 'like' 'youits' 'mad' 'reflection' 'them']
Topic 8 ['itâ€™s' 'nice' 'thank' 'struggle' 'know' 'kid' 'story' 'hug' 'sending']
Topic 9 ['im' 'get' 'you' 'want' 'work' 'life' 'thing' 'lot' 'try']
Topic 10 ['people' 'real' 'mvp' 'ive' 'im' 'ðÿ˜‘' 'care' 'depressed' 'irl']


In [120]:
# To view what topics are assigned to the douments:

doc_topic = lda_model.transform(tf_idf_arr)  

# iterating over ever value till the end value
for n in range(doc_topic.shape[0]):
    
    # argmax() gives maximum index value
    topic_doc = doc_topic[n].argmax()
    
    # document is n+1  
    print ("Document", n+1, " -- Topic:" ,topic_doc+1)

Document 1  -- Topic: 8
Document 2  -- Topic: 9
Document 3  -- Topic: 9
Document 4  -- Topic: 8
Document 5  -- Topic: 2
Document 6  -- Topic: 10
Document 7  -- Topic: 2
Document 8  -- Topic: 4
Document 9  -- Topic: 5
Document 10  -- Topic: 4
Document 11  -- Topic: 10
Document 12  -- Topic: 6
Document 13  -- Topic: 2
Document 14  -- Topic: 3
Document 15  -- Topic: 9
Document 16  -- Topic: 2
Document 17  -- Topic: 3
Document 18  -- Topic: 9
Document 19  -- Topic: 6
Document 20  -- Topic: 9
Document 21  -- Topic: 8
Document 22  -- Topic: 5
Document 23  -- Topic: 9
Document 24  -- Topic: 3
Document 25  -- Topic: 7
Document 26  -- Topic: 4
Document 27  -- Topic: 5
Document 28  -- Topic: 2
Document 29  -- Topic: 7
Document 30  -- Topic: 4
Document 31  -- Topic: 9
Document 32  -- Topic: 10
Document 33  -- Topic: 7
Document 34  -- Topic: 4
Document 35  -- Topic: 7
Document 36  -- Topic: 4
Document 37  -- Topic: 9
Document 38  -- Topic: 6
Document 39  -- Topic: 4
Document 40  -- Topic: 6
Docume