In [16]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 

import matplotlib.pyplot as plt
import seaborn as sns
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.colors as mcolors
from wordcloud import WordCloud

### Import

In [17]:
df = pd.read_pickle('c:/users/tyler/desktop/texts_docs.pkl')
df = df['tokenize_text']

In [18]:
vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words='english', min_df=3, max_df=0.8)
tfidf = vectorizer.fit_transform(df)
# pd.DataFrame(tfidf.toarray(), index=df, columns=vectorizer.get_feature_names())

## Topic Modelling

Latent Semantic Analysis (LSA), another name for Signular Value Decomposition (SVD)

### How Many Topics

In [19]:
group_number = 6

In [20]:
lsa = TruncatedSVD(group_number)
doc_topic = lsa.fit_transform(tfidf)
lsa.explained_variance_ratio_

array([0.0079274 , 0.00783858, 0.00680286, 0.00571895, 0.00605046,
       0.00580341])

In [21]:
sum(lsa.explained_variance_)

0.03720391952040664

In [22]:
# topic modeling
header = []
for i in range(0, group_number):
    header.append('topic'+str(i))

topic_word = pd.DataFrame(lsa.components_.round(3),
             index = header,
             columns = vectorizer.get_feature_names())

In [23]:
def display_topics(model, feature_names, no_top_words, topic_names=header):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic", ix)
        else:
            print("\nTopic:",topic_names[ix])
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
        

In [24]:
display_topics(lsa, vectorizer.get_feature_names(), 15)


Topic: topic0
lol, yeah, like, good, yes, ok, come, love, oh, haha, think, sound, time, know, work

Topic: topic1
yeah, good, like, haha, come, ok, oh, sound, yes, think, let, love, sound good, know, time

Topic: topic2
yes, good, ok, like, come, love, sound, haha, sound good, oh, time, hey, let, know, home

Topic: topic3
ok, good, come, like, sound, love, sound good, time, know, think, haha, let, home, thank, hey

Topic: topic4
ok, cool, ok cool, yeah, yes, haha ok, ok let, ok ok, ok thank, lol ok, ok pack, cool cool, ok head, oh ok, pack

Topic: topic5
come, home, come home, hey, like, work, wanna, want, wanna come, soon, tonight, home soon, want come, sorry, nice


In [70]:
Vt = pd.DataFrame(doc_topic.round(3), index = df, columns = header)
Vt[Vt[:]['topic5'] > 0.5][140:170]

Unnamed: 0_level_0,topic0,topic1,topic2,topic3,topic4,topic5
tokenize_text,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
come,0.052,0.05,0.071,0.249,-0.075,0.863
may come,0.052,0.05,0.071,0.249,-0.075,0.863
come,0.052,0.05,0.071,0.249,-0.075,0.863
come to un plaza,0.052,0.05,0.071,0.249,-0.075,0.863
come,0.052,0.05,0.071,0.249,-0.075,0.863
come home,0.05,0.044,0.064,0.205,-0.056,0.608
come,0.052,0.05,0.071,0.249,-0.075,0.863
come,0.052,0.05,0.071,0.249,-0.075,0.863
come late,0.037,0.034,0.048,0.16,-0.048,0.524
come home,0.05,0.044,0.064,0.205,-0.056,0.608


In [26]:
# vectorizer.stop_words_ # see the words removed with min_df and max_df

### Most popular group

In [27]:
#saving this in a moment
data = Vt.idxmax(axis=1)
data = data.reset_index()
data['category'] = data[0]
data.drop(labels=0, axis=1, inplace=True)

In [28]:
bars = data.groupby('category').count().sort_values(by='tokenize_text', ascending=False)
bars

Unnamed: 0_level_0,tokenize_text
category,Unnamed: 1_level_1
topic3,31796
topic0,7479
topic5,2919
topic1,2261
topic2,1824
topic4,1107


## Save

In [29]:
data.to_pickle('c:/users/tyler/desktop/LSA_topics.pkl')

### Most Common Words

In [30]:
# most_common = sorted(list(zip(vectorizer.get_feature_names(), vectorizer.idf_)), key = lambda t: t[1])
# for elem in most_common[0]:
#     print(most_common, '\n')