In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
np.random.seed(2018)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
from os import listdir
print(listdir("stackexchange/input"))

# read csv (comma separated value) into data
data = pd.read_csv('stackexchange/input/Posts_a.csv')


['Badges.csv', 'CloseAsOffTopicReasonTypes.csv', 'CloseReasonTypes.csv', 'Comments.csv', 'FlagTypes.csv', 'PendingFlags.csv', 'PostFeedback.csv', 'PostHistory.csv', 'PostHistoryTypes.csv', 'PostLinks.csv', 'PostNotices.csv', 'PostNoticeTypes.csv', 'PostsWithDeleted.csv', 'Posts_a.csv', 'Posts_b.csv', 'Posts_c.csv', 'PostTags.csv', 'PostTypes.csv', 'queries001.txt', 'ReviewRejectionReasons.csv', 'ReviewTaskResults.csv', 'ReviewTaskResultTypes.csv', 'ReviewTasks.csv', 'ReviewTaskStates.csv', 'ReviewTaskTypes.csv', 'SuggestedEdits.csv', 'SuggestedEditVotes.csv', 'Tags.csv', 'TagSynonyms.csv', 'Users_a.csv', 'Users_b.csv', 'Votes.csv', 'VoteTypes.csv']


  interactivity=interactivity, compiler=compiler, result=result)


In [2]:
data.head()
data.info()
type(data['Title'])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34291 entries, 0 to 34290
Data columns (total 22 columns):
Id                       34291 non-null int64
PostTypeId               34291 non-null int64
AcceptedAnswerId         4925 non-null float64
ParentId                 25503 non-null float64
CreationDate             34291 non-null object
DeletionDate             0 non-null float64
Score                    34291 non-null int64
ViewCount                7837 non-null float64
Body                     34125 non-null object
OwnerUserId              33348 non-null float64
OwnerDisplayName         1824 non-null object
LastEditorUserId         13006 non-null float64
LastEditorDisplayName    38 non-null object
LastEditDate             13037 non-null object
LastActivityDate         34291 non-null object
Title                    7837 non-null object
Tags                     7837 non-null object
AnswerCount              7837 non-null float64
CommentCount             34291 non-null int64
Favorite

pandas.core.series.Series

# TASK: Topic modeling with LDA
Topic modeling is a type of statistical modeling for discovering the abstract “topics” that occur in a collection of documents. Latent Dirichlet Allocation (LDA) is an example of topic model and is used to classify text in a document to a particular topic. It builds a topic per document model and words per topic model, modeled as Dirichlet distributions.

## Source:
https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24

In [13]:
# get only the titles of the questions of the datasets 
data_title = data[["Title"]]
data_title = data_title.dropna() 
data_title['index'] = data_title.index
documents = data_title
documents.head()
#type(documents['Title'])


Unnamed: 0,Title,index
0,Are radial contextual menus better than vertic...,0
1,What is an acceptable response time for my aja...,1
5,What is the important aspect to consider when ...,5
7,"What can be done to make a long, multi-step wi...",7
9,What screen vertical resolution should I cater...,9


## Data Preprocessing
Tokenization: Split the text into sentences and the sentences into words. Lowercase the words and remove punctuation.

Words that have fewer than 3 characters are removed.

All stopwords are removed.

Words are lemmatized — words in third person are changed to first person and verbs in past and future tenses are changed into present.

Words are stemmed — words are reduced to their root form.

In [14]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *

import nltk
nltk.download('wordnet')


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Johannes\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [15]:
# lematize and stem functions
stemmer = SnowballStemmer('english')
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

#preprocess
processed_docs = documents["Title"].map(preprocess)
processed_docs[:5]

0    [radial, contextu, menus, better, vertic, list...
1                        [accept, respons, time, ajax]
5    [import, aspect, consid, decid, window, intera...
7            [long, multi, step, wizard, user, friend]
9     [screen, vertic, resolut, cater, phone, browser]
Name: Title, dtype: object

In [16]:
# Create a dictionary from ‘processed_docs’ containing the number of times a word appears in the training set.
dictionary = gensim.corpora.Dictionary(processed_docs)

# Gensim filter_extremes
# Filter out tokens that appear in less than 15 documents (absolute number) or
# more than 0.5 documents (fraction of total corpus size, not absolute number). (TODO: why?)
# after the above two steps, keep only the first 100000 most frequent tokens.
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

#For each document we create a dictionary reporting how many words and how many times those words appear.
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[:5]

[[(0, 1), (1, 1), (2, 2), (3, 1)],
 [(4, 1), (5, 1), (6, 1), (7, 1)],
 [(8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1)],
 [(15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1)],
 [(3, 1), (21, 1), (22, 1), (23, 1), (24, 1)]]

In [23]:
# Preview Bag Of Words 
bow_doc_sample = bow_corpus[0]
for i in range(len(bow_doc_sample)):
    print("Word #{} {} appears {} times".format(bow_doc_sample[i][0],
                                                processed_docs[0][i],
                                                bow_doc_sample[i][1]))


Word #0 radial appears 1 times
Word #1 contextu appears 1 times
Word #2 menus appears 2 times
Word #3 better appears 1 times


In [24]:
# Create tf-idf model 
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)

# apply transformation to the entire corpus 
corpus_tfidf = tfidf[bow_corpus]

# preview TF-IDF scores for our first document
from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.31955709005753125),
 (1, 0.2727205093107365),
 (2, 0.8109853879304021),
 (3, 0.407197115115801)]


In [35]:
processed_docs[:5]

0    [radial, contextu, menus, better, vertic, list...
1                        [accept, respons, time, ajax]
5    [import, aspect, consid, decid, window, intera...
7            [long, multi, step, wizard, user, friend]
9     [screen, vertic, resolut, cater, phone, browser]
Name: Title, dtype: object

In [44]:
for i in range(4,8):
    print(i,dictionary[i])

4 accept
5 ajax
6 respons
7 time


In [34]:
bow_corpus[:5]

[[(0, 1), (1, 1), (2, 2), (3, 1)],
 [(4, 1), (5, 1), (6, 1), (7, 1)],
 [(8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1)],
 [(15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1)],
 [(3, 1), (21, 1), (22, 1), (23, 1), (24, 1)]]

In [25]:
# Train our lda model
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

In [26]:
# For each topic, we will explore the words occuring in that topic and its relative weight.
# Goal is to detect topics based on the words in each group  
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.038*"data" + 0.035*"display" + 0.035*"user" + 0.035*"best" + 0.017*"list" + 0.017*"interfac" + 0.016*"practic" + 0.014*"idea" + 0.014*"icon" + 0.013*"good"
Topic: 1 
Words: 0.064*"page" + 0.056*"best" + 0.030*"form" + 0.027*"button" + 0.025*"differ" + 0.022*"websit" + 0.019*"site" + 0.017*"right" + 0.016*"practic" + 0.015*"link"
Topic: 2 
Words: 0.047*"select" + 0.025*"filter" + 0.024*"time" + 0.021*"button" + 0.019*"dialog" + 0.016*"control" + 0.015*"chang" + 0.013*"style" + 0.012*"user" + 0.012*"progress"
Topic: 3 
Words: 0.059*"user" + 0.025*"window" + 0.019*"form" + 0.018*"button" + 0.018*"applic" + 0.017*"design" + 0.017*"grid" + 0.015*"test" + 0.013*"order" + 0.011*"better"
Topic: 4 
Words: 0.046*"button" + 0.033*"user" + 0.023*"screen" + 0.020*"navig" + 0.019*"applic" + 0.018*"menu" + 0.016*"indic" + 0.015*"color" + 0.013*"mobil" + 0.013*"tab"
Topic: 5 
Words: 0.038*"search" + 0.035*"mobil" + 0.030*"button" + 0.030*"tabl" + 0.023*"text" + 0.018*"action" + 0.01

In [27]:
# Running LDA using TF-IDF
# Again to distinguish different topics using the words in each topic and their corresponding weights
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.048*"user" + 0.030*"experi" + 0.020*"design" + 0.013*"valu" + 0.012*"websit" + 0.011*"menu" + 0.011*"posit" + 0.011*"click" + 0.010*"devic" + 0.010*"button"
Topic: 1 Word: 0.038*"design" + 0.027*"good" + 0.012*"navig" + 0.012*"mobil" + 0.012*"user" + 0.012*"form" + 0.011*"idea" + 0.011*"desktop" + 0.011*"input" + 0.010*"dropdown"
Topic: 2 Word: 0.037*"best" + 0.030*"practic" + 0.024*"page" + 0.020*"user" + 0.015*"indic" + 0.014*"select" + 0.013*"usabl" + 0.012*"form" + 0.012*"design" + 0.012*"pattern"
Topic: 3 Word: 0.020*"search" + 0.017*"user" + 0.017*"display" + 0.017*"list" + 0.017*"button" + 0.016*"data" + 0.013*"option" + 0.012*"tabl" + 0.011*"best" + 0.011*"research"
Topic: 4 Word: 0.029*"interfac" + 0.019*"user" + 0.017*"form" + 0.012*"prototyp" + 0.012*"word" + 0.012*"touch" + 0.012*"tab" + 0.011*"tool" + 0.011*"multipl" + 0.011*"vertic"
Topic: 5 Word: 0.016*"view" + 0.015*"right" + 0.014*"mobil" + 0.012*"navig" + 0.012*"site" + 0.011*"date" + 0.011*"filter" +

In [28]:
# Performance evaluation by classifying sample document using LDA Bag of Words model
# We will check where our test document would be classified.
processed_docs[0]

['radial', 'contextu', 'menus', 'better', 'vertic', 'list', 'menus']

In [30]:
for index, score in sorted(lda_model[bow_corpus[0]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.5636433959007263	 
Topic: 0.046*"button" + 0.033*"user" + 0.023*"screen" + 0.020*"navig" + 0.019*"applic" + 0.018*"menu" + 0.016*"indic" + 0.015*"color" + 0.013*"mobil" + 0.013*"tab"

Score: 0.3029978275299072	 
Topic: 0.075*"user" + 0.025*"field" + 0.019*"icon" + 0.018*"content" + 0.014*"menu" + 0.013*"list" + 0.013*"manag" + 0.013*"item" + 0.013*"label" + 0.012*"navig"

Score: 0.01667226292192936	 
Topic: 0.035*"field" + 0.033*"list" + 0.032*"form" + 0.026*"messag" + 0.025*"user" + 0.018*"improv" + 0.017*"input" + 0.016*"button" + 0.015*"long" + 0.014*"error"

Score: 0.016671394929289818	 
Topic: 0.059*"user" + 0.025*"window" + 0.019*"form" + 0.018*"button" + 0.018*"applic" + 0.017*"design" + 0.017*"grid" + 0.015*"test" + 0.013*"order" + 0.011*"better"

Score: 0.01667131297290325	 
Topic: 0.038*"search" + 0.035*"mobil" + 0.030*"button" + 0.030*"tabl" + 0.023*"text" + 0.018*"action" + 0.016*"user" + 0.016*"click" + 0.013*"devic" + 0.013*"navig"

Score: 0.016669973731040955	 

In [31]:
# Performance evaluation by classifying sample document using LDA TF-IDF model.
for index, score in sorted(lda_model_tfidf[bow_corpus[0]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.8499661087989807	 
Topic: 0.029*"interfac" + 0.019*"user" + 0.017*"form" + 0.012*"prototyp" + 0.012*"word" + 0.012*"touch" + 0.012*"tab" + 0.011*"tool" + 0.011*"multipl" + 0.011*"vertic"

Score: 0.01667286641895771	 
Topic: 0.030*"button" + 0.025*"icon" + 0.017*"item" + 0.015*"text" + 0.014*"best" + 0.014*"websit" + 0.013*"languag" + 0.013*"user" + 0.013*"list" + 0.010*"usabl"

Score: 0.016672568395733833	 
Topic: 0.022*"button" + 0.020*"list" + 0.018*"display" + 0.014*"search" + 0.014*"time" + 0.013*"usabl" + 0.012*"content" + 0.012*"page" + 0.012*"data" + 0.012*"modal"

Score: 0.016671955585479736	 
Topic: 0.020*"search" + 0.017*"user" + 0.017*"display" + 0.017*"list" + 0.017*"button" + 0.016*"data" + 0.013*"option" + 0.012*"tabl" + 0.011*"best" + 0.011*"research"

Score: 0.016670648008584976	 
Topic: 0.023*"user" + 0.017*"applic" + 0.015*"android" + 0.014*"test" + 0.014*"interfac" + 0.014*"best" + 0.012*"page" + 0.012*"process" + 0.012*"recommend" + 0.011*"form"

Score: 0.

In [32]:
# testing model on unseen data
unseen_document = 'How to modify an image to make it clear it\'s an image (and not a clickable button.)'
bow_vector = dictionary.doc2bow(preprocess(unseen_document))
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.6299756765365601	 Topic: 0.046*"button" + 0.033*"user" + 0.023*"screen" + 0.020*"navig" + 0.019*"applic"
Score: 0.23666976392269135	 Topic: 0.036*"best" + 0.034*"multipl" + 0.024*"user" + 0.019*"websit" + 0.019*"result"
Score: 0.016674991697072983	 Topic: 0.064*"page" + 0.056*"best" + 0.030*"form" + 0.027*"button" + 0.025*"differ"
Score: 0.016670355573296547	 Topic: 0.038*"search" + 0.035*"mobil" + 0.030*"button" + 0.030*"tabl" + 0.023*"text"
Score: 0.016668813303112984	 Topic: 0.059*"user" + 0.025*"window" + 0.019*"form" + 0.018*"button" + 0.018*"applic"
Score: 0.016668668016791344	 Topic: 0.096*"design" + 0.078*"user" + 0.028*"usabl" + 0.027*"good" + 0.021*"mobil"
Score: 0.016668183729052544	 Topic: 0.038*"data" + 0.035*"display" + 0.035*"user" + 0.035*"best" + 0.017*"list"
Score: 0.016667930409312248	 Topic: 0.047*"select" + 0.025*"filter" + 0.024*"time" + 0.021*"button" + 0.019*"dialog"
Score: 0.016667908057570457	 Topic: 0.075*"user" + 0.025*"field" + 0.019*"icon" + 0.018