# process data & feature extract

In [178]:
%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import pandas as pd
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from string import punctuation
from collections import Counter
import re
import numpy as np
import itertools
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from gensim import corpora, models


In [60]:
# read sample data
df = pd.read_csv('input/stack_ds_4_9_2017 .csv',sep=',',quotechar='|',header=None)
df.columns = ['title','body','tags']
df.head(2)

Unnamed: 0,title,body,tags
0,How is viewport size typically passed as an ar...,I'm looking at the docs for a command line too...,<html><web><command-line>
1,mysqli_query insert success and throws error,This issue has baffled me. I have a simple mys...,<php><mysqli>


**notice that the data is already filtered with code and images etc.**

In [61]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4533 entries, 0 to 4532
Data columns (total 3 columns):
title    4533 non-null object
body     4533 non-null object
tags     4533 non-null object
dtypes: object(3)
memory usage: 106.3+ KB


In [62]:
# merge title and body so that we only have one feature to consider
merged = [ title + ' ' + body for title, body in zip(df.title,df.body)]
df_merged = pd.DataFrame({'content':merged,'tags':df.tags})
df_merged.head(2)

Unnamed: 0,content,tags
0,How is viewport size typically passed as an ar...,<html><web><command-line>
1,mysqli_query insert success and throws error T...,<php><mysqli>


In [157]:
# how many tags we have?
all_tags = df.tags.apply(lambda x: x.replace('<','').split('>'))
all_tags = [x for x in list(itertools.chain(*all_tags)) if x ]
ct = Counter(all_tags)
print(len(ct),'out of',len(df))

3591 out of 4533


**tokenize **
- lowercase 
- topwords
- remove if not character
- stemming (seems not that good)

In [161]:
to_be_removed = set(stopwords.words('english'))
tokenizer = lambda x : [word.lower() for word in word_tokenize(re.sub("[^a-zA-Z]"," ",x)) if word.lower() not in to_be_removed]
# tokenizer = lambda x : [ SnowballStemmer('english').stem(word.lower()) for word in word_tokenize(re.sub("[^a-zA-Z]"," ",x)) if word.lower() not in to_be_removed]

**apply tf-idf**

In [162]:
%%time
# para to be tweaked, we start from a simple one
tfidf = TfidfVectorizer(min_df=0.001,max_df=0.95, max_features=None, tokenizer= tokenizer, ngram_range=(1,2))
tfidf_trained = tfidf.fit_transform(list(df_merged.content))

df_tfidf = pd.DataFrame({'token':tfidf.get_feature_names(),'tfidf_value':tfidf.idf_})

CPU times: user 2.72 s, sys: 12 ms, total: 2.73 s
Wall time: 2.74 s


In [174]:
df_tfidf.sort_values('tfidf_value',ascending=False).set_index('token').head()

Unnamed: 0_level_0,tfidf_value
token,Unnamed: 1_level_1
aa,7.6276
issue code,7.6276
invisible,7.6276
invoke native,7.6276
ios swift,7.6276


**simple LDA**

In [165]:
# build a look up dict, where every unique word is mapped to a unique int
texts = df_merged.content.apply(tokenizer)
dictionary = corpora.Dictionary(texts)

In [170]:
corpus = [dictionary.doc2bow(text) for text in texts]

In [173]:
corpus[0][:10] # first 10 words in first sentence : word 0 occurs once, word 8 occurs twice

[(0, 1),
 (1, 1),
 (2, 1),
 (3, 1),
 (4, 1),
 (5, 1),
 (6, 1),
 (7, 1),
 (8, 2),
 (9, 1)]

In [180]:
%%time
# id2word: required. The LdaModel class requires our previous dictionary to map ids to strings.
# passes: optional. The number of laps the model will take through corpus. The greater the number of passes, the more accurate the model will be. A lot of passes can be slow on a very large corpus.
ldamodel = models.ldamodel.LdaModel(corpus, num_topics=3, id2word = dictionary, passes=20)

CPU times: user 1min 25s, sys: 0 ns, total: 1min 25s
Wall time: 1min 25s


In [181]:
ldamodel.print_topics(num_topics=3, num_words=10)

[(0,
  '0.010*"file" + 0.010*"using" + 0.010*"error" + 0.007*"app" + 0.006*"code" + 0.006*"server" + 0.006*"get" + 0.006*"use" + 0.006*"java" + 0.005*"application"'),
 (1,
  '0.011*"code" + 0.010*"like" + 0.009*"using" + 0.009*"data" + 0.009*"want" + 0.008*"get" + 0.006*"value" + 0.006*"one" + 0.005*"use" + 0.005*"table"'),
 (2,
  '0.109*"gt" + 0.104*"lt" + 0.026*"class" + 0.025*"div" + 0.012*"li" + 0.009*"p" + 0.008*"span" + 0.006*"amp" + 0.006*"img" + 0.006*"src"')]

well, interesting! did't expect this, we can clearly see that first topic is about ' java application' sencond 'data science' and third 'web programming' :)

In [195]:
# each document has several topics
[ldamodel.get_document_topics(corpus)[i] for i in range(20)]

[[(0, 0.19550704690128598),
  (1, 0.53599704209050969),
  (2, 0.26849591100820436)],
 [(0, 0.19588895201654802), (1, 0.79898438185014875)],
 [(0, 0.36679144274699133), (1, 0.62486013772567239)],
 [(1, 0.98937853198442927)],
 [(0, 0.986926102923949)],
 [(0, 0.47525538856717775), (1, 0.519279176261919)],
 [(0, 0.32184300478390188), (1, 0.67265188639212181)],
 [(0, 0.11058666899631907),
  (1, 0.64263933851495258),
  (2, 0.24677399248872836)],
 [(1, 0.99371624335500575)],
 [(0, 0.86465473159512529),
  (1, 0.12393820858975782),
  (2, 0.011407059815116921)],
 [(0, 0.52755204648382714), (1, 0.46272688182800692)],
 [(1, 0.96025351674437598), (2, 0.032153327054220047)],
 [(1, 0.98592532477743444)],
 [(0, 0.87275044247265332),
  (1, 0.017909980661175162),
  (2, 0.10933957686617148)],
 [(0, 0.78338215150403812), (1, 0.20859400565970859)],
 [(0, 0.30884184967608569), (1, 0.68339444486673051)],
 [(0, 0.29340659218600079), (1, 0.43244814095424161), (2, 0.2741452668597576)],
 [(0, 0.32236661433936847


[lda source](https://rstudio-pubs-static.s3.amazonaws.com/79360_850b2a69980c4488b1db95987a24867a.html)

TO DO:
- remove verb

problems:
- propagate tags from LDA ? increase error? is this necessary?
- lda is really slow even on a small part of data
- will deep learning be possible?