In [9]:
# Parse HTML
from html.parser import HTMLParser
import re
import nltk
from nltk.corpus import stopwords
from collections import Counter
from nltk import word_tokenize, WordNetLemmatizer

# English stop words as a set: clean() tests every token for membership in
# this collection, and a set lookup is O(1) where the original list was a
# linear scan per token.
stoplist = set(stopwords.words('english'))
# WordNet lemmatizer instance; no cell visible in this notebook uses it yet.
wnl = WordNetLemmatizer()

class Parser(HTMLParser):
    """HTML parser that keeps only text content and discards all markup.

    Feed HTML with ``feed()``, call ``close()`` to flush any buffered trailing
    text, then read the accumulated text with ``get_data()``.
    """
    def __init__(self):
        # Run the base-class constructor instead of hand-initialising via
        # reset(): it sets up every piece of internal state the running
        # Python version expects, then calls reset() itself.
        super().__init__(convert_charrefs=True)
        # Kept for backward compatibility with the original attribute set;
        # HTMLParser ignores it on Python 3.5+.
        self.strict = False
        # Text chunks in document order, filled by handle_data().
        self.fed = []
    def handle_data(self, d):
        """Record one chunk of text found between tags."""
        self.fed.append(d)
    def get_data(self):
        """Return all text seen so far, concatenated with no separator."""
        return ''.join(self.fed)

def clean(html):
    """Turn an HTML string into a list of lowercase alphabetic tokens.

    Tags are stripped with Parser, every non-letter character becomes a
    space, the text is lower-cased and split on whitespace, and tokens that
    are a single character long or appear in ``stoplist`` are dropped.

    Parameters:
        html: HTML (or plain) text. ``None`` and float NaN (how pandas
            represents an empty CSV cell) are treated as empty input.

    Returns:
        list of str: the surviving tokens, in document order.
    """
    # Missing cells would otherwise raise TypeError inside feed().
    if html is None or (isinstance(html, float) and html != html):
        return []
    s = Parser()
    s.feed(html)
    # Flush buffered text: without close(), HTMLParser can hold back trailing
    # text that contains an unterminated '&' (e.g. "...AT&T"), silently
    # dropping the end of the document.
    s.close()
    html_text = re.sub("[^a-zA-Z]", " ", s.get_data()).lower().split()
    words = [word for word in html_text if len(word) > 1 and word not in stoplist]
    return words

In [10]:
import pandas as pd
import time

# Load the raw questions, replace the Title and Body columns with their
# cleaned token lists, and report how long the whole step took.
start_time = time.time()
questions = pd.read_csv("./data/rquestions/Questions.csv", encoding = 'iso-8859-1')
for text_column in ('Title', 'Body'):
    questions[text_column] = questions[text_column].map(clean)
elapsed_minutes = (time.time() - start_time) / 60
print(str(elapsed_minutes) + ' minutes')

1.6493119160334269 minutes


In [11]:
import os

#path = '/Users/jingweili/Documents/Github/project/data/rquestions'
#cleaned_data.to_csv(os.path.join(path, r'r_answers_clean.csv'), encoding = 'utf-8', index = False)

In [12]:
# Tags.csv is read with one (Id, Tag) pair per row; gather the tags of each
# Id into a single list so that there is one row per Id.
tags_raw = pd.read_csv('./data/rquestions/Tags.csv')
tag_lists_by_id = tags_raw.groupby('Id')['Tag'].apply(list)
tags = pd.DataFrame({
    'Id': tag_lists_by_id.index,
    'Tags': tag_lists_by_id.values,
})
tags.head()

Unnamed: 0,Id,Tags
0,77434,[vector]
1,79709,"[memory, function, global-variables, side-effe..."
2,95007,"[math, statistics]"
3,103312,"[file, file-io, eof]"
4,255697,"[math, statistics, bayesian, dirichlet]"


In [13]:
# Pair each question's cleaned Body with its tag list. The inner join keeps
# only questions whose Id also appears in `tags`, and join() preserves the
# row index of `questions`.
doc = (
    questions.loc[:, ['Id', 'Body']]
    .join(tags.set_index('Id'), on = 'Id', how = 'inner')
)
doc.head()

Unnamed: 0,Id,Body,Tags
0,77434,"[suppose, vector, nested, dataframe, one, two,...",[vector]
1,79709,"[function, inside, loop, inside, function, inn...","[memory, function, global-variables, side-effe..."
2,95007,"[mystified, quantile, function, day, intuitive...","[math, statistics]"
3,103312,"[test, eof, flag, example, file, fname, rb, re...","[file, file-io, eof]"
4,255697,"[looking, package, used, train, dirichlet, pri...","[math, statistics, bayesian, dirichlet]"


In [16]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; model_selection is its replacement. Fall back to the old module only
# on installations too old to have the new one.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# 80/20 split. The fixed random_state makes the split (and everything derived
# from it, including the pickled frames) reproducible across kernel restarts.
train, test = train_test_split(doc, test_size = 0.2, random_state = 42)

In [19]:
import gensim
from gensim import corpora, models

# The vocabulary is built from every row of `doc` (train and test rows
# alike), so each token in either split has an id in the dictionary.
dictionary = corpora.Dictionary(doc['Body'])
word_id = dictionary.token2id
# Bag-of-words encoding of each token list, one entry per document.
corpus_train = list(map(dictionary.doc2bow, train['Body']))
corpus_test = list(map(dictionary.doc2bow, test['Body']))



In [28]:
# Fit a 50-topic LDA model on the training corpus in a single pass, updating
# the model after every chunk of 10000 documents. LDA training is stochastic;
# the fixed random_state keeps the learned topics (and the Topic labels
# derived from them) the same from run to run.
ldamodel = gensim.models.ldamodel.LdaModel(corpus_train, num_topics = 50, id2word = dictionary,
                                           chunksize = 10000, passes = 1, update_every = 1,
                                           random_state = 42)

In [29]:
# Preview three of the learned topics, showing the four highest-weighted words of each.
topic_preview = ldamodel.print_topics(num_topics = 3, num_words = 4)
print(topic_preview)

[(23, '0.025*"data" + 0.025*"plot" + 0.020*"col" + 0.018*"xts"'), (17, '0.014*"name" + 0.012*"rstudio" + 0.010*"com" + 0.010*"server"'), (31, '0.038*"plot" + 0.019*"data" + 0.017*"map" + 0.015*"like"')]


In [40]:
from operator import itemgetter

def get_topic(ldamodel, document):
    """Return the id of the most probable topic for one document.

    Parameters:
        ldamodel: object exposing ``get_document_topics(document)``, which
            yields ``(topic_id, probability)`` pairs.
        document: the document in whatever form ``get_document_topics``
            accepts (the notebook passes bag-of-words lists).

    Returns:
        The topic id of the pair with the highest probability; on ties the
        first such pair wins.

    Raises:
        ValueError: if the model returns no topics for the document.
    """
    candidates = ldamodel.get_document_topics(document)
    winner = max(candidates, key = lambda pair: pair[1])
    return winner[0]

In [60]:
# Label every document in both splits with its dominant LDA topic. The loop
# variable is called `bow` so it is not confused with the `doc` DataFrame.
for frame, bows in ((train, corpus_train), (test, corpus_test)):
    frame['Topic'] = [get_topic(ldamodel, bow) for bow in bows]

In [61]:
# Spot-check the first five training rows, including the new Topic column.
train.head(n = 5)

Unnamed: 0,Id,Body,Tags,Topic
89037,31136471,"[split, matrix, two, matrix, based, column, na...",[matrix],34
69014,27355363,"[character, vars, vars, cogd, relevel, cbsnivc...","[character, gsub]",23
104390,33639556,"[goal, trying, modify, shiny, app, previously,...","[ggplot2, shiny, heatmap]",46
37968,19953898,"[suppose, data, frame, number, factors, varyin...","[dataframe, apply, r-factor]",11
78194,29165425,"[efficient, way, get, rank, minimum, maximum, ...","[optimization, vector, minimum, rank]",49


In [62]:
# Persist both labelled splits to the working directory as pickles.
# Pickle files should only ever be loaded from trusted sources.
for frame, filename in ((train, 'train.pkl'), (test, 'test.pkl')):
    frame.to_pickle(filename)