In [2]:
# Parse HTML
from html.parser import HTMLParser
import re
import nltk
from nltk.corpus import stopwords
from collections import Counter

stoplist = stopwords.words('english')

class Parser(HTMLParser):
    def __init__(self):
        self.reset()
        self.strict = False
        self.convert_charrefs = True
        self.fed = []
    def handle_data(self, d):
        self.fed.append(d)
    def get_data(self):
        return ''.join(self.fed)

def clean(html):
    s = Parser()
    s.feed(html)
    html_text = re.sub("[^a-zA-Z]", " ", s.get_data()).lower().split()
    words = [word for word in html_text if len(word) > 1 and not word in stoplist]
    return(words)

In [3]:
# Filter tags by counts (count > 5)

import pandas as pd

tags_raw = pd.read_csv("./data/rquestions/Tags.csv", encoding = 'iso-8859-1')
tags = tags_raw.groupby('Tag').filter(lambda count: len(count) > 5)
doc_tag = tags_raw.groupby('Id')['Tag'].apply(list)
doc_tag = pd.DataFrame({'Id': doc_tag.index, 'Tags': doc_tag.values})

In [4]:
import time

start_time = time.time()
questions = pd.read_csv("./data/rquestions/Questions.csv", encoding = 'iso-8859-1')
doc = questions[['Id', 'Body']]
doc['Body'] = doc['Body'].map(lambda x: clean(x))
doc = doc.join(doc_tag.set_index('Id'), on = 'Id', how = 'inner')
print(str((time.time() - start_time) / 60) + ' minutes')
doc.head()

1.5294268170992533 minutes


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,Id,Body,Tags
0,77434,"[suppose, vector, nested, dataframe, one, two,...",[vector]
1,79709,"[function, inside, loop, inside, function, inn...","[memory, function, global-variables, side-effe..."
2,95007,"[mystified, quantile, function, day, intuitive...","[math, statistics]"
3,103312,"[test, eof, flag, example, file, fname, rb, re...","[file, file-io, eof]"
4,255697,"[looking, package, used, train, dirichlet, pri...","[math, statistics, bayesian, dirichlet]"


In [6]:
from sklearn.cross_validation import train_test_split

train, test = train_test_split(doc, test_size = 0.2)

In [7]:
# Fit LDA model

import gensim
from gensim import corpora, models

dictionary = corpora.Dictionary(doc['Body'])
word_id = dictionary.token2id
corpus_train = [dictionary.doc2bow(text) for text in train['Body']]
corpus_test = [dictionary.doc2bow(text) for text in test['Body']]

start_time = time.time()
ldamodel = gensim.models.ldamodel.LdaModel(corpus_train, num_topics = 50, id2word = dictionary, 
                                           chunksize = 10000, passes = 20, update_every = 1)
print(str((time.time() - start_time) / 60) + ' minutes')



52.023739230632785 minutes


In [8]:
print(ldamodel.print_topics(num_topics = 3, num_words = 4))

[(10, '0.114*"file" + 0.069*"csv" + 0.058*"read" + 0.038*"files"'), (17, '0.319*"id" + 0.172*"name" + 0.077*"value" + 0.019*"ids"'), (48, '0.020*"cd" + 0.017*"ms" + 0.014*"ap" + 0.013*"cs"')]


In [9]:
import pickle

# Save LDA model to disk
# pickle.dump(ldamodel, open('./data/rquestions/lda_model.sav', 'wb'))

# Load LDA from disk
# loaded_model = pickle.load(open('./data/rquestions/lda_model.sav', 'rb'))

In [12]:
# Find topics of documents

from operator import itemgetter

start_time = time.time()
def get_topic(ldamodel, document):
    topic_prob = ldamodel[document]
    topic = max(topic_prob, key = itemgetter(1))[0]
    return(topic)

train['Topic'] = [get_topic(ldamodel, doc) for doc in corpus_train]
test['Topic'] = [get_topic(ldamodel, doc) for doc in corpus_test]
print(str((time.time() - start_time) / 60) + ' minutes')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


95.03186736504237 minutes


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [13]:
train.head()

Unnamed: 0,Id,Body,Tags,Topic
71073,27854248,"[trying, figure, write, csv, based, selections...","[csv, shiny]",37
144036,39715958,"[list, data, frames, column, names, want, subs...","[list, lapply, gsub]",20
57641,24912042,"[seems, lot, functionality, around, eval, leas...","[string, eval]",11
103704,33539856,"[experiment, get, large, files, time, series, ...","[pdf, ggplot2]",2
61076,25668646,"[trying, install, predictionet, troubles, clea...","[windows, installation]",23


In [14]:
# Save and load dataset

# train.to_pickle('./data/rquestions/train.pkl')
# test.to_pickle('./data/rquestions/test.pkl')

# train = pd.read_pickle('./data/rquestions/train.pkl')
# test = pd.read_pickle('./data/rquestions/test.pkl')

In [21]:
# Recommend tags by KNN

def distance(a, b):
    intersect = list(set(a) & set(b))
    union = list(set(a) | set(b))
    return(len(intersect) / len(union))

def recommend_tag(test_row, num_tags, num_neighbor):
    test_id = test_row['Id']
    test_body = test_row['Body']
    test_topic = test_row['Topic']
    test_tag = test_row['Tags']
    train_topic = train[train['Topic'] == test_topic]
    
    # Calculate distance between test and documents in training set
    dist = train_topic['Body'].map(lambda x: distance(test_body, x))
    
    # Find the closet documents
    closet_index = dist.nlargest(num_neighbor).index.values.tolist()
    closet_id = train_topic.loc[closet_index]['Id'].tolist()
    
    # Recommend the most common tags
    tag_list = tags[tags['Id'].isin(closet_id)]['Tag'].tolist()
    sorted_tags = [tag for tag, count in Counter(tag_list).most_common()]
    top_tags = sorted_tags[:num_tags]
    return(pd.Series([test_id, test_body, test_tag, top_tags]))

In [23]:
start_time = time.time()
predict = test.apply(lambda x: recommend_tag(x, 4, 15), axis = 1)
predict.columns = ['Id', 'Body', 'Actual Tags', 'Recommend Tags']
print(str((time.time() - start_time) / 60) + ' minutes')
predict.head(20)

118.53424481550853 minutes


Unnamed: 0,Id,Body,Actual Tags,Recommend Tags
121408,36361425,"[struggling, developing, bubble, chart, plotly...",[plotly],"[plotly, googlevis, shiny, rmarkdown]"
56385,24626769,"[plotting, stacked, bar, graph, use, geom, tex...","[ggplot2, geom-text]","[ggplot2, geom-bar, geom-text, dataframe]"
111850,34855721,"[two, data, frame, df, df, ncol, nrow, col, co...",[dataframe],"[dataframe, list, plyr, datatable]"
125165,36889656,"[two, data, frames, dput, data, frames, given,...","[search, data-manipulation]","[time, dataframe, matching, data.table]"
120460,36206168,"[time, series, climate, data, years, base, plo...","[ggplot2, time-series, zoo]","[plot, ggplot2, time-series, zoo]"
108412,34259208,"[trying, modify, values, column, rows, specifi...",[dplyr],"[dplyr, dataframe, tidyr, split]"
47063,22428997,"[struggling, hours, get, match, replace, gsub,...","[regex, gsub]","[regex, string, gsub, stringr]"
133893,38207390,"[want, make, curved, text, around, ggplot, coo...","[dataframe, ggplot2]","[ggplot2, plot, bar-chart, graph]"
78189,29163079,"[trouble, functions, tried, vectorize, functio...",[integrate],"[integration, function, package, bitbucket]"
25376,15945396,"[simple, dataframe, two, vectors, speed, id, l...",[subset],"[dataframe, apply, dplyr, plyr]"
