In [204]:
from __future__ import division
import nltk 
import re
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
import numpy as np
import pandas as pd
import lxml.html
from nltk.corpus import stopwords
from nltk.stem import *
from datetime import datetime
from sklearn.metrics import accuracy_score, f1_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import svm
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score

In [217]:
# Function to parse xml data

def get_questions_tags(root):
    questions = []
    tags = []
    code = []
    titles = []
    users = []
    ques_with_title = []
    for row in root.findall('row'):
        post = row.get("PostTypeId")
        user = row.get("OwnerUserId")
        post_type = BeautifulSoup(post, "lxml")
        if post_type.get_text() == "1" and user is not None:
            
            users.append(user)
            
            #Get the Questions
            body = row.get("Body")
            soup = BeautifulSoup(body, "lxml")
            [s.extract() for s in soup('code')] 
            question_s = soup.get_text()
#             q_set = nltk.word_tokenize(question_s)
#             q_set = question_s.split()
#             print(q_set)
            question = nltk.Text(question_s)
# #             print(question)
            questions.append(question)
            
            #Get the Tags
            tag_list = row.get("Tags")
            tag_str = re.sub('[<>]', ' ', tag_list)
#             tag_set = nltk.word_tokenize(tag_str)
            tag_set = tag_str.split()
            tag_text = nltk.Text(tag_set)
            tags.append(tag_set)

            #Get the Titles
            title_s = row.get("Title")
#             t_set = nltk.word_tokenize(title_s)
#             t_set = title_s.split()
#             print(title_s)
            title = nltk.Text(title_s)
            titles.append(title)
            
            #Get Title with First 400 and last 100 words from question body
            
            if (len(question_s) > 500):
                question_s = question_s[:400] + question_s[-100:]
            q_with_t = title_s + " " + question_s
#             qt_set = nltk.word_tokenize(q_with_t)
            qt_set = q_with_t.split()
            ques_title = nltk.Text(qt_set)
            ques_with_title.append(ques_title)

#     print(len(ques_with_title))
    return questions, tags, titles, ques_with_title, users

In [163]:
# Function to remove numbers and special characters from question

def number_removal(ques_list):
    q_with_t_list = []
    for ques in ques_list:
        q_t_text = ""
        for word in ques:
            characters = [".", ",", ":", "(", ")", "[", "]", "{", "}", "?", "'"]
            q_text = ''.join([i for i in word if not (i.isdigit() or [e for e in characters if e in i])])
            if q_text != '':
                q_t_text += q_text + " "
        qt_set = q_t_text.split()
#         qt_set = nltk.word_tokenize(q_t_text)
        title_ques = nltk.Text(qt_set)
        q_with_t_list.append(title_ques)
    return q_with_t_list

In [164]:
# Function to convert to lowercase and remove stopwords from the question body

def remove_stopwords(q_list):
    q_with_t_list = []
    for text in q_list:
        stopwords = nltk.corpus.stopwords.words('english')
        st = ""
        for w in text:
            if w.lower() not in stopwords:
                st += w.lower() + " "
        w_set = st.split()
#         w_set = nltk.word_tokenize(st)
#         ques_body = nltk.Text(st)
        q_with_t_list.append(st)
    return q_with_t_list

In [165]:
# Function to perform stemming and converting each word in the question to its root word

def stemming(q_list):
    stemmer = PorterStemmer()
    post = []
    for q in q_list:
        st = ""
        for word in q:
            st += stemmer.stem(word) + " "
#         w_set = nltk.word_tokenize(st)
        w_set = st.split()
#         ques_body = nltk.Text(w_set)
        post.append(w_set)
    return post

In [151]:
startTime0 = datetime.now()
s = preprocessor('Posts_small.xml')
print("Time elapsed in Pre-Processing: " + str(datetime.now() - startTime0))

Time elapsed in Pre-Processing: 0:00:00.616662


In [218]:
# Preprocessor component - Tokenisation, Number removal, Stop-word removal, Stemming

def preprocessor(filename):
    root = ET.parse(filename).getroot()
    questions, tags, titles, ques_with_title, users = get_questions_tags(root)
    ques_with_title_list = number_removal(ques_with_title)
#     print(ques_with_title_list)
    title_ques = remove_stopwords(ques_with_title_list)
#     print(title_ques)
#     posts = stemming(title_ques)
    frame = [titles, questions, ques_with_title, title_ques, tags, users]
    return frame

In [197]:
# Function to return n most frequent tags from the training set

startTime0 = datetime.now()
def top_tags(rawTags, num):
    tagset = {}
    sortedTags = []
    for tags in rawTags: 
        for tag in tags:
            if tag not in tagset:
                tagset[tag] = 1
            else:
                tagset[tag] += 1
    print("Total no. of unique tags in data: ", len(tagset))
    sortedTags1 = sorted(tagset.items(), key=lambda item: item[1], reverse = True)
    for tag in sortedTags1[0:num]:
        sortedTags.append(tag[0])
    
    return sortedTags

# Function to add Tag-vector to the Pre-Processed Data (Custom multiLabelBinarizer)

def add_tag_mapping(preProcessedInput, mappingSize):
    tagMap = []
    rankedTags = top_tags(preProcessedInput[4], mappingSize)
    
    for tags in preProcessedInput[4]:
        tagVector = []
#         tagVector = [0]*mappingSize
        for tag in tags:
            if tag in rankedTags:
                tagVector.append(tag)

        tagMap.append(tagVector)
    preProcessedInput.append(tagMap)
    return preProcessedInput


processedData = add_tag_mapping(preprocessor('train2.xml'), 20)
df = pd.DataFrame({'titles' : processedData[0],
                      'questions' : processedData[1],
                      'ques_with_title' : processedData[2],
                      'posts' : processedData[3],
                      'tags' : processedData[4],
                      'users' : processedData[5],
                     'tag_vector' : processedData[6] })


print("Time elapsed in PreProcessing: " + str(datetime.now() - startTime0))

Total no. of unique tags in data:  5369
Time elapsed in PreProcessing: 0:00:35.063751


In [188]:
df["posts"]

0        applying opacity form use decimal double value...
1        percentage width child element absolutely posi...
2        calculate age c# givenarepresentingapersonsbir...
3        calculate relative time c# givenaspecificvalue...
4        determine users timezone isthereanystandardway...
5        difference mathfloor mathtruncate whatisthedif...
6        filling dataset datatable linq query result se...
7        binary data mysql howdoistorebinarydatainmysql...
8        fastest way get value π solutionsarewelcomeina...
9        throw error mysql trigger ifihaveaonatablehowc...
10       use c socket api c++ z/os ivebeenhavingissuesg...
11       check changes sql server table howcanimonitora...
12       reliable timer console application iamawaretha...
13       best way allow plugins php application iamstar...
14       multiple submit buttons html form letssayyoucr...
15       get distinct ordered list names datatable usin...
16       paging collection linq howdoyoupagethroughacol.

In [202]:
#  Large Sample run

startTime01 = datetime.now()
processedData = add_tag_mapping(preprocessor('sample4.xml'), 20)
df = pd.DataFrame({'titles' : processedData[0],
                      'questions' : processedData[1],
                      'ques_with_title' : processedData[2],
                      'posts' : processedData[3],
                      'tags' : processedData[4],
                      'users' : processedData[5],
                     'tag_vector' : processedData[6] })

print("Time elapsed in PreProcessing: " + str(datetime.now() - startTime01))

startTime0 = datetime.now()
npx=np.array(df["posts"])
npy=np.array(df["tag_vector"])
# tv = TfidfVectorizer() # can play around with min_df/max_df here

x_train, x_test, y_train, y_test = train_test_split(npx, npy, test_size=0.2)

# x_traincv = cv.fit_transform(["Hi How are you How are you doing","Hi what's up","Hey there"])
# x_traincv.toarray()
# cv.get_feature_names()

# x_traintv=tv.fit_transform(x_train)

# a=x_traintv.toarray()
# print(tv.inverse_transform(a[0]))
# # print(x_train.iloc[0])
# print(tv.get_feature_names())

# x_testtv=tv.transform(x_test)

print("Training Data size: ", len(x_train))

mlb = MultiLabelBinarizer()
Y_train = mlb.fit_transform(y_train)
# print("y_train is: ", Y_train)


startTime1 = datetime.now()
classifier1 = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('clf', OneVsRestClassifier(LinearSVC(loss='hinge', tol = .001)))])

classifier1.fit(x_train, Y_train)
predicted1 = classifier1.predict(x_test)
print("Time elapsed in LinearSVC: ", str(datetime.now() - startTime1))


# startTime2 = datetime.now()
# classifier2 = Pipeline([
#     ('vectorizer', CountVectorizer()),
#     ('tfidf', TfidfTransformer()),
#     ('clf', OneVsRestClassifier(svm.SVC(kernel='linear', C=1, gamma=1)))])

# classifier2.fit(x_train, Y_train)
# predicted2 = classifier2.predict(x_test)
# print("Time elapsed in Linear svm: ", str(datetime.now() - startTime2))


startTime3 = datetime.now()
classifier3 = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(SGDClassifier(loss='hinge', max_iter=200)))])

classifier3.fit(x_train, Y_train)
predicted3 = classifier3.predict(x_test)
print("Time elapsed in SGDClassifier: ", str(datetime.now() - startTime3))


# all_labels = mlb.inverse_transform(predicted)
# for item, labels in zip(x_test, all_labels):
#     print('{0} => {1}'.format(item, ', '.join(labels)))

# model = svm.SVC(kernel='linear', C=1, gamma=1)
# print(y_train.to_string(index=False))
# mnb = MultinomialNB()
# mnb.fit(x_traincv,y_train)

print("\nAccuracy for LinearSVC: ", accuracy_score(predicted1, mlb.fit_transform(y_test)))
print("F1 for LinearSVC: ", f1_score(predicted1, mlb.fit_transform(y_test), average='micro'))

# print("\nAccuracy for Linear SVM: ", accuracy_score(predicted2, mlb.fit_transform(y_test)))
# print("F1 for Linear SVM: ", f1_score(predicted2, mlb.fit_transform(y_test), average='micro'))

print("\nAccuracy for SGD: ", accuracy_score(predicted3, mlb.fit_transform(y_test)))
print("F1 for SGD: ", f1_score(predicted3, mlb.fit_transform(y_test), average='micro'))

print("Time elapsed in this step: " + str(datetime.now() - startTime0))
# model.fit(x_traintv,Y_train)
# clf = OneVsRestClassifier(svm.SVC(kernel='linear', C=1, gamma=1))
# y_score = clf.fit(x_train, y_train).decision_function(x_test)

Total no. of unique tags in data:  12137
Time elapsed in PreProcessing: 0:03:23.659170
Training Data size:  79193
Time elapsed in LinearSVC:  0:00:13.731251
Time elapsed in SGDClassifier:  0:01:17.233693

Accuracy for LinearSVC:  0.568160008081
F1 for LinearSVC:  0.557865892972

Accuracy for SGD:  0.515379564624
F1 for SGD:  0.432143679484
Time elapsed in this step: 0:01:31.374044


In [216]:
all_labels = mlb.inverse_transform(predicted1)
for item, labels in zip(x_test, all_labels):
    print('{0} => {1}'.format(item, ', '.join(labels)))


two different drop menus depend parent menu two drop menus want populated identical data depending selected parent drop menu right using javascript library populates one child drop menu based parent need two drop menus populated simultaneously javascript library contains function called printoptions supposed populate dropdown menu something selected parent menu tried calling function twice one drop menu doesnt seem working got library http//wwwjavascripttoolboxcom/lib/dynamicoptionlist/documentationphp  => javascript
handle autolinking wiki page content mean autolinking process wiki links inlined page content generated either hyperlink page exist create link page doesnt exist parser using two step process - first page content parsed links wiki pages source markup extracted feed array existing pages back parser final html markup generated best way handle process seems need keep cached list every single page site rather extract index page titles time better check link separately see exis

In [220]:
# Smaller Sample run:

startTime01 = datetime.now()
processedData = add_tag_mapping(preprocessor('train2.xml'), 20)
df = pd.DataFrame({'titles' : processedData[0],
                      'questions' : processedData[1],
                      'ques_with_title' : processedData[2],
                      'posts' : processedData[3],
                      'tags' : processedData[4],
                      'users' : processedData[5],
                     'tag_vector' : processedData[6] })

print("Time elapsed in PreProcessing: " + str(datetime.now() - startTime01))

startTime0 = datetime.now()
npx=np.array(df["posts"])
npy=np.array(df["tag_vector"])
# tv = TfidfVectorizer() # can play around with min_df/max_df here

x_train, x_test, y_train, y_test = train_test_split(npx, npy, test_size=0.2)

# x_traincv = cv.fit_transform(["Hi How are you How are you doing","Hi what's up","Hey there"])
# x_traincv.toarray()
# cv.get_feature_names()

# x_traintv=tv.fit_transform(x_train)

# a=x_traintv.toarray()
# print(tv.inverse_transform(a[0]))
# # print(x_train.iloc[0])
# print(tv.get_feature_names())

# x_testtv=tv.transform(x_test)

print("Training Data size: ", len(x_train))

mlb = MultiLabelBinarizer()
Y_train = mlb.fit_transform(y_train)
# print("y_train is: ", Y_train)


startTime1 = datetime.now()
classifier1 = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('clf', OneVsRestClassifier(LinearSVC(loss='hinge', tol = .001)))])

classifier1.fit(x_train, Y_train)
predicted1 = classifier1.predict(x_test)
print("Time elapsed in LinearSVC: ", str(datetime.now() - startTime1))


startTime2 = datetime.now()
classifier2 = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(svm.SVC(kernel='linear')))])

classifier2.fit(x_train, Y_train)
predicted2 = classifier2.predict(x_test)
print("Time elapsed in Linear svm: ", str(datetime.now() - startTime2))


startTime3 = datetime.now()
classifier3 = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(SGDClassifier(loss='hinge', max_iter=200)))])

classifier3.fit(x_train, Y_train)
predicted3 = classifier3.predict(x_test)
print("Time elapsed in SGDClassifier: ", str(datetime.now() - startTime3))


# all_labels = mlb.inverse_transform(predicted)
# for item, labels in zip(x_test, all_labels):
#     print('{0} => {1}'.format(item, ', '.join(labels)))

# model = svm.SVC(kernel='linear', C=1, gamma=1)
# print(y_train.to_string(index=False))
# mnb = MultinomialNB()
# mnb.fit(x_traincv,y_train)

print("\nAccuracy for LinearSVC: ", accuracy_score(predicted1, mlb.fit_transform(y_test)))
print("F1 for LinearSVC: ", f1_score(predicted1, mlb.fit_transform(y_test), average='micro'))

print("\nAccuracy for Linear SVM: ", accuracy_score(predicted2, mlb.fit_transform(y_test)))
print("F1 for Linear SVM: ", f1_score(predicted2, mlb.fit_transform(y_test), average='micro'))

print("\nAccuracy for SGD: ", accuracy_score(predicted3, mlb.fit_transform(y_test)))
print("F1 for SGD: ", f1_score(predicted3, mlb.fit_transform(y_test), average='micro'))

print("Time elapsed in this step: " + str(datetime.now() - startTime0))
# model.fit(x_traintv,Y_train)
# clf = OneVsRestClassifier(svm.SVC(kernel='linear', C=1, gamma=1))
# y_score = clf.fit(x_train, y_train).decision_function(x_test)

Total no. of unique tags in data:  5369
Time elapsed in PreProcessing: 0:00:34.852773
Training Data size:  14108
Time elapsed in LinearSVC:  0:00:01.367557
Time elapsed in Linear svm:  0:03:47.247665
Time elapsed in SGDClassifier:  0:00:06.261642

Accuracy for LinearSVC:  0.517857142857
F1 for LinearSVC:  0.490791896869

Accuracy for Linear SVM:  0.517857142857
F1 for Linear SVM:  0.490904904444

Accuracy for SGD:  0.510204081633
F1 for SGD:  0.46986236355
Time elapsed in this step: 0:03:54.972930


In [214]:
all_labels = mlb.inverse_transform(predicted1)
for item, labels in zip(x_test, all_labels):
    print('{0} => {1}'.format(item, ', '.join(labels)))


(19799,)


In [208]:
# Compute Precision-Recall and plot curve

f = open("out.txt",'w')
n_classes = Y_train.shape[1]
precision = dict()
recall = dict()
average_precision = dict()
for i in range(n_classes):
    print(y_test[:,i], file=f)
    print(y_score[:,i], file=f)
    precision[i], recall[i], _ = precision_recall_curve(Y_test[:, i],
                                                        y_score[:, i])
    #print "here"
    print(precision[i], file=f)
    average_precision[i] = average_precision_score(Y_test[:, i], y_score[:, i])
    print(average_precision[i], file=f)


# Plot Precision-Recall curve
plt.clf()
plt.plot(recall[0], precision[0], label='Precision-Recall curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('Precision-Recall example: AUC={0:0.2f}'.format(average_precision[0]))
plt.legend(loc="lower left")
plt.show()


IndexError: too many indices for array

In [14]:
# Set of all possible tags

def set_of_tags(tag_column):
    tags = []
    for row in tag_column:
        for t in row:
            if t not in tags:
                tags.append(t)
    return tags