In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('data_rake.csv', encoding='ISO-8859-1')

df = df[['Title', 'Body', 'Tags']]

df.iloc[0]

Title             calling twain driver 64 bite application
Body     advices 3rd party vendors provide 64 bite twai...
Tags                                         c# .net twain
Name: 0, dtype: object

In [3]:
title = df['Title'].tolist()
body = df['Body'].tolist()
tag = df['Tags'].tolist()

In [17]:
questions = [str(t) + str(b) for t, b in zip(title, body)]

In [18]:
questions = questions[:10000]

In [21]:
import nltk
from nltk.corpus import stopwords

def leaves(tree):
    """Finds NP (nounphrase) leaf nodes of a chunk tree."""
    for subtree in tree.subtrees(filter = lambda t: t.label()=='NP'):
        yield subtree.leaves()

def normalise(word, lemmatizer):
    """Normalises words to lowercase and stems and lemmatizes it."""
    word = word.lower()
    # word = stemmer.stem_word(word) #if we consider stemmer then results comes with stemmed word, but in this case word will not match with comment
    word = lemmatizer.lemmatize(word)
    return word

def acceptable_word(word, stopwords):
    """Checks conditions for acceptable word: length, stopword. We can increase the length if we want to consider large phrase"""
    accepted = bool(2 <= len(word) <= 40
        and word.lower() not in stopwords)
    return accepted


def get_terms(tree, stopwords, lemmatizer):
    for leaf in leaves(tree):
        term = [ normalise(w, lemmatizer) for w,t in leaf if acceptable_word(w, stopwords) ]
        yield term
        
def extract_noun_phrase(text):
    sentence_re = r'(?:(?:[A-Z])(?:.[A-Z])+.?)|(?:\w+(?:-\w+)*)|(?:\$?\d+(?:.\d+)?%?)|(?:...|)(?:[][.,;"\'?():-_`])'
    lemmatizer = nltk.WordNetLemmatizer()
    stemmer = nltk.stem.porter.PorterStemmer()
    grammar = r"""
        NBAR:
            {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns

        NP:
            {<NBAR>}
            {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
    """
    chunker = nltk.RegexpParser(grammar)
    toks = nltk.regexp_tokenize(text, sentence_re)
    postoks = nltk.tag.pos_tag(toks)

    tree = chunker.parse(postoks)
    stop_words = stopwords.words('english')

    terms = get_terms(tree, stop_words, lemmatizer)
    
    return terms

In [None]:
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import pdist
import networkx as nx
from itertools import combinations

def vectorize_keyphrase_candidates(keyphrase_candidates):
    n = len(keyphrase_candidates) # the number of candidates
    vocabulary = set()
    for cand in keyphrase_candidates:
        for word in cand:
            vocabulary.add(word)
    vocabulary = list(vocabulary)
    
    X = np.zeros((n, m))
    for i, k in enumerate(keyphrase_candidates):
        for w in keyphrase_candidates[k]:
            X[i][vocabulary.index(w)] += 1
            
    keyphrase_candidates = [' '.join(cand) for cand in keyphrase_candidates]
    return X, keyphrase_candidates

def topic_clustering(keyphrase_candidates, X, threshold, method):
    Y = pdist(X, 'jaccard')
    Z = linkage(Y, method=method)
    clusters = fcluster(Z, t=threshold, criterion='distance')
    
    topics = []
    cand_to_topic = dict()
    
    for cluster_id in range(1, max(clusters) + 1):
        topics.append([keyphrase_candidates[j] for j in range(len(clusters)) if clusters[j] == cluster_id])
        
    for i, cluster_id in enumerate(clusters):
        cand_to_topic[keyphrase_candidates[i]] = cluster_id
        
    return topics, cand_to_topic

def build_graph(keyphrase_candidates, topics, cand_to_topic):
    G = nx.graph()
    
    G.add_nodes_from(keyphrase_candidates)
    
    for node_i, node_j in combinations(keyphrase_candidates, 2):
        # discard intra-topic edges
        if cand_to_topic[node_i] == cand_to_topic[node_j]:
            continue
            
        weights = []
        for p_i in self.candidates[node_i].offsets:
            for p_j in self.candidates[node_j].offsets:

                # compute gap
                gap = abs(p_i - p_j)

                # alter gap according to candidate length
                if p_i < p_j:
                    gap -= len(self.candidates[node_i].lexical_form) - 1
                if p_j < p_i:
                    gap -= len(self.candidates[node_j].lexical_form) - 1

                weights.append(1.0 / gap)

        # add weighted edges 
        if weights:
            # node_i -> node_j
            G.add_edge(node_i, node_j, weight=sum(weights))
            # node_j -> node_i
            G.add_edge(node_j, node_i, weight=sum(weights))

In [22]:
terms = extract_noun_phrase(questions[0])
for term in terms:
    print(term)

['twain']
['bite', 'applicationadvices']
['party', 'vendor']
['bite', 'twain', 'implementation']
['twain', 'api', 'net', 'application', 'compile', 'x64', 'target', 'platform']


In [23]:
questions[0]

'calling twain driver 64 bite applicationadvices 3rd party vendors provide 64 bite twain implementations need call twain api net c application compile x64 target platform'

______

In [19]:
# Reference: https://github.com/boudinfl/ pke

import pke
import string
from nltk.corpus import stopwords

def extract_keyphrases(questions):
    results = []
        
    for question in questions:
        try:
            extractor = pke.unsupervised.MultipartiteRank()
            extractor.load_document(input=question)
            pos = {'NOUN', 'PROPN', 'ADJ'}
    #         pos = {'NOUN'}
            stoplist = list(string.punctuation)
            stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
            stoplist += stopwords.words('english')
            extractor.candidate_selection(pos=pos, stoplist=stoplist)

            extractor.candidate_weighting(alpha=1.1,
                                                  threshold=0.74,
                                                  method='average')

            keyphrases = extractor.get_n_best(n=3)

            results.append([k for (k, score) in keyphrases])
        except Exception as e:
            results.append([])
        
    return results

In [20]:
results = extract_keyphrases(questions)



In [12]:
print(results[:10])

[['bite applicationadvices', '3rd party vendors', 'twain api net'], ['variable', 'struct', 'define managerh file instantiate managerc'], ['val jcropapisetselect', 'click function', 'input text field problem'], ['workfirst name label', 'month php echo row', 'input name bfname'], ['descend order thisnamecompareto sgetname return', 'ways', 'sort list edge name'], ['multiple definitions function error', 'first define', 'ang3'], ['accentuate letter column webservice', 'call webservice j2me application exception', 'select columns'], ['url http abccomradiofrmypersonaspx', 'jquery change replace urlprinciple script', 'abccomradiofrintranetlayoutsrfportalwebpersonaspx accountnameradiofradministrateur'], ['cv mat', 'dimension matrix use svdtry', 'simple source code example'], ['auto', 'value return ajax call', 'textbox restrict user']]


In [15]:
tags = [str(t).split() for t in tag]
print(tags[:10])

[['c#', '.net', 'twain'], ['c', 'struct'], ['javascript', 'jquery', 'html'], ['javascript', 'php', 'html'], ['java'], ['c++', 'object', 'makefile'], ['php', 'mysql', 'web-services', 'java-me', 'wsdl'], ['jquery', 'replace'], ['c++', 'opencv', 'matrix', 'dimensionality-reduction'], ['javascript', 'ajax', 'jquery', 'autocomplete', 'autosuggest']]


In [22]:
precisions = []
recalls = []

for i in range(len(results)):
    if len(results[i]) > 0:
        num_same = 0
        for phrase in results[i]:
            num_same += len(set(phrase.split()) & set(tags[i]))
        precisions.append(num_same / float(len(results[i])))
        recalls.append(num_same / float(len(tags[i])))
    
precision = np.mean(precisions)
recall = np.mean(recalls)
f1 = 2 * precision * recall / (precision + recall)

print("Precision: ", precision)
print("Recall: ", recall)
print(f1)

Precision:  0.15128719491808756
Recall:  0.15093614175860914
0.1511114644521724
