In [1]:
from __future__ import print_function
import numpy as np
import pandas as pd
import nltk
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
import io
stopwords = nltk.corpus.stopwords.words('english')
# load nltk's SnowballStemmer as variabled 'stemmer'
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")
from collections import defaultdict
rankings = defaultdict(dict)
import operator
# making  necessary imports


In [2]:
class Papers:                                   # Class for paper
    def __init__(self,contents,author,index,is_expert):
        self.author = author
        self.index = index
        self.expert = is_expert
        self.contents =contents
    def assignindex (self,index):
        self.index = index

In [3]:
class Experts:                               # Class for Expert 
    def __init__(self,paper,name):
        self.name = name
        self.papers = [paper]

In [4]:
def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent) if word not in stopwords]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems

In [5]:
def get_tfid(contents):                 # function to return Tfidf matrix of all papers         
    tfidf_vectorizer = TfidfVectorizer(max_df=0.93, max_features=20000,
                                     min_df=0.05, stop_words='english',lowercase = True,
                                     use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))
    tfidf_matrix = tfidf_vectorizer.fit_transform(contents) #fit the vectorizer to contents of papers
    print(tfidf_matrix.shape)
    return tfidf_matrix

In [6]:
def Kmeans (n_clusters,tfidf_matrix):                 # clusters Tfidf matrix
    km = KMeans(n_clusters=n_clusters)
    km.fit(tfidf_matrix)
    clusters = km.labels_.tolist()
    return clusters

In [7]:
def getfun(sim,expert,items):
    a = [sim[items[0],paper.index]for paper in expert.papers]
    if(a ==[]):
        return 0
    else:
        return max(a)
def semi_supervisedLearning(lables,sim,no_of_clusters,Experts):
    i=0
    indexed_lables = []
    for l in lables:
        indexed_lables.append((i,l))
        i=i+1
    for cluster in range(0,no_of_clusters):
        for expert in Experts:

            rankings[cluster][expert.name] = 0
    for cluster in range(0,no_of_clusters):
        for expert in Experts:
            for items in list(filter((lambda x : x[1] == cluster),indexed_lables)) :
                    rankings[cluster][expert.name] = rankings[cluster][expert.name] + (getfun(sim,expert,items))
    return rankings
                

In [8]:
file = open('evaluation/short_listed_revs.txt','r')                #contains  expert names
expert_names =file.readlines()                                  
reviewers_dict ={}                                   # a dictionary of expert object indexed by names


In [9]:
for author in expert_names:                                 
    author=(author.strip())                        # creating expert objects and adding then to dictionary
    with io.open('title_abstract_data/'+ author +'.txt','r',encoding='utf-8',errors='ignore') as infile, \
    io.open('temp.txt','w',encoding='utf-8',errors='ignore') as outfile:
        for line in infile:
            print(*line.split(), file=outfile)
    file1 = open("temp.txt","r")
    for eachline in file1.readlines():
        p1 = Papers(eachline,author,-1,True)
        reviewer = reviewers_dict.get(author)
        if(reviewer != None):
            reviewer.papers.append(p1)
        else:
            e1 = Experts(p1,author)
            reviewers_dict[author] = e1


In [10]:
file = open("evaluation/titles.txt", "r")               # now for papers for which we neeed recommendation
lines =file.readlines()

with io.open('evaluation/abstracts.txt','r',encoding='utf-8',errors='ignore') as infile, \
     io.open('temp.txt','w',encoding='utf-8',errors='ignore') as outfile:
    for line in infile:
        print(*line.split(), file=outfile)
file1 = open("temp.txt","r")
contents = file1.readlines()
research_papers =[]    #list of paper objects for which we need recommendation


In [11]:
for l, c in zip(lines, contents):        #adding to research paper list
    l=l.strip()
    content = str(l) + str(contents)
    p1 = Papers(content,-1,-1,False)
    research_papers.append(p1)

In [16]:
i=0
expertlist = list(reviewers_dict.values())       # assigning unique index to each paper
all_paper=[]
all_paper_objects=[]
for expert in expertlist:
    for paper in expert.papers:
        paper.assignindex(i)
        all_paper.append(paper.contents)
        all_paper_objects.append(paper)
        i=i+1

In [18]:
for paper in research_papers:
        paper.assignindex(i)
        all_paper.append(paper.contents)
        all_paper_objects.append(paper)
        i=i+1

In [20]:
print (len(all_paper))
print (len(all_paper_objects))

20754
20754


In [22]:
tfidf_mat = get_tfid(all_paper) 

(20754, 125)


In [23]:
similarity=(cosine_similarity(tfidf_mat));       # generating similarity matrix

In [24]:
labels = Kmeans(3,tfidf_mat)   

In [None]:
scores = semi_supervisedLearning(labels,similarity,3,expertlist)

In [None]:
cl1_recommend = sorted(scores[0].items(), key=operator.itemgetter(1) , reverse = True)
cl2_recommend = sorted(scores[1].items(), key=operator.itemgetter(1), reverse = True)
cl3_recommend = sorted(scores[2].items(), key=operator.itemgetter(1), reverse = True)

In [None]:
print (cl1_recommend[0:10])


In [None]:
print (cl2_recommend[0:10])

In [None]:
print (cl3_recommend[0:10])