In [1]:
# Reference used: https://www.geeksforgeeks.org/tf-idf-for-bigrams-trigrams/
# import required packages
import nltk 
import re 
import math
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity 
from sklearn.metrics.pairwise import linear_kernel
from sklearn.preprocessing import normalize
from sklearn.linear_model import Ridge, LinearRegression
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.util import ngrams
import pandas as pd   
import jsonlines
import json
from tqdm.notebook import tqdm
import pickle
from scipy.sparse import csr_matrix
from difflib import SequenceMatcher
from heapq import nlargest as _nlargest

In [2]:
%load_ext jupyternotify

<IPython.core.display.Javascript object>

In [3]:
# Download nltk requirements
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\RAGHAV\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\RAGHAV\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
documents = [] #stores a list of dictionaries, each dictionary holds all information about a professor
prof_names = [] #stores a list of professor names
docID = {} # stores an document index to google scholar id mapping
userIndex = {} # stores a google scholar id to document index mapping
term_total_citations = {} 
author_term_citations = {}
idx = 0
with jsonlines.open('data_india_sample.jl') as reader:
    for obj in tqdm(reader, leave=False):
        if obj['user'] not in userIndex:
            documents.append(obj)
            prof_names.append(obj['name'])
            docID[idx] = obj['user']
            userIndex[obj['user']] = idx
            author_term_citations[obj['user']] = {}
            idx += 1

0it [00:00, ?it/s]

In [5]:
# Code Taken From: https://www.geeksforgeeks.org/tf-idf-for-bigrams-trigrams/
# Remove special characters from a string
def remove_string_special_characters(s):
    # removes special characters with ' '
    stripped = re.sub('[^a-zA-z\s]', ' ', s)
#     print(stripped)
    stripped = re.sub('_', ' ', stripped)
      
    # Change any white space to one space
    stripped = re.sub('\s+', ' ', stripped)
      
    # Remove start and end white spaces
    stripped = stripped.strip()
    if stripped != '':
            return stripped.lower()

In [6]:
# Code Reference: https://www.geeksforgeeks.org/python-stemming-words-with-nltk/
# Converts all words to their stem/root form
def stem_string(sentence):
    ps = PorterStemmer()
    
    words = word_tokenize(sentence)
    words = [ps.stem(word) for word in words]
    return " ".join(words)

In [7]:
# Removes stop words 
def remove_stop_words(sentence):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(sentence)
    words = [word for word in words if word not in stop_words]
    return " ".join(words)

In [8]:
# Processes the string
def process_string(sentence):
    sentence = remove_string_special_characters(sentence)
    sentence = stem_string(sentence)
    sentence = remove_stop_words(sentence)
    return sentence

In [9]:
# updates the no. of citations of a term for a professor as well as the total no. of citations of the term
def update_term_citations(document, sentence, numCitations, userID):
    sentence = sentence.lower()
    sentence = re.sub(r'[^a-zA-Z0-9\s]', ' ', sentence)
    tokens = [token for token in sentence.split(" ") if token != ""]
    for n in range(1, 5):
        seen_ngms = []
        ngms = list(ngrams(tokens, n))
        for ngm in ngms:
            gm = " ".join(ngm)
            if gm in seen_ngms:
                continue
            seen_ngms.append(gm)
            if gm in author_term_citations[userID]:
                author_term_citations[userID][gm] += numCitations
            else:
                author_term_citations[userID][gm] = numCitations
            if gm in term_total_citations:
                term_total_citations[gm] += numCitations
            else:
                term_total_citations[gm] = numCitations + 1

In [10]:
word_documents = []
for document in tqdm(documents, leave=False):
    sentences = []
    for subject in document['subjects']:
        sentence = process_string(subject)
        sentences.append(sentence)
        # update term citations using n-grams generated from subject of expertise
        update_term_citations(document, sentence, int(document['citationsAll']), document['user'])
    for paper in document['papers']:
        try:
            sentence = paper['title'] + " " + paper['conference']
            sentence = process_string(sentence)
        except: 
            continue
        # update term citations using n-grams generated by combining paper name and conference
        update_term_citations(document, sentence, paper['citations'], document['user'])
        sentences.append(sentence)

    word_documents.append(".\n".join(sentences))

  0%|          | 0/994 [00:00<?, ?it/s]

## TF-IDF Matrix

In [11]:
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,4), analyzer='word')
tfidf_matrix = vectorizer.fit_transform(word_documents) 
features = (vectorizer.get_feature_names())
idf_scores = vectorizer.idf_

cnt_vct = CountVectorizer(stop_words='english', ngram_range=(1,4), analyzer='word')
cnt_matrix = cnt_vct.fit_transform(word_documents)
cnt_features = cnt_vct.get_feature_names()

idf_scores = csr_matrix(idf_scores)

# tfidf_new_matrix is the required matrix
tfidf_new_matrix = csr_matrix.copy(csr_matrix.astype(cnt_matrix, dtype=np.float64))
for row in tqdm(range(cnt_matrix.shape[0]), leave=False):
    tfidf_new_matrix[row] = tfidf_new_matrix[row].multiply(idf_scores)

  0%|          | 0/994 [00:00<?, ?it/s]

### Modified TF-IDF, Multiply the Citations Log Factor

In [12]:
# Scaling the original tf-idf weights based on subjects of expertise and citation counts of papers
feature_idx = {}
for idx, feature in enumerate(cnt_features):
    feature_idx[feature] = idx

for author, author_dict in tqdm(author_term_citations.items(), leave=False):
    for term, term_citations in author_dict.items():
        if term in feature_idx:
            term_idx = feature_idx[term]
            author_idx = userIndex[author]
            if tfidf_new_matrix[author_idx, term_idx]:
                log_factor = math.log( 1 + (term_citations / term_total_citations[term]) , 2)
                tfidf_new_matrix[author_idx, term_idx] *= log_factor

  0%|          | 0/994 [00:00<?, ?it/s]

# Activeness 

In [13]:
maxrng=20
minyr,maxyr=2001,2020

# mat_activeness is the required matrix
# rows denote the professors 
# column1 stores total citations on papers in year 2020, 
# column2 stores total citations on papers in year 2020 and 2019, and so on ...
mat_activeness = np.zeros((len(documents),maxrng))

ind=0
for document in tqdm(documents, leave=False):
    papers=document['papers']
    for paper in papers:
        paperyr=paper['year']
        if paperyr>=minyr and paperyr<=maxyr:
            dis=maxyr-paperyr
            mat_activeness[ind,dis]+=paper['citations']
    ind+=1

ind=0
for i in range(mat_activeness.shape[0]):
    for j in range(mat_activeness.shape[1]):
        if j>0:
            mat_activeness[i,j]+=mat_activeness[i,j-1]
            

  0%|          | 0/994 [00:00<?, ?it/s]

## Slope, Hindex, Institute Reputation

In [14]:

# Calculate slope given a training set using Linear Regression 
def slope_calc(x,y):
    x,y = np.array(x),np.array(y)
    n=y.size
    clf = LinearRegression(fit_intercept=False)
    x = np.concatenate([np.ones((n,1)),x.reshape(-1,1)],axis=1)
#     print(xinds)
    clf.fit(x,y)
    return clf.coef_[1]

# Returns the ranks of each institute in the list
def get_insti_ranks():
    files = ["india_institutes.txt", "britain_institutes.txt", "america_institutes.txt"]
    insti_score = dict()
    for file in files:
        path = "web_data/"+file
        with open(path,"r") as f:
            insti = f.readlines()
            instinew=[]
            for institute in insti:
                instinew.append(institute.strip())
            insti = list(filter(lambda x:x!='\n',instinew))
            n = len(insti)
            maxrank, minrank = 100,1
            diff = (maxrank-minrank)/(n-1)
            cur=maxrank
            for institute in insti:
                insti_score[institute] = cur
                cur-=diff
    return insti_score


In [15]:
# ranking_metrics is the required matrix
# # rows denote the professors 
# column1 contains the slope of citations of the professor
# column2 contains the h-index of the professor
# column1 contains the institute reputation of the professor
ranking_metrics = np.zeros((len(documents),3))

# Compute and store slope of citations in ranking_metrics
ind=0
for document in tqdm(documents, leave=False):
    cits=[]
    years=[]
    for entry in document['yearCitations']:
        dictx = json.loads(entry)
        if dictx['year']<=2020:
            cits.append(dictx['citations'])
            years.append(dictx['year'])
    if len(years)<=5:
        ranking_metrics[ind,0]=0
    else:    
        ranking_metrics[ind,0]=slope_calc(years,cits)
    ind+=1           
min_slope =  np.min(ranking_metrics[:,0])
if min_slope<0:
    for i in range(ranking_metrics.shape[0]):
        ranking_metrics[i,0]+=min_slope
    
# Compute and store h-index in ranking_metrics
ind=0
for document in tqdm(documents, leave=False):
    ranking_metrics[ind,1]=document["h-indexAll"]
    ind+=1
                                                                        
# Compute and store institute reputation in ranking_metrics
insti_ranks = get_insti_ranks()
ind=0
for document in tqdm(documents, leave=False):
    ranking_metrics[ind,2] = insti_ranks[document['institute']]
    ind+=1                               

  0%|          | 0/994 [00:00<?, ?it/s]

  0%|          | 0/994 [00:00<?, ?it/s]

  0%|          | 0/994 [00:00<?, ?it/s]

## Pickle the Matrix and Vectorizer 
#### (storing the computed matrices for efficient computation while querying)

In [16]:
pickle.dump(cnt_vct, open("web_data/count_vectorizer.pkl", "wb"))
pickle.dump(tfidf_new_matrix, open("web_data/tfidf_new_matrix.pkl", "wb"))
pickle.dump(docID, open("web_data/doc_id_dict.pkl", "wb"))
pickle.dump(mat_activeness, open("web_data/mat_activeness.pkl", "wb"))
pickle.dump(ranking_metrics, open("web_data/ranking_metrics.pkl", "wb"))
pickle.dump(documents, open("web_data/documents.pkl", "wb"))
pickle.dump(prof_names, open("web_data/prof_names.pkl", "wb"))

### Subject Query

In [17]:
# Load the stored matrices during time of querying
loaded_cnt_vct = pickle.load(open("web_data/count_vectorizer.pkl", "rb"))
loaded_tfidf_new_matrix = pickle.load(open("web_data/tfidf_new_matrix.pkl", "rb"))
loaded_docID = pickle.load(open("web_data/doc_id_dict.pkl", "rb"))
loaded_mat_activeness = pickle.load(open("web_data/mat_activeness.pkl", "rb"))
loaded_ranking_metrics = pickle.load(open("web_data/ranking_metrics.pkl", "rb"))
loaded_documents = pickle.load(open("web_data/documents.pkl", "rb"))
loaded_prof_names = pickle.load(open("web_data/prof_names.pkl", "rb"))

In [18]:
# Returns tf-idf score vector
def get_tf_idf_vec(query):
    query = process_string(query)
    cnt_matrix_query = loaded_cnt_vct.transform([query])
    cnt_matrix_query = csr_matrix.transpose(cnt_matrix_query)

    result = loaded_tfidf_new_matrix * cnt_matrix_query
    result = csr_matrix.transpose(result)
    result = result.toarray()[0]
    
    return result

# Returns activeness score vector
def get_active_vec(year):
    col = year-1
    citations = loaded_mat_activeness[:,col].squeeze()
    return citations

# Returns slope of citations score vector
def get_slope_vec():
    return loaded_ranking_metrics[:,0].squeeze()

# Returns h-index score vector
def get_hindex_vec():
    return loaded_ranking_metrics[:,1].squeeze()

# Returns institute reputation score vector
def get_insti_vec():
    return loaded_ranking_metrics[:,2].squeeze()

# Normalises vector
def normalise(x):
    return x/np.linalg.norm(x)

# Return result of a subject query
def query_subject(user_input):
    
    #parameters
    params = ['tfidf', 'active', 'slope', 'hindex', 'insti']
    ranks={}
    for param in params:
        ranks[param]={}
    
    #set weight factors
    ranks['tfidf']['wt'] = user_input['tfidf_score']
    ranks['active']['wt'] = user_input['active_score']
    ranks['hindex']['wt'] = user_input['hindex_score']
    ranks['slope']['wt'] = user_input['slope_score']
    ranks['insti']['wt'] = user_input['insti_score']
    
    #scale up 
    total_user_input_wt = 0
    for key,val in ranks.items():
        total_user_input_wt += val['wt']
    factor = 100/total_user_input_wt
    for key,val in ranks.items():
        val['wt'] *= factor
    
#     print(ranks)
    
    #tf-idf
    ranks['tfidf']['vec'] = normalise(get_tf_idf_vec(user_input['query_string']))
#     print(ranks['tfidf']['vec'].shape)
    
    #activeness
    ranks['active']['vec'] = normalise(get_active_vec(user_input['active_yr']))
#     print(ranks['active']['vec'].shape)
    
    #slope of citations
    ranks['slope']['vec'] = normalise(get_slope_vec())
    
    #h-index
    ranks['hindex']['vec'] = normalise(get_hindex_vec())
    
    #institute reputation
    ranks['insti']['vec'] = normalise(get_insti_vec())
    
#     print(ranks['active']['vec'])
    
    #final score
    final_sc = np.zeros(ranks['tfidf']['vec'].size)
    for key,val in ranks.items():
        final_sc += val['wt']*val['vec']
        
    indices = final_sc.argsort()
    indices = indices[::-1]
    profs = [(i,loaded_documents[i],loaded_docID[i],final_sc[i]) for i in indices]
    metrics = []
    for ind in indices:
        metrics.append([ind,
                        ranks['tfidf']['vec'][ind],
                        ranks['active']['vec'][ind],
                        ranks['slope']['vec'][ind],
                       ranks['hindex']['vec'][ind],
                        ranks['insti']['vec'][ind],
                       ])
        
    return profs, metrics

In [19]:
# query object
input_obj = dict()
input_obj['query_string'] = 'machine learning'
input_obj['active_yr'] = 5
input_obj['tfidf_score'] = 1
input_obj['active_score'] = 0
input_obj['hindex_score'] = 0
input_obj['slope_score'] = 0
input_obj['insti_score'] = 10

# Prints google scholar ids of top 100 professors for the subject query
profs, _ = query_subject(input_obj)
top_100=[]
cnt=0
while cnt<100:
    top_100.append(profs[cnt][2])
    cnt+=1
print(top_100)

['0MNT3h8AAAAJ', 'hPvt6d8AAAAJ', 'a2Wgu0EAAAAJ', 'HTVepekAAAAJ', '1TgzFksAAAAJ', '3RND6lUAAAAJ', 'inrwXiIAAAAJ', 'cVdB1iwAAAAJ', 'mJIUC0UAAAAJ', 'leKXwz4AAAAJ', '3gHKDo4AAAAJ', 'VhoipoUAAAAJ', '3jc0vZsAAAAJ', 'N2O8hI4AAAAJ', 'slAHcFIAAAAJ', 'bwZHQ8EAAAAJ', '2PDhWl0AAAAJ', '7k6tzDAAAAAJ', 'sIQDguEAAAAJ', 'mb1pZXIAAAAJ', 'cv7Xbu8AAAAJ', 'H85F5q8AAAAJ', 'h55vyUQAAAAJ', '7D4ts0YAAAAJ', 'rNeidT4AAAAJ', 'X5Bjw-wAAAAJ', 'Rln7jKAAAAAJ', 'zlyCnPsAAAAJ', 'stMEHeUAAAAJ', '8mGCr2oAAAAJ', 'qXQPkuEAAAAJ', 'DGNTCYEAAAAJ', 'enGoLCQAAAAJ', '5EtPkJoAAAAJ', '1y6Z-ckAAAAJ', 'B_oQnccAAAAJ', '-875CKoAAAAJ', '8g3TkDAAAAAJ', 'qCzk-pYAAAAJ', 'dIypbs4AAAAJ', 'YGAmZF4AAAAJ', 'K834eKgAAAAJ', 'ppEhO9YAAAAJ', 'verRbuoAAAAJ', 'S8cDuZ4AAAAJ', '11MDgP4AAAAJ', 'cWcqrncAAAAJ', 'qvckJGkAAAAJ', 'CU5RlLYAAAAJ', '0u7mV4cAAAAJ', 'U1H8JZEAAAAJ', '_TpoQ08AAAAJ', 'mEs0PmoAAAAJ', 'sphOH2IAAAAJ', '3WEQbgMAAAAJ', 'R7pbH9YAAAAJ', '_GVZlQ4AAAAJ', 'QJNq4-YAAAAJ', 'cTkda4oAAAAJ', 'zYs3TaAAAAAJ', 'UerEKpIAAAAJ', 'bzOdHtwAAAAJ', 'mDMBpD

### Name Query

In [24]:
# Reference: https://stackoverflow.com/questions/50861237/is-there-an-alternative-to-difflib-get-close-matches-that-returns-indexes-l
# Return indices of the closes matches to the query
def get_close_matches_indexes(word, possibilities, n=10, cutoff=0.4):
    
    """Use SequenceMatcher to return a list of the indexes of the best 
    "good enough" matches. word is a sequence for which close matches 
    are desired (typically a string).
    possibilities is a list of sequences against which to match word
    (typically a list of strings).
    Optional arg n (default 3) is the maximum number of close matches to
    return.  n must be > 0.
    Optional arg cutoff (default 0.6) is a float in [0, 1].  Possibilities
    that don't score at least that similar to word are ignored.
    """

    if not n >  0:
        raise ValueError("n must be > 0: %r" % (n,))
    if not 0.0 <= cutoff <= 1.0:
        raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,))
    result = []
    s = SequenceMatcher()
    s.set_seq2(word)
    for idx, x in enumerate(possibilities):
        s.set_seq1(x)
        if s.real_quick_ratio() >= cutoff and \
           s.quick_ratio() >= cutoff and \
           s.ratio() >= cutoff:
            result.append((s.ratio(), idx))

    # Move the best scorers to head of list
    result = _nlargest(n, result)

    # Strip scores for the best n matches
    return [x for score, x in result]

# name query
name_query = "arvind"
inds = get_close_matches_indexes(name_query, loaded_prof_names)
res = [(loaded_prof_names[i],loaded_documents[i]) for i in inds]
print(res)

[('D Ravindran', {'country': 'india', 'org': '11063218275477151870', 'institute': 'Thiagarajar College of Engineering Madurai', 'name': 'D Ravindran', 'user': 'Kd7yUP4AAAAJ', 'homepage': 'Not Found', 'scholarPage': 'https://scholar.google.co.in/citations?user=Kd7yUP4AAAAJ&hl=en', 'imgLink': 'https://scholar.google.co.in/citations/images/avatar_scholar_128.png', 'verifiedEmail': True, 'subjects': ['Polymer composites', 'Polymer electrolytes', 'nano materials', 'natural fiber reinforced polymer composites'], 'yearCitations': ['{"year": 1995, "citations": 1}', '{"year": 1996, "citations": 2}', '{"year": 1997, "citations": 1}', '{"year": 1998, "citations": 0}', '{"year": 1999, "citations": 0}', '{"year": 2000, "citations": 0}', '{"year": 2001, "citations": 0}', '{"year": 2002, "citations": 0}', '{"year": 2003, "citations": 0}', '{"year": 2004, "citations": 0}', '{"year": 2005, "citations": 1}', '{"year": 2006, "citations": 0}', '{"year": 2007, "citations": 0}', '{"year": 2008, "citations":

In [23]:
print(inds)
print(loaded_prof_names)

[]
['A K Chakraborty', 'AJAY BISWAS', 'Abhijit Bhowmik', 'Abhik Majumder', 'Amitabha Nath', 'Amlanbrata Chakraborty', 'Anil S. Katarkar', 'Animesh Debnath', 'Anindita Jamatia', 'Anupam Jamatia', 'Apangshu Das', 'Apu Kumar Saha', 'Arindam Debnath', 'Arindam Majumder', 'Arnab Pal', 'Arup Ratan Bhowmik', 'Arvind Jain', 'Arvind Kumar', 'Ashmita Ghosh', 'Baby Bhattacharya', 'Biman Debbarma', 'Biplab Bhattacharjee', 'Biswajit Saha', 'DR UMESH MISHRA', 'DR. MUNESH CHANDRA TRIVEDI', 'Debanjali Nath', 'Debasish Bhattacharya', 'Dibyendu Ghoshal', 'Dipak Chandra Das', 'Diptesh Chanda', 'Dr. Abhijit Mondal', 'Dr. Abhishek Nag', 'Dr. Ardhendu Saha', 'Dr. Arup Ratan Bhowmik', 'Dr. Ashim Saha', 'Dr. Ashis Acharjee', 'Dr. Biswanath Bhunia', 'Dr. Manash Kumar Paul', 'Dr. PINKU DEBNATH', 'Dr. Rakesh Roy', 'Dr. Saurab Dhar', 'Dr. Soma Nag', 'Dr. Sujoy Chakraborty', 'Dr.Dijendra Nath Roy', 'Dr.Parthasarathi De', 'Dr.Rahul Banerjee', 'Dwijen Rudrapal', 'G.R.K. Sastry', 'Gouranga Mandal', 'Habila Basumatary