In [1]:
# Reference used: https://www.geeksforgeeks.org/tf-idf-for-bigrams-trigrams/
import nltk 
import re 
import math
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity 
from sklearn.metrics.pairwise import linear_kernel
from sklearn.preprocessing import normalize
from sklearn.linear_model import Ridge
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.util import ngrams
import pandas as pd   
import jsonlines
import json
from tqdm.notebook import tqdm
import pickle
from scipy.sparse import csr_matrix

In [2]:
# Load the third-party `jupyternotify` IPython extension.
# NOTE(review): none of the visible cells use a magic from this extension
# (presumably it was used for completion notifications while developing) -
# confirm whether it is still needed; the notebook fails here if it is not installed.
%load_ext jupyternotify

<IPython.core.display.Javascript object>

In [3]:
# Fetch the NLTK resources used below: tokenizer models for word_tokenize
# and the English stop-word list.
nltk.download('punkt')
# Newer NLTK releases (3.8.2+) load the Punkt tokenizer from the 'punkt_tab'
# resource instead of the pickled 'punkt' models; download both so the notebook
# runs from a fresh kernel regardless of the installed NLTK version.
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\RAGHAV\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\RAGHAV\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Registries filled while streaming the profile records from disk.
documents = []              # first record seen for each distinct user, in file order
docID = {}                  # row index -> user id
userIndex = {}              # user id -> row index
term_total_citations = {}   # populated later by update_term_citations
author_term_citations = {}  # user id -> {term: citations}; populated later
idx = 0
with jsonlines.open('data_india_sample.jl') as reader:
    for profile in tqdm(reader, leave=False):
        user = profile['user']
        if user in userIndex:
            # Duplicate record for an already-registered user: keep the first one only.
            continue
        documents.append(profile)
        docID[idx] = user
        userIndex[user] = idx
        author_term_citations[user] = {}
        idx += 1

0it [00:00, ?it/s]

In [5]:
# Code Taken From: https://www.geeksforgeeks.org/tf-idf-for-bigrams-trigrams/
def remove_string_special_characters(s):
    """Normalise ``s`` to lower-case ASCII letters separated by single spaces.

    Every character that is not an ASCII letter or whitespace (digits,
    punctuation, underscores, brackets, ...) is replaced by a space, runs of
    whitespace are collapsed to one space, and leading/trailing whitespace is
    removed.

    Parameters
    ----------
    s : str
        Raw text.

    Returns
    -------
    str
        The cleaned, lower-cased text. An empty string is returned when
        nothing but special characters/whitespace was present, so callers can
        always pass the result on to a tokenizer.
    """
    # The class must be ``A-Z``: the range ``A-z`` also covers the ASCII
    # characters between 'Z' and 'a' ([ \ ] ^ _ `), which would then survive.
    stripped = re.sub(r'[^a-zA-Z\s]', ' ', s)

    # Change any white space to one space, then trim both ends.
    stripped = re.sub(r'\s+', ' ', stripped).strip()

    return stripped.lower()

In [6]:
# Code Reference: https://www.geeksforgeeks.org/python-stemming-words-with-nltk/
def stem_string(sentence):
    """Tokenize ``sentence`` with NLTK and return its Porter stems joined by single spaces."""
    stemmer = PorterStemmer()
    return " ".join(stemmer.stem(token) for token in word_tokenize(sentence))

In [7]:
def remove_stop_words(sentence):
    """Return ``sentence`` with every token found in NLTK's English stop-word list dropped.

    The sentence is split with ``word_tokenize`` and the surviving tokens are
    re-joined with single spaces. Matching is exact (case-sensitive).
    """
    english_stop_words = frozenset(stopwords.words('english'))
    kept_tokens = filter(lambda token: token not in english_stop_words,
                         word_tokenize(sentence))
    return " ".join(kept_tokens)

In [8]:
def process_string(sentence):
    """Run the text-normalisation pipeline used for both indexing and querying.

    Steps: strip special characters and lower-case, remove English stop words,
    then Porter-stem the remaining tokens.

    Parameters
    ----------
    sentence : str
        Raw text (subject name, paper title, query, ...).

    Returns
    -------
    str
        Space-separated stems; ``""`` when the input holds no usable letters.
    """
    cleaned = remove_string_special_characters(sentence)
    if not cleaned:
        # The cleaner yields None/"" for text without letters; tokenizing
        # None raises a TypeError, so short-circuit here.
        return ""
    # Stop words must be removed BEFORE stemming: the stop list holds surface
    # forms ("this", "was", "has"), while their Porter stems ("thi", "wa",
    # "ha") are not in the list and would otherwise leak into the index.
    cleaned = remove_stop_words(cleaned)
    return stem_string(cleaned)

## TF-IDF Matrix

In [9]:
def update_term_citations(document, sentence, numCitations, userID):
    """Credit ``numCitations`` to every distinct 1- to 4-gram of ``sentence``.

    The sentence is lower-cased, non-alphanumeric characters are replaced by
    spaces and the text is split on spaces. Each distinct n-gram is counted
    once per call (per value of n) and updates two module-level dictionaries:

    * ``author_term_citations[userID][term]`` - citations of this author for the term;
    * ``term_total_citations[term]`` - citations of the term over all authors.

    Parameters
    ----------
    document : dict
        Unused; kept so existing call sites keep working.
    sentence : str
        Already pre-processed text (subject or paper title + venue).
    numCitations : number
        Citation count to add for each distinct n-gram.
    userID : hashable
        Key into ``author_term_citations``; must already be registered there.
    """
    sentence = re.sub(r'[^a-zA-Z0-9\s]', ' ', sentence.lower())
    tokens = [token for token in sentence.split(" ") if token != ""]
    for n in range(1, 5):
        # A set gives O(1) duplicate checks; the original list made this
        # de-duplication quadratic in the number of n-grams.
        seen_ngms = set()
        for ngm in ngrams(tokens, n):
            gm = " ".join(ngm)
            if gm in seen_ngms:
                continue
            seen_ngms.add(gm)

            author_terms = author_term_citations[userID]
            author_terms[gm] = author_terms.get(gm, 0) + numCitations

            # The first sighting of a term seeds its global total with an extra 1
            # (presumably so the ratio computed from it later never divides by
            # zero - confirm); later sightings just add the citations.
            term_total_citations[gm] = term_total_citations.get(gm, 1) + numCitations

In [10]:
# Build one pseudo-document per author (processed subjects + paper titles/venues)
# and accumulate the per-term citation counts along the way.
word_documents = []
for document in tqdm(documents, leave=False):
    sentences = []
    for subject in document['subjects']:
        sentence = process_string(subject)
        sentences.append(sentence)
        # Subjects are credited with the author's overall citation count.
        update_term_citations(document, sentence, int(document['citationsAll']), document['user'])
    for paper in document['papers']:
        try:
            sentence = paper['title'] + " " + paper['conference']
            sentence = process_string(sentence)
        except (KeyError, TypeError):
            # Skip papers whose title/conference is missing or not text.
            # Only these two errors are swallowed: the previous bare `except`
            # also hid unrelated failures (missing NLTK data, Ctrl-C, ...).
            continue
        # Paper text is credited with that paper's own citation count.
        update_term_citations(document, sentence, paper['citations'], document['user'])
        sentences.append(sentence)

    word_documents.append(".\n".join(sentences))

  0%|          | 0/994 [00:00<?, ?it/s]

In [None]:
def _feature_names(fitted_vectorizer):
    """Return the vocabulary of a fitted vectorizer as a list, on old and new scikit-learn.

    ``get_feature_names`` was removed in scikit-learn 1.2 in favour of
    ``get_feature_names_out``; use whichever the installed version provides.
    """
    getter = getattr(fitted_vectorizer, "get_feature_names_out", None)
    if getter is None:
        getter = fitted_vectorizer.get_feature_names
    return list(getter())


# Fit a TF-IDF vectorizer only to obtain the IDF weights of every 1- to 4-gram.
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,4), analyzer='word')
tfidf_matrix = vectorizer.fit_transform(word_documents)
features = _feature_names(vectorizer)
idf_scores = vectorizer.idf_

# Raw term counts with the same settings, so the columns line up with idf_scores.
cnt_vct = CountVectorizer(stop_words='english', ngram_range=(1,4), analyzer='word')
cnt_matrix = cnt_vct.fit_transform(word_documents)
cnt_features = _feature_names(cnt_vct)
assert features == cnt_features, "count and tf-idf vocabularies must be identical"

# Un-normalised tf-idf = count * idf. The (1 x n_terms) IDF row is broadcast
# over all rows in a single sparse multiply; assigning row by row into a CSR
# matrix computes the same values but is far slower.
idf_scores = csr_matrix(idf_scores)
tfidf_new_matrix = cnt_matrix.astype(np.float64).multiply(idf_scores).tocsr()
print(tfidf_new_matrix.shape)

### Multiply the Citations Log Factor

In [None]:
# Column lookup: vocabulary term -> column index of tfidf_new_matrix.
feature_idx = {feature: position for position, feature in enumerate(cnt_features)}

# Scale every non-zero (author, term) weight by log2(1 + share), where share is
# the author's citations for the term divided by the term's recorded total.
# Terms that are not in the vectorizer vocabulary are left untouched.
for author, author_dict in tqdm(author_term_citations.items(), leave=False):
    for term, term_citations in author_dict.items():
        term_idx = feature_idx.get(term)
        if term_idx is None:
            continue
        author_idx = userIndex[author]
        if not tfidf_new_matrix[author_idx, term_idx]:
            continue
        citation_share = term_citations / term_total_citations[term]
        log_factor = math.log(1 + citation_share, 2)
        tfidf_new_matrix[author_idx, term_idx] *= log_factor

## Activeness

In [None]:
# TODO: check index orders
#
# Layout of mat_activeness:
#   rows    -> professors, in the order of `documents`
#   columns -> cumulative citations of papers published in
#              maxyr, maxyr + (maxyr-1), maxyr + (maxyr-1) + (maxyr-2), ...

maxrng=20
minyr,maxyr=2001,2020
mat_activeness = np.zeros((len(documents), maxrng))
for row, document in enumerate(tqdm(documents, leave=False)):
    for paper in document['papers']:
        paperyr = paper['year']
        if minyr <= paperyr <= maxyr:
            # Column 0 holds the most recent year; older papers go further right.
            mat_activeness[row, maxyr - paperyr] += paper['citations']

# Turn the per-year totals into running totals along each row.
mat_activeness = np.cumsum(mat_activeness, axis=1)
            

## Slope, H-index, Institute Reputation

In [None]:
def slope_calc(citations):
    """Return the slope of a ridge-regularised line fitted through ``citations``.

    The i-th value is placed at x = i and a model ``y = b + m * x`` is fitted
    with ``Ridge(alpha=10, fit_intercept=False)`` on the design matrix
    ``[1, x]``; because the intercept is an ordinary column of the design
    matrix it is shrunk by the penalty together with the slope.

    Parameters
    ----------
    citations : sequence of numbers
        Citation counts in chronological order.

    Returns
    -------
    float
        The fitted slope ``m``; ``0.0`` for an empty sequence (no history,
        no trend) instead of letting the regressor fail on zero samples.
    """
    n = len(citations)
    if n == 0:
        return 0.0
    # Columns: constant 1 (intercept term) and the position 0..n-1.
    design = np.column_stack([np.ones(n), np.arange(n)])
    clf = Ridge(alpha=10, fit_intercept=False)
    clf.fit(design, citations)
    return clf.coef_[1]

def get_insti_ranks(files=("india_institutes.txt", "britain_institutes.txt", "america_institutes.txt"),
                    base_dir="web_data"):
    """Map institute names to a reputation score derived from their position in ranking files.

    Each file lists one institute per line, best first. Within a file the
    first institute scores 100 and the last scores 1, with the others spaced
    evenly in between. Blank lines are ignored. If a name occurs in several
    files, the score from the file processed last wins.

    Parameters
    ----------
    files : iterable of str, optional
        File names to read, relative to ``base_dir``.
    base_dir : str, optional
        Directory containing the ranking files.

    Returns
    -------
    dict
        ``{institute name: score}``.
    """
    maxrank, minrank = 100, 1
    insti_score = dict()
    for file in files:
        path = base_dir + "/" + file
        with open(path, "r") as f:
            # Strip first, then drop empties: after strip() a blank line is ''
            # (never '\n'), so comparing against '\n' would keep it and rank an
            # institute named ''.
            insti = [line.strip() for line in f if line.strip()]
        n = len(insti)
        # A single-entry file has no spacing to compute (avoids dividing by zero).
        diff = (maxrank - minrank) / (n - 1) if n > 1 else 0
        cur = maxrank
        for institute in insti:
            insti_score[institute] = cur
            cur -= diff
    return insti_score
                         
# get_insti_ranks()                         
# slope_calc([3, 9, 14, 17, 32, 0, 49, 67, 62, 0, 94, 129, 42])


In [None]:
# One row per professor (same order as `documents`).
# Columns -> slope of citations, hindex, institute reputation
ranking_metrics = np.zeros((len(documents),3))

# Column 0: slope of the yearly citation counts.
# ref: https://www.varsitytutors.com/hotmath/hotmath_help/topics/line-of-best-fit
# NOTE(review): the original comment says the current year is omitted, but no
# entry is dropped here - presumably 'yearCitations' already excludes it; confirm.
for row, document in enumerate(tqdm(documents, leave=False)):
    # Each entry of 'yearCitations' is a JSON string carrying a 'citations' field.
    yearly_citations = [json.loads(entry)['citations'] for entry in document['yearCitations']]
    ranking_metrics[row, 0] = slope_calc(yearly_citations)

# Column 1: h-index.
for row, document in enumerate(tqdm(documents, leave=False)):
    ranking_metrics[row, 1] = document["h-indexAll"]

# Column 2: institute reputation score looked up by institute name.
insti_ranks = get_insti_ranks()
for row, document in enumerate(tqdm(documents, leave=False)):
    ranking_metrics[row, 2] = insti_ranks[document['institute']]
                               

### Pickle the Matrix and Vectorizer

In [None]:
# Persist every artefact the query side needs. Each file is written inside a
# `with` block so the handle is flushed and closed deterministically instead of
# being left to the garbage collector.
for artifact_path, artifact in (
    ("web_data/count_vectorizer.pkl", cnt_vct),
    ("web_data/tfidf_new_matrix.pkl", tfidf_new_matrix),
    ("web_data/doc_id_dict.pkl", docID),
    ("web_data/mat_activeness.pkl", mat_activeness),
    ("web_data/ranking_metrics.pkl", ranking_metrics),
):
    with open(artifact_path, "wb") as artifact_file:
        pickle.dump(artifact, artifact_file)

### Query Subjects

In [None]:

# def query_subject(subject="data mining"):
    
#     # tf-idf
#     query = process_string(query)
#     cnt_matrix_query = cnt_vct.transform([query])
#     cnt_matrix_query = csr_matrix.transpose(cnt_matrix_query)


#     res = tfidf_new_matrix * cnt_matrix_query
#     res = csr_matrix.transpose(res)
#     res = res.toarray()[0]
#     related_docs_indices = res.argsort()
#     related_docs_indices = related_docs_indices[::-1]
#     related_results = [docID[i] for i in related_docs_indices]
    
#     # activness
#     activeness_factor = 2
    
    
#     return related_results

In [None]:
# query_subject()

### Pickled Testing

In [None]:
def _load_pickle(path):
    """Load one pickled artefact, closing the file handle afterwards.

    SECURITY: unpickling executes arbitrary code from the file; only load
    files produced by this notebook, never untrusted ones.
    """
    with open(path, "rb") as artifact_file:
        return pickle.load(artifact_file)


# Reload the artefacts exactly as the query side would see them.
loaded_cnt_vct = _load_pickle("web_data/count_vectorizer.pkl")
loaded_tfidf_new_matrix = _load_pickle("web_data/tfidf_new_matrix.pkl")
loaded_docID = _load_pickle("web_data/doc_id_dict.pkl")
loaded_mat_activeness = _load_pickle("web_data/mat_activeness.pkl")
loaded_ranking_metrics = _load_pickle("web_data/ranking_metrics.pkl")

In [None]:
def get_tf_idf_vec(query):
    """Score every indexed author against ``query``.

    The query goes through ``process_string`` and is turned into a term-count
    vector with the pickled count vectorizer; the returned 1-D array holds, per
    row of the pickled tf-idf matrix, the dot product of that row with the
    query counts.
    """
    query_counts = loaded_cnt_vct.transform([process_string(query)])
    # (n_authors x n_terms) * (n_terms x 1) -> one score per author.
    scores = loaded_tfidf_new_matrix * query_counts.transpose()
    return scores.transpose().toarray()[0]

def get_active_vec(year):
    """Return, per author, the cumulative citations over the last ``year`` years.

    Parameters
    ----------
    year : int
        Window length; ``1`` selects the first column of the pickled
        activeness matrix, ``2`` the second, and so on.

    Returns
    -------
    numpy.ndarray
        1-D array with one value per author.

    Raises
    ------
    IndexError
        If ``year`` is outside ``1 .. number of columns``. Without this check
        a value <= 0 becomes a negative column index and silently returns a
        column from the other end of the matrix.
    """
    n_columns = loaded_mat_activeness.shape[1]
    if not 1 <= year <= n_columns:
        raise IndexError(
            "year must be between 1 and {} (got {!r})".format(n_columns, year)
        )
    return loaded_mat_activeness[:, year - 1].squeeze()

def get_slope_vec():
    """Return column 0 (slope of citations) of the pickled ranking metrics as a squeezed array."""
    slope_column = loaded_ranking_metrics[:, 0]
    return slope_column.squeeze()

def get_hindex_vec():
    """Return column 1 (h-index) of the pickled ranking metrics as a squeezed array."""
    hindex_column = loaded_ranking_metrics[:, 1]
    return hindex_column.squeeze()
    
def get_insti_vec():
    """Return column 2 (institute reputation) of the pickled ranking metrics as a squeezed array."""
    reputation_column = loaded_ranking_metrics[:, 2]
    return reputation_column.squeeze()

def normalise(x):
    """Scale ``x`` to unit Euclidean length.

    Parameters
    ----------
    x : array-like
        Vector to scale.

    Returns
    -------
    numpy.ndarray
        ``x / ||x||``. A vector whose norm is 0 (e.g. a query that matches no
        indexed term) is returned as all zeros instead of being divided by
        zero, which would produce NaNs and corrupt every score built from it.
    """
    norm = np.linalg.norm(x)
    if norm == 0:
        return np.zeros(np.shape(x), dtype=float)
    return x / norm


def query_subject(user_input):
    """Rank the indexed authors for a query using a weighted mix of five signals.

    Parameters
    ----------
    user_input : dict
        Must contain:

        * ``'query_string'`` - free-text query;
        * ``'active_yr'`` - window (in years) passed to ``get_active_vec``;
        * ``'tfidf_score'``, ``'active_score'``, ``'hindex_score'``,
          ``'slope_score'``, ``'insti_score'`` - relative weights of the
          text-relevance, activeness, h-index, citation-slope and
          institute-reputation signals.

    Returns
    -------
    list
        Author ids (values of the pickled doc-id dictionary) ordered from the
        highest to the lowest combined score.

    Notes
    -----
    The weights are rescaled so they sum to 100. If they sum to 0 the whole
    weight goes to text relevance instead of dividing by zero.
    """
    # parameters
    params = ['tfidf', 'active', 'hindex', 'slope', 'insti']

    # set weight factors
    ranks = {param: {'wt': user_input[param + '_score']} for param in params}

    # scale up so the weights sum to 100
    total_user_input_wt = sum(val['wt'] for val in ranks.values())
    if total_user_input_wt == 0:
        # No preference expressed: rank on text relevance only.
        for val in ranks.values():
            val['wt'] = 0
    else:
        factor = 100/total_user_input_wt
        for val in ranks.values():
            val['wt'] *= factor

    # tfidf is remaining wt
    ranks['tfidf']['wt'] = 100
    for key, val in ranks.items():
        if key != 'tfidf':
            ranks['tfidf']['wt'] -= val['wt']

    print(ranks)

    # unit-length score vector of every signal (one entry per author)
    ranks['tfidf']['vec'] = normalise(get_tf_idf_vec(user_input['query_string']))
    ranks['active']['vec'] = normalise(get_active_vec(user_input['active_yr']))
    ranks['slope']['vec'] = normalise(get_slope_vec())
    ranks['hindex']['vec'] = normalise(get_hindex_vec())
    ranks['insti']['vec'] = normalise(get_insti_vec())

    # final score
    final_sc = np.zeros(ranks['tfidf']['vec'].size)
    for val in ranks.values():
        if val['wt'] == 0:
            # A zero-weight signal contributes nothing; skipping it also keeps
            # any NaN it may contain from leaking in through 0 * NaN.
            continue
        final_sc += val['wt']*val['vec']

    # best score first
    indices = final_sc.argsort()[::-1]
    return [loaded_docID[i] for i in indices]

In [None]:
# Example request; in the application this dictionary is generated from user input.
input_obj = {
    'query_string': 'data mining',
    'active_yr': 5,
    'tfidf_score': 10,
    'active_score': 0,
    'hindex_score': 0,
    'slope_score': 0,
    'insti_score': 0,
}

print(query_subject(input_obj))

In [None]:
#old code

# query="data mining"
# query = process_string(query)
# cnt_matrix_query = loaded_cnt_vct.transform([query])
# cnt_matrix_query = csr_matrix.transpose(cnt_matrix_query)

# result = loaded_tfidf_new_matrix * cnt_matrix_query
# result = csr_matrix.transpose(result)
# result = result.toarray()[0]
# related_docs_indices = result.argsort()
# related_docs_indices = related_docs_indices[::-1]
# related_results = [loaded_docID[i] for i in related_docs_indices]