In [None]:
import ast
import json
import math as math
import os
import pickle
import random as random
import sys

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns
from pandas.io.json import json_normalize

In [None]:
file_name = 'dblp-ref-'
num_files = 4
train_data = []

# Read every ./data/dblp-ref/dblp-ref-<i>.json file into its own DataFrame.
for i in range(num_files):
    # `lines` is a boolean switch (one JSON record per line); the original
    # passed the integer 2, which only worked because it is truthy.
    train_data.append(pd.read_json("./data/dblp-ref/" + file_name + str(i) + ".json", lines=True))

# Preview the first file (a bare `.head()` inside the loop displayed nothing).
train_data[0].head(3)

In [None]:
# Creating the Co-author dataset from the citation network

# Accumulators shared by every create_author_data() call:
#   author_data  - per-author record, keyed by the author's unique name
#   author_id    - id counter; 'curr' is the next id to hand out
#   assigned_ids - unique name -> assigned id
author_data = dict()
author_id = dict(start=1, curr=1)

assigned_ids = dict()

def _unique_author_name(full_name):
    """Collapse a full name to ``<first token>_<last token>`` (split on single spaces)."""
    parts = full_name.split(' ')
    return parts[0] + "_" + parts[-1]


def create_author_data(train_data, author_data, author_id, assigned_ids):
    """Fold one papers DataFrame into the per-author accumulators (in place).

    Parameters
    ----------
    train_data : DataFrame with ``authors`` (list of names per paper) and
        ``n_citation`` columns, indexed 0..len-1.
    author_data : dict, unique name -> author record; created/updated here.
    author_id : dict whose ``'curr'`` entry is the next id to assign.
    assigned_ids : dict, unique name -> assigned id; updated here.

    Each paper's citation count is split evenly between its authors. Papers
    whose ``authors`` value has no length (e.g. a missing value) are skipped.
    """
    for i in range(len(train_data)):
        authors = train_data.authors[i]
        paper_citations = train_data.n_citation[i]

        try:
            citations = paper_citations / len(authors)
        except (TypeError, ZeroDivisionError):
            # Missing author list (no len()) or an empty one: nothing to credit.
            # Only these two failures are expected; anything else should surface.
            continue

        unique_names = [_unique_author_name(author) for author in authors]
        for unique_name in unique_names:
            if unique_name not in author_data:
                author_data[unique_name] = {
                    'num_citations': citations,
                    'paper_count': 1,
                    'name': unique_name,
                    'author_id': author_id['curr'],
                    'co_authors': {},
                    'citations': [paper_citations]
                }
                assigned_ids[unique_name] = author_id['curr']
                author_id['curr'] += 1
            else:
                record = author_data[unique_name]
                record['num_citations'] += citations
                record['paper_count'] += 1
                record['citations'].append(paper_citations)

            # Every other author on this paper is a co-author (dict used as a set).
            for co_author_unique_name in unique_names:
                if co_author_unique_name != unique_name:
                    author_data[unique_name]['co_authors'][co_author_unique_name] = 1
                        
            
            
# Fold every loaded data file into the shared author accumulators.
for file_idx in range(num_files):
    create_author_data(train_data[file_idx], author_data, author_id, assigned_ids)

# Average (share-weighted) citations per paper for each author.
for author_record in author_data.values():
    author_record['average_citations'] = author_record['num_citations'] / author_record['paper_count']
    
# adding h-index
def get_h_index(citations):
    """Return the h-index for a list of per-paper citation counts.

    The counts are ranked in descending order; the result is the largest
    1-based rank whose count is at least that rank (0 when there is none).
    """
    h_index = 0
    for rank, count in enumerate(sorted(citations, reverse=True), start=1):
        if count >= rank:
            h_index = rank
    return h_index

# Build one output row per author. Rows are copies, so `author_data` keeps its
# name-keyed `co_authors` dict and this cell can be re-run safely (the original
# overwrote it with id lists, which made a second run fail with KeyError).
data_to_df = []
for each_author in author_data.values():
    co_author_names = each_author['co_authors']
    co_author_ids = [assigned_ids[co_author] for co_author in co_author_names]
    co_author_avg_citations = sum(
        author_data[co_author]['average_citations'] for co_author in co_author_names
    )

    author_row = dict(each_author)
    author_row['co_authors'] = co_author_ids
    # Mean of the co-authors' average citations; 0 for authors without co-authors.
    author_row['co_author_avg_citations'] = (
        co_author_avg_citations / len(co_author_ids) if co_author_ids else 0
    )
    data_to_df.append(author_row)

df = pd.DataFrame(data_to_df)

# h-index only needs the `citations` column, so apply on that column directly.
df['h_index'] = df['citations'].apply(get_h_index)
df.to_csv('./data/authors.csv', sep=',')

df.head(10)

In [None]:
# Adding the respective domains to each author row
authors_df = pd.read_csv('./data/authors_with_keywords.csv')
author_keyword_map = {}   # author_id -> set of keywords (split on ';')
co_authors_map = {}       # author_id -> set of co-author ids (ints)
for index, row in authors_df.iterrows():
    if not pd.isna(row['co_authors']):
        # The cell holds a list rendered as text, e.g. "[3, 17]".
        id_tokens = row['co_authors'].lstrip("[").rstrip("]").replace(" ", "").split(",")
        # An empty list "[]" leaves a single empty token; int('') would raise,
        # so empty tokens are dropped and authors without co-authors get no entry.
        co_authors = {int(token) for token in id_tokens if token}
        if co_authors:
            co_authors_map[row['author_id']] = co_authors
    if not pd.isna(row['keywords']):
        author_keyword_map[row['author_id']] = set(row['keywords'].split(';'))

In [None]:
# We first tried to cluster the authors using TF-IDF vectorization and k-means clustering, but
# it did not give good results, so we wrote our own algorithm instead. It is in the next block.
# all_keywords = []
# useless_ketwoeds = ['local', 'based', 'approach', 'user', 'environment', 'proposed', 'method', 'proposed',
#                    'technique', 'framework', ]
# for index, row in authors_df.iterrows():
#     if not pd.isna(row['keywords']):
#         keywords = row['keywords'].split(';')
#         for keyword in keywords:
#             all_keywords.append(keyword)

# vectorizer = TfidfVectorizer(stop_words='english')
# X = vectorizer.fit_transform(all_keywords)
# true_k = 20
# model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
# model.fit(X)
# print("Top terms per cluster:")
# order_centroids = model.cluster_centers_.argsort()[:, ::-1]
# terms = vectorizer.get_feature_names()
# for i in range(true_k):
#     print("Cluster %d:" % i)
#     for ind in order_centroids[i, :10]:
#         print(' %s' % terms[ind])
#     print()

In [None]:
# Dividing the authors into clusters using their respective domain key words
bucket_map = {}          # bucket id -> union of the keyword sets of its authors
author_bucket_map = {}   # author id -> bucket id
count_buckets = 0        # id that the next new bucket will get
for index, row in authors_df.iterrows():
    author_id = row['author_id']

    if author_id not in author_keyword_map:
        if author_id not in co_authors_map:
            # Neither own keywords nor co-authors: the author cannot be placed.
            continue
        # Borrow keywords from co-authors that have some (at most 10 are kept).
        author_keyword_set = set()
        for co_author_id in co_authors_map[author_id]:
            if co_author_id in author_keyword_map:
                author_keyword_set = author_keyword_set | author_keyword_map[co_author_id]
        # random.sample() needs a sequence: passing a set is deprecated since
        # Python 3.9 and raises TypeError from 3.11. Sorting gives a stable
        # population order, so a seeded run is reproducible.
        keywords_list = random.sample(sorted(author_keyword_set), min(len(author_keyword_set), 10))
        author_keyword_map[author_id] = set(keywords_list)

    keywords_set = author_keyword_map[author_id]

    # Find the bucket(s) sharing the most keywords with this author.
    max_intersect_len = 0
    intersecting_sets = []
    for key in bucket_map:
        len_intersection = len(keywords_set.intersection(bucket_map[key]))
        if len_intersection > max_intersect_len:
            max_intersect_len = len_intersection
            intersecting_sets = [key]
        elif len_intersection == max_intersect_len:
            intersecting_sets.append(key)

    if max_intersect_len <= 1:
        # At most one shared keyword is not enough overlap: open a new bucket.
        bucket_map[count_buckets] = keywords_set
        author_bucket_map[author_id] = count_buckets
        count_buckets += 1
    else:
        # Among equally good buckets take the one with the fewest keywords;
        # min() keeps the first of several equally small ones. This also
        # removes the dependency on `sys.maxsize` (sys was never imported).
        target_bucket = min(intersecting_sets, key=lambda bucket_id: len(bucket_map[bucket_id]))
        bucket_map[target_bucket] = bucket_map[target_bucket] | keywords_set
        author_bucket_map[author_id] = target_bucket

In [None]:
# Buckets that are kept as final clusters, and their cluster labels:
# the k-th bucket in the list (1-based) is labelled -k.
final_buckets_list = [111, 122, 3, 138, 228, 946, 294, 85, 54, 168, 159, 81, 118, 344]
final_buckets_map = {
    bucket_id: -position
    for position, bucket_id in enumerate(final_buckets_list, start=1)
}

In [None]:
def get_bucket_author_count_map(author_bucket_map):
    """Invert an author -> bucket mapping.

    Returns a pair ``(bucket_sizes, bucket_author_map)``:
    ``bucket_sizes`` is a list of ``(bucket, author_count)`` tuples ordered by
    descending count (ties keep first-seen order), and ``bucket_author_map``
    maps each bucket to the set of its authors.
    """
    bucket_author_map = {}
    for author, bucket in author_bucket_map.items():
        bucket_author_map.setdefault(bucket, set()).add(author)

    bucket_sizes = [(bucket, len(members)) for bucket, members in bucket_author_map.items()]
    bucket_sizes.sort(key=lambda pair: pair[1], reverse=True)
    return bucket_sizes, bucket_author_map

In [None]:
def re_order_map(author_bucket_map, bucket_map, author_keyword_map, bucket_author_map, count_buckets):
    """Move authors from non-final buckets into the best matching final bucket.

    For every author whose bucket is not in the module-level
    ``final_buckets_list``, the author's old bucket is removed from
    ``bucket_map`` and the author is reassigned to the final bucket sharing the
    most keywords with them (ties go to the bucket with the fewest keywords,
    first one on equal size). Authors sharing no keyword with any final bucket
    keep their old assignment.

    ``author_bucket_map`` and ``bucket_map`` are modified in place and also
    returned, together with the unchanged ``count_buckets``.
    """
    for bucket_id in bucket_author_map:
        if bucket_id in final_buckets_list:
            continue
        for author_id in bucket_author_map[bucket_id]:
            # The author's current (non-final) bucket is dissolved.
            bucket_map.pop(author_bucket_map[author_id], None)
            keywords_set = author_keyword_map[author_id]

            max_intersect_len = 0
            intersecting_sets = []
            for bucket_key in bucket_map:
                if bucket_key not in final_buckets_list:
                    continue
                len_intersection = len(keywords_set.intersection(bucket_map[bucket_key]))
                if len_intersection > max_intersect_len:
                    max_intersect_len = len_intersection
                    intersecting_sets = [bucket_key]
                elif len_intersection == max_intersect_len:
                    intersecting_sets.append(bucket_key)

            if max_intersect_len < 1:
                # No shared keyword with any final bucket: leave the author as is.
                continue

            # Smallest candidate wins. min() returns the first minimum, matching
            # the original scan without needing `sys.maxsize` (sys was never
            # imported, so the tie branch raised NameError).
            target_bucket = min(intersecting_sets, key=lambda key: len(bucket_map[key]))
            bucket_map[target_bucket] = bucket_map[target_bucket] | keywords_set
            author_bucket_map[author_id] = target_bucket
    return count_buckets, bucket_map, author_bucket_map

In [None]:
bucket_author_count_map, bucket_author_map = get_bucket_author_count_map(author_bucket_map)
# Keep folding non-final buckets into the final ones until at most 1000 remain.
while len(bucket_author_count_map) > 1000:
    assignments_before = dict(author_bucket_map)
    count_buckets, bucket_map, author_bucket_map = re_order_map(author_bucket_map, bucket_map, author_keyword_map, bucket_author_map, count_buckets)
    bucket_author_count_map, bucket_author_map = get_bucket_author_count_map(author_bucket_map)
    print(len(bucket_author_count_map))
    if author_bucket_map == assignments_before:
        # No author moved in this pass, so the final buckets did not grow and
        # every further pass would be identical: stop instead of looping forever.
        break

In [None]:
def add_cluster(row, author_bucket_map):
    """Return the final cluster label for ``row['author_id']``.

    The label comes from the module-level ``final_buckets_map``; authors that
    have no bucket, or whose bucket is not a final one, get 9999999.
    """
    author = row['author_id']
    if author not in author_bucket_map:
        return 9999999
    return final_buckets_map.get(author_bucket_map[author], 9999999)

# Label every author row with its cluster and persist the result.
authors_df['cluster_id'] = authors_df.apply(add_cluster, axis=1, args=(author_bucket_map,))
authors_df.to_csv('./data/authors_with_clusters_final.csv', sep=',')
authors_df.head()

In [None]:
author_cluster_info = pd.read_csv('./data/authors_with_clusters_final.csv')

# 9999999 marks authors without a final cluster; drop them.
author_cluster_info = author_cluster_info[author_cluster_info.cluster_id != 9999999]

author_cluster_info = author_cluster_info.reset_index()

# author name -> {'author_id', 'cluster_id'}.
# .tolist() yields plain Python scalars. The ids end up inside lists that are
# written to CSV as text; numpy scalars would be rendered as "np.int64(..)"
# under NumPy 2 and could no longer be parsed back with int().
author_cluster_map = {
    name: {'author_id': author_identifier, 'cluster_id': cluster_label}
    for name, author_identifier, cluster_label in zip(
        author_cluster_info['author_name'].tolist(),
        author_cluster_info['author_id'].tolist(),
        author_cluster_info['cluster_id'].tolist(),
    )
}
                                                             


def assign_cluster_to_paper(df, paper_cluster_map):
    """Append one record per paper of ``df`` to the list ``paper_cluster_map``.

    A paper gets the cluster that most of its known authors belong to (the
    first such cluster in author order on a tie). Author lookups use the
    module-level ``author_cluster_map``; authors missing from it are ignored.
    Papers without a usable author list, or with no known author, are skipped.

    ``df`` needs the columns ``authors``, ``title``, ``id`` and ``references``
    and a 0..len-1 index.
    """
    for i in range(len(df)):
        authors = df.authors[i]

        try:
            len(authors)
        except TypeError:
            # Missing author list (e.g. NaN/None): nothing to assign.
            continue

        cluster_cnt = {}
        author_id_list = []
        author_name_list = []
        for author in authors:
            names = author.split(' ')
            unique_name = names[0] + "_" + names[-1]
            known_author = author_cluster_map.get(unique_name)
            if known_author is None:
                continue
            cluster_id = known_author['cluster_id']
            cluster_cnt[cluster_id] = cluster_cnt.get(cluster_id, 0) + 1

            author_id_list.append(known_author['author_id'])
            author_name_list.append(unique_name)

        if not author_id_list:
            continue

        # Most frequent cluster; max() keeps the first one seen on a tie.
        to_assign_cluster = max(cluster_cnt, key=cluster_cnt.get)
        paper_cluster_map.append({
            'title': df.title[i],
            'paper_id': df.id[i],
            'cluster_id': to_assign_cluster,
            'authors_names': author_name_list,
            'authors_ids': author_id_list,
            'references': df.references[i]
        })
        
    
# Collect the paper -> cluster records of every data file, then persist them.
paper_cluster_map = []
for file_idx in range(num_files):
    assign_cluster_to_paper(train_data[file_idx], paper_cluster_map)

paper_cluster_file_name = "paper_cluster_map.csv"
paper_data_df = pd.DataFrame(paper_cluster_map)
paper_data_df.to_csv(paper_cluster_file_name, sep=',', encoding='utf-8')


In [None]:
# Calculating the pagerank on co-author network
# Load the clustered authors, keep only the columns needed for the graph and
# drop authors whose co-author list is missing or empty.
train = (
    pd.read_csv("../Data/authors_with_clusters_final.csv")
    .drop(columns=['Unnamed: 0', 'num_citations', 'h_index', 'paper_count', 'keywords', 'affiliation', 'citations'])
    .dropna(axis=0, subset=['co_authors'])
    .loc[lambda frame: frame.co_authors != '[]']
    .assign(author_id=lambda frame: pd.to_numeric(frame['author_id']))
)

def change_type(authors_list):
    """Parse a textual id list such as ``"[1, 2, 3]"`` into a list of ints.

    The first and last characters (the brackets) are dropped and the rest is
    split on commas; an empty list ``"[]"`` is not supported.
    """
    inner_text = authors_list[1:-1]
    return [int(token) for token in inner_text.split(',')]
    
# Turn the textual co-author lists into int lists, drop unclustered authors
# (label 9999999) and group the rest by cluster.
train['co_authors'] = train['co_authors'].map(change_type)
train = train.loc[train['cluster_id'] != 9999999]

cluster_grp_data = train.groupby('cluster_id')

In [None]:
#  Plot graph
from matplotlib.pyplot import figure
def show_graph(G):
    """Draw graph ``G`` on a large figure using a Fruchterman-Reingold layout."""
    plt.figure(num=None, figsize=(32, 24), dpi=80, facecolor='w', edgecolor='k')
    layout = nx.fruchterman_reingold_layout(G)
    translucent = {'alpha': 0.4}
    nx.draw_networkx_nodes(G, layout, **translucent)
    nx.draw_networkx_edges(G, layout, width=1, **translucent)
    plt.show()

In [None]:
# Calculating and normalizing the pagerank for co-author network
Graph_list = []
norm_dict = {}       # cluster -> number of nodes in its co-author graph
pagerank_dict = {}   # cluster -> {node: pagerank score}
for cluster, cluster_df in cluster_grp_data:
    G = nx.Graph()
    # Column-wise iteration avoids building a row Series per author (df.iloc[i]).
    for auth, neighbors in zip(cluster_df['author_id'], cluster_df['co_authors']):
        for neighbor in neighbors:
            # An edge seen again (e.g. listed by both authors) gains weight.
            if G.has_edge(auth, neighbor):
                G[auth][neighbor]['weight'] += 1
            else:
                G.add_edge(auth, neighbor, weight=1)
    score = nx.pagerank(G, alpha=0.55, max_iter=100, tol=1.0e-6, nstart=None, weight='weight', dangling=None)
    pagerank_dict[cluster] = score
    norm_dict[cluster] = nx.number_of_nodes(G)
    print(cluster)

# nx.write_gpickle() was removed in NetworkX 3.0; it was a thin wrapper around
# pickle, so dumping directly produces the same kind of file on any version.
# NOTE(review): only the graph of the last cluster is saved, as in the original
# - confirm that this is intended.
with open("../Data/Ranking-of-Academic-Papers-master/data/Graph_author.gpickle", 'wb') as handle:
    pickle.dump(G, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
#  Calculating and normalizing the pagerank for citation network
train_paper = pd.read_csv("../Data/paper_cluster_map.csv")
train_paper = train_paper.drop(columns=['Unnamed: 0'])
train_paper = train_paper.dropna(axis = 0, subset=['references'])

# A list column comes back from CSV as text ("['id1', 'id2']"). Iterating that
# text would add one edge per *character*, so parse it into a real list first.
train_paper['references'] = train_paper['references'].apply(
    lambda refs: ast.literal_eval(refs) if isinstance(refs, str) else refs
)

cluster_grp_data_paper = train_paper.groupby('cluster_id')

norm_dict_paper = {}       # cluster -> number of distinct referenced papers
pagerank_paper_dict = {}   # cluster -> {paper id: pagerank score}
for cluster, cluster_df in cluster_grp_data_paper:
    G1 = nx.DiGraph()
    graph_set = set()
    for paper_id, references in zip(cluster_df['paper_id'], cluster_df['references']):
        for neighbor in references:
            G1.add_edge(paper_id, neighbor)   # citing paper -> cited paper
            graph_set.add(neighbor)
    # weight=None: every edge counts as 1. The original passed weight=1, an
    # attribute name no edge carries, which had the same effect by accident.
    score = nx.pagerank(G1, alpha=0.85, tol=1.0e-6, nstart=None, weight=None, dangling=None)
    pagerank_paper_dict[cluster] = score
    norm_dict_paper[cluster] = len(graph_set)
    print(cluster)

# nx.write_gpickle() was removed in NetworkX 3.0; plain pickle is equivalent.
# NOTE(review): only the last cluster's graph is saved, as in the original.
with open("../Data/Graph_paper.gpickle", 'wb') as handle:
    pickle.dump(G1, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Persist the per-cluster pagerank scores and normalisation sizes.
# (The original dumped `parerank_dict` / `parerank_paper_dict`, names that do
# not exist; the dictionaries built above are `pagerank_dict` and
# `pagerank_paper_dict`.)
with open('../Data/pagerank__authordict.pickle', 'wb') as handle:
        pickle.dump(pagerank_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('../Data/norm_dict_author.pickle', 'wb') as handle:
        pickle.dump(norm_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('../Data/pagerank_paper_dict.pickle', 'wb') as handle:
        pickle.dump(pagerank_paper_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('../Data/norm_dict_paper.pickle', 'wb') as handle:
        pickle.dump(norm_dict_paper, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Load the per-author table and keep only complete rows whose two pagerank
# columns hold a real value (-1 is filtered out).
train = (
    pd.read_csv("../Data/authors_with_page_rank_num_citation.csv")
    .drop(columns=['Unnamed: 0'])
    .dropna()
    .loc[lambda frame: frame.paper_page_rank != -1]
    .loc[lambda frame: frame.author_page_rank != -1]
)


In [None]:
# Normalise both pagerank scores per cluster and combine them into one score.
author_final_columns = ['author_name', 'author_id', 'h_index', 'paper_count', 'cluster_id', 'paper_page_rank', 'author_page_rank', 'num_citations', 'norm_paper_page_rank', 'norm_author_page_rank']
# Group by the column name, not a one-element list: with a list, pandas >= 2
# yields 1-tuples as group keys and the norm_dict lookups below would fail.
train_grp_cluster = train.groupby('cluster_id')

norm_dict_min = min(norm_dict.values())
norm_dict_paper_min = min(norm_dict_paper.values())

# DataFrame.append() was removed in pandas 2.0 (and copied the whole frame on
# every call); collect the per-cluster frames and concatenate them once.
normalized_frames = []
for cluster, df in train_grp_cluster:
    normalized_frames.append(df.assign(
        norm_paper_page_rank=df['paper_page_rank'] * norm_dict_paper[cluster] * 0.85 / 10000,
        norm_author_page_rank=df['author_page_rank'] * norm_dict[cluster] * 0.55 / 10000,
    ))

if normalized_frames:
    author_final = pd.concat(normalized_frames, ignore_index=True)
    # Same column order as before: the declared columns first, extras after.
    extra_columns = [column for column in author_final.columns if column not in author_final_columns]
    author_final = author_final.reindex(columns=author_final_columns + extra_columns)
else:
    author_final = pd.DataFrame(columns=author_final_columns)

author_final['combined_pagerank'] = 0.75*author_final['norm_paper_page_rank'] + 0.25*author_final['norm_author_page_rank']
# `sep` must be passed by keyword on current pandas versions.
author_final.to_csv("../Data/authors_norm_pagerank_final.csv", sep=',')

In [None]:
# `sep` is keyword-only in pandas >= 2.0; passing it positionally raises TypeError.
train = pd.read_csv("../Data/authors_norm_pagerank_final.csv", sep=',')

In [None]:
# Finding a case which represents a failure of h-index
# Candidate "scapegoat": the last author of cluster -1 with 42 < h-index < 45.
# (The original backwards scan stopped before row 0 and left `scapegoat`
# undefined when nothing matched.)
h_index_band = train[
    (train['cluster_id'] == -1) & (train['h_index'] > 42) & (train['h_index'] < 45)
]
if h_index_band.empty:
    raise ValueError("no author in cluster -1 has an h-index strictly between 42 and 45")
scapegoat = h_index_band.iloc[-1]

# Authors of the same cluster with a lower h-index although they published
# more than 1.5 times as many papers as the scapegoat.
more_prolific = train[
    (train['author_id'] != scapegoat['author_id'])
    & (train['cluster_id'] == scapegoat['cluster_id'])
    & (train['h_index'] < scapegoat['h_index'])
    & (train['paper_count'] > 1.5 * scapegoat['paper_count'])
]
compare_list = [author_row for _, author_row in more_prolific.iterrows()]
print(compare_list)

In [None]:
#  Top 100 authors for each cluster
# NOTE(review): the original grouped `train1`, which no visible cell defines;
# `train` (the author table loaded from authors_norm_pagerank_final.csv) is
# presumably what was meant - confirm.
os.makedirs("../Data/top100", exist_ok=True)
train_top_dict = {}   # cluster id -> its top-100 table, kept for later plotting
# Group by column name so the keys are scalars: a one-element list makes
# pandas >= 2 yield 1-tuples, which breaks `-1*cluster` in the file name.
train_grp_cluster = train.groupby('cluster_id')
for cluster, df in train_grp_cluster:
    df = df.sort_values(by=['combined_pagerank'], ascending=False)
    train_top = df.iloc[:100][['author_name', 'h_index', 'combined_pagerank']]
    train_top_dict[cluster] = train_top
    train_top.to_csv("../Data/top100/top100_" +str(-1*cluster) + ".csv", sep=",")

In [None]:
#  Combining paper pagerank, author score and recency into joint ranks:
#  RA (per author) and RP (per paper).
# NOTE(review): the original cell could not run (missing ':' on a `for`,
# `espilon`/`epsilon` mismatch, `RA[author] +=` on an empty dict, `RP`
# overwritten by a scalar, a dict multiplied by a float, and a `while` loop
# whose `delta` never changed). This version keeps the formulas as written and
# makes them executable. The layout of the two CSV files is an assumption
# (columns author_id/papers and paper_id/year/authors, with list cells stored
# as "[...]" text or as one value per row) - confirm against the real files.

def _as_list(cell):
    """Return a CSV cell as a list: parse "[...]" text, wrap a single value."""
    if isinstance(cell, str) and cell.strip().startswith('['):
        return list(ast.literal_eval(cell.strip()))
    if isinstance(cell, (list, tuple, set)):
        return list(cell)
    return [cell]


author_papers = pd.read_csv("../Data/author_papers_map.csv")
author_papers = author_papers.dropna(axis = 0, subset=['papers'])

paper_authors = pd.read_csv("../Data/papers_author_map.csv")
# The original filtered `train_paper` on 'papers' here; the frame that was just
# read is `paper_authors`, and the column used from it below is 'authors'.
paper_authors = paper_authors.dropna(axis = 0, subset=['authors'])

# Recency weight of each paper: exp(-rho * age in years, relative to 2018).
rho = 1
R_time = {}
for paper_id, year in zip(paper_authors['paper_id'], paper_authors['year']):
    R_time[paper_id] = np.exp(-1*rho*(2018 - year))

G_author = nx.Graph()
papers_by_author = {}
for author, papers_cell in zip(author_papers['author_id'], author_papers['papers']):
    G_author.add_edge(author, papers_cell)
    papers_by_author.setdefault(author, []).extend(_as_list(papers_cell))

authors_by_paper = {}
for paper_id, authors_cell in zip(paper_authors['paper_id'], paper_authors['authors']):
    authors_by_paper.setdefault(paper_id, []).extend(_as_list(authors_cell))

# pagerank_paper_dict is keyed by cluster; flatten it to paper id -> score.
paper_pagerank_flat = {}
for cluster_scores in pagerank_paper_dict.values():
    paper_pagerank_flat.update(cluster_scores)

delta = 1.0
epsilon = 1.0e-6
alpha = 0.25
beta = 0.25
gamma = 0.25
theta = 1 - alpha - beta - gamma
RA = {}
RP = {}
# NOTE(review): G and G1 are the graphs left over from the pagerank cells
# (i.e. the last cluster processed there), exactly as in the original.
size = G1.number_of_nodes()
max_passes = 100   # safety cap so the cell always terminates
passes = 0
while delta > epsilon and passes < max_passes:
    passes += 1
    # Author score: sum of the pagerank of the author's papers.
    new_RA = {
        author: sum(paper_pagerank_flat.get(paper, 0.0) for paper in papers_by_author.get(author, []))
        for author in G.nodes()
    }
    # Paper score: weighted mix of own pagerank, authors' scores, recency and
    # a uniform term.
    new_RP = {}
    for paper in G1.nodes():
        temp = sum(new_RA.get(author, 0.0) for author in authors_by_paper.get(paper, []))
        new_RP[paper] = (alpha*paper_pagerank_flat.get(paper, 0.0) + beta*temp
                         + gamma*R_time.get(paper, 0.0) + theta*(1/size))
    # Largest change of any paper score since the previous pass.
    delta = max((abs(score - RP.get(paper, 0.0)) for paper, score in new_RP.items()), default=0.0)
    RA, RP = new_RA, new_RP

with open('../Data/RA.pickle', 'wb') as handle:
        pickle.dump(RA, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('../Data/RP.pickle', 'wb') as handle:
        pickle.dump(RP, handle, protocol=pickle.HIGHEST_PROTOCOL)


In [None]:
#  Reach of a paper
author_pagerank = pd.read_pickle('./Data/author_page_rank.pickle')
paper_pagerank = pd.read_pickle('./Data/paper_page_rank.pickle')

# Flatten {cluster: {author id: score}} into a single author id -> score map.
authorId_pagerank_map = {}
for cid in author_pagerank:
    author_scores = author_pagerank[cid]
    for author_key in author_scores:
        authorId_pagerank_map[author_key] = author_scores[author_key]

# Same for papers; the stored ids are cleaned of surrounding spaces and single
# quotes. The cluster id and its paper count are printed as progress output.
paperId_pagerank_map = {}
for cid in paper_pagerank:
    paper_scores = paper_pagerank[cid]
    print(cid)
    print(len(paper_scores))
    for raw_paper_id in paper_scores:
        clean_paper_id = raw_paper_id.strip(' ').strip("'")
        paperId_pagerank_map[clean_paper_id] = paper_scores[raw_paper_id]


# Keep papers that have references, then work on a copy without that column.
paper_data_df = paper_data_df.dropna(axis=0, subset=["references"])
paper_data_with_pagerank = paper_data_df.drop(columns=['references'])


def add_paper_pagerank(row):
    """Return the paper's pagerank scaled by 10000, or -1 if it has none.

    Scores are looked up by ``row.paper_id`` in the module-level
    ``paperId_pagerank_map``. Unranked papers used to raise KeyError; -1 is
    the value the later loading step filters out (``pagerank != -1``).
    """
    score = paperId_pagerank_map.get(row.paper_id)
    if score is None:
        return -1
    return score * 10000

# Row-wise lookup of each paper's scaled pagerank.
paper_data_with_pagerank['pagerank'] = paper_data_with_pagerank.apply(add_paper_pagerank, axis=1)

def add_authors_pagerank_sum(row):
    """Sum the pagerank of a paper's authors.

    ``row.authors_ids`` may be a real list of ids (frame built in memory) or
    its textual form such as ``"[1, 2]"`` (frame read back from CSV). Ids
    missing from the module-level ``authorId_pagerank_map`` contribute nothing.
    """
    author_ids = row.authors_ids
    if isinstance(author_ids, str):
        # Strip the brackets, split on commas, ignore empty pieces ("[]").
        author_ids = [token for token in author_ids[1:-1].split(',') if token.strip()]

    sum_score = 0
    for aid in author_ids:
        author_id = int(aid)
        if author_id in authorId_pagerank_map:
            sum_score += authorId_pagerank_map[author_id]
    return sum_score
# Add the summed author pagerank per paper and write the table to disk.
paper_data_with_pagerank['author_pagerank'] = paper_data_with_pagerank.apply(add_authors_pagerank_sum, axis=1)

paper_pg_file_name = "paper_pagerank_author_pagerank.csv"
paper_data_with_pagerank.to_csv(paper_pg_file_name, sep=',', encoding='utf-8')


In [None]:
# Load the per-paper pagerank table; keep complete rows with real scores
# (-1 is filtered out of both score columns).
train_paper = (
    pd.read_csv("../Data/paper_pagerank_author_pagerank.csv")
    .dropna()
    .drop(columns=['Unnamed: 0'])
)
has_scores = (train_paper.pagerank != -1) & (train_paper.author_pagerank != -1)
train_paper = train_paper[has_scores]
train_grp_cluster = train.groupby(['cluster_id'])

In [None]:
# Normalise both scores per cluster and combine them into one paper score.
paper_final_columns = ['authors_ids', 'authors_names', 'cluster_id', 'paper_id', 'title','pagerank', 'author_pagerank', 'norm_paper_page_rank', 'norm_author_page_rank']
# Group by the column name: a one-element list makes pandas >= 2 yield
# 1-tuples as keys, which would break the norm_dict lookups below.
train_paper_grp_cluster = train_paper.groupby('cluster_id')

# DataFrame.append() was removed in pandas 2.0; build the pieces and
# concatenate them once instead.
normalized_frames = []
for cluster, df in train_paper_grp_cluster:
    normalized_frames.append(df.assign(
        norm_paper_page_rank=df['pagerank'] * norm_dict_paper[cluster] * 0.85 / 10000,
        norm_author_page_rank=df['author_pagerank'] * norm_dict[cluster] * 0.55 / 10000,
    ))

if normalized_frames:
    paper_final = pd.concat(normalized_frames, ignore_index=True)
    # Declared columns first, any additional columns after them.
    extra_columns = [column for column in paper_final.columns if column not in paper_final_columns]
    paper_final = paper_final.reindex(columns=paper_final_columns + extra_columns)
else:
    paper_final = pd.DataFrame(columns=paper_final_columns)

paper_final['combined_pagerank'] = 0.75*paper_final['norm_paper_page_rank'] + 0.25*paper_final['norm_author_page_rank']
# `sep` must be passed by keyword on current pandas versions.
paper_final.to_csv("../Data/papers_norm_pagerank_final.csv", sep=',')

In [None]:
# `sep` is keyword-only in pandas >= 2.0; passing it positionally raises TypeError.
train2 = pd.read_csv("../Data/papers_norm_pagerank_final.csv", sep=',')

# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open("../Data/paper_citation.pickle", 'rb') as file:
    paper_dict=pickle.load(file)

In [None]:
def find_citations(paper):
    """Return the entry of the module-level ``paper_dict`` for ``paper``, or None if absent."""
    return paper_dict.get(paper)

train2['total_citations'] = train2['paper_id'].apply(find_citations)

#  10 papers with most reach in each cluster
os.makedirs("../Data/top100", exist_ok=True)
# Group by column name so `cluster` is a scalar: with a one-element list,
# pandas >= 2 yields 1-tuples and `-1*cluster` becomes `()`, giving every
# cluster the same file name.
train_paper_grp_cluster = train2.groupby('cluster_id')
for cluster, df in train_paper_grp_cluster:
    df = df.sort_values(by=['combined_pagerank'], ascending=False)
    # .copy() so the rounding below works on an independent frame, not on a
    # slice of `df` (avoids SettingWithCopyWarning).
    train_top = df.iloc[:10][['paper_id', 'title', 'total_citations', 'combined_pagerank']].copy()
    train_top['combined_pagerank'] = train_top['combined_pagerank'].round(3)
    train_top.to_csv("../Data/top100/paper_top100_" +str(-1*cluster) + ".csv", sep=",")
    


In [None]:
# Comparison between h-index and P^2 score

# NOTE(review): the original read `train_top_dict[-5]`, which no visible cell
# defines. The 20 best authors of cluster -5 are taken directly from `train`
# (the author table with `combined_pagerank`) instead - confirm this matches
# the intended data.
top_authors = (
    train[train['cluster_id'] == -5]
    .sort_values(by=['combined_pagerank'], ascending=False)
    .iloc[:20]
)

plt.rcParams.update({'font.size': 30})
fig, ax1 = plt.subplots(figsize=(30,8))
# The label needs both '$' delimiters; with a single '$' matplotlib prints the
# raw text "$p^{2} score" instead of rendering the superscript.
ax1.plot(top_authors['author_name'], top_authors['combined_pagerank'], '-bD', label="$p^{2}$ score", linewidth=2.0)
plt.xticks(rotation=90)
ax2 = ax1.twinx()   # second y-axis: h-index has a different scale
ax2.plot(top_authors['author_name'], top_authors['h_index'], '-rD', label="h-index", linewidth=2.0)
plt.xticks(rotation=90)
plt.xlabel('Authors')
ax1.set_ylabel('$p^{2}$ score', weight="bold")
ax2.set_ylabel('h-index', weight="bold")
ax1.legend()
ax2.legend(loc='best', bbox_to_anchor=(0.5, 0.5, 0.5, 0.38))
plt.title('Comparision between the $p^{2}$ score and h-index for the top 20 authors of domain 5', weight="bold")
plt.savefig('../plots/compare_domain5.png', bbox_inches='tight')
plt.show()


In [None]:
# Bar plot

# Authors per cluster among the overall top 1000, ranked by p^2 score and by
# h-index. Dict keys are the positive cluster numbers (cluster_id * -1).
top_by_p2 = train.sort_values(by=['combined_pagerank'], ascending=False).iloc[:1000]
count_dict_p2 = {
    -int(cluster): int(count)
    for cluster, count in top_by_p2.groupby('cluster_id').size().items()
}
print(count_dict_p2)

top_by_h = train.sort_values(by=['h_index'], ascending=False).iloc[:1000]
count_dict_h = {
    -int(cluster): int(count)
    for cluster, count in top_by_h.groupby('cluster_id').size().items()
}
print(count_dict_h)

# NOTE(review): these two assignments replace computed counts with hard-coded
# numbers, so the figure is not fully derived from the data. Kept to preserve
# the published plot - confirm where 121 and 200 come from, or remove.
count_dict_p2[2]=121
count_dict_p2[3] =200

plt.rcParams.update({'font.size': 30})
bar_width = 0.2
cluster_numbers = list(range(1, 11))
x = np.arange(1,11,1)
# Look the counts up by cluster number. The original relied on dict order
# (`list(values())[::-1][:10]`), which misaligns the bars as soon as a cluster
# is absent from one of the two top-1000 lists; absent clusters now show 0.
p2_counts = [count_dict_p2.get(cluster_number, 0) for cluster_number in cluster_numbers]
h_counts = [count_dict_h.get(cluster_number, 0) for cluster_number in cluster_numbers]

fig, ax = plt.subplots(figsize=(20,8))
rect1 = ax.bar(x - bar_width, p2_counts, width=0.4, color='y', align='center', label='$p^{2}$ score')
rect2 = ax.bar(x + bar_width, h_counts, width=0.4, color='b', align='center', label="h-index")
plt.xlabel('Cluster_Id', weight="bold", fontsize=20)
plt.ylabel('Number of authors', weight="bold",fontsize=20)
plt.title('Comparison between number of author in the top 1000 by the $p^{2}$ score and h-index domain wise', weight="bold", fontsize=20)
ax.legend((rect1[0], rect2[0]), ('$p^{2}$ score', 'h-index'), fontsize=20)
plt.xticks(fontsize=20, weight="bold")
plt.yticks(fontsize=20, weight="bold")
plt.savefig('../plots/top_clusrer_compare.png', bbox_inches='tight')
plt.show()

In [None]:
# Correlation values

# Finding the correlation coefficients
from scipy.stats import pearsonr, spearmanr
# Pearson (linear) and Spearman (rank) correlation between the combined
# pagerank score and the h-index; each line prints r, the p-value and r^2.
# NOTE(review): the original used `train1`, which no visible cell defines;
# `train` (the author table holding both columns) is presumably what was
# meant - confirm.
r1, p_value1 = pearsonr(train['combined_pagerank'], train['h_index'])
r2, p_value2 = spearmanr(train['combined_pagerank'], train['h_index'])
print(r1,p_value1, r1*r1)
print(r2,p_value2, r2*r2)