In [108]:
import networkx as nx
import numpy as np
import pandas as pd
import itertools

import matplotlib.pyplot as plt

In [156]:
cat_path  = "cat.csv"
data_path = "data.csv"

all_cat = open(cat_path, 'r').read().split(",")
all_cat.sort()
# Category dictionary
cat_dict = {cat:index for (index, cat) in enumerate(all_cat)}

df = pd.read_csv(data_path)

# delete the NAs
df.dropna(inplace = True)

# delete the duplicated paper
df.drop_duplicates(subset = "paper ID", inplace = True)

In [157]:
df.head()

Unnamed: 0,paper ID,Title,Authors,Category
0,http://arxiv.org/abs/0704.1711v2,"Dynamical Equilibrium, trajectories study in a...",Patrick Letrémy;Marie Cottrell;Patrice Gaubert...,stat
1,http://arxiv.org/abs/0704.3474v1,Missing Data: A Comparison of Neural Network a...,Fulufhelo V. Nelwamondo;Shakir Mohamed;Tshilid...,stat
2,http://arxiv.org/abs/0704.3862v1,An Integrated Human-Computer System for Contro...,Tshilidzi Marwala;Monica Lagazio;Thando Tettey,stat
3,http://arxiv.org/abs/0706.0073v1,Modeling Hourly Ozone Concentration Fields,Yiping Dou;Nhu D Le;James V Zidek,stat
4,http://arxiv.org/abs/0706.1401v1,Controlling for individual heterogeneity in lo...,J. R. Lockwood;Daniel F. McCaffrey,stat


In [158]:
# Collecting all the aurhors
au_lst = []
for paper_authors in df["Authors"].values:
    for author in paper_authors.split(";"):
        au_lst.append(author)
        
# Get all the unique authors       
au_lst = list(set(au_lst))
au_lst.sort()

au_dict = {author:index for (index, author) in enumerate(au_lst)}

In [159]:
# Creating the matrix
n = len(au_dict)
p = len(all_cat)
credit_matrix = np.zeros((n, p))

for index, row in df[["Authors", "Category"]].iterrows():
    author_list = row["Authors"].split(";")
    contribute = 1.0/len(au_lst)
    for author in author_list:
        credit_matrix[ au_dict[author], cat_dict[row["Category"]] ] += contribute

# Calculating stuff
author_activity = credit_matrix / credit_matrix.sum(axis=1, keepdims=True)
author_weight_in_field = credit_matrix / credit_matrix.sum(axis=0, keepdims=True)
field_field_influence = np.transpose(author_activity).dot(author_weight_in_field)

proj1_df = pd.DataFrame(field_field_influence, columns = all_cat, index=all_cat)
proj1_df

Unnamed: 0,astro-ph,cond-mat,cs,econ,eess,gr-qc,hep-ex,hep-lat,hep-ph,hep-th,math,math-ph,nlin,nucl-ex,nucl-th,physics,q-bio,q-fin,quant-ph,stat
astro-ph,0.961905,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001493,0.0,0.0,0.0,0.0
cond-mat,0.0,0.992021,0.000628,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001866,0.0,0.0,0.0,0.0
cs,0.0,0.00266,0.99196,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.008033,0.0,0.004348,0.0,0.0,0.0,0.00223,0.0,0.0,0.004762
econ,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
eess,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
gr-qc,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hep-ex,0.0,0.0,0.0,0.0,0.0,0.0,0.958333,0.0,0.0,0.0,0.0,0.0,0.0,0.002747,0.0,0.0,0.0,0.0,0.0,0.0
hep-lat,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hep-ph,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hep-th,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.966667,0.000949,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
