In [22]:
import pandas as pd
import urllib
import io
from bs4 import BeautifulSoup
import numpy as np

In [16]:
# Download and import data

download = False # If download is true download and save data, else just read data

if(download):
    ## Data set
    data_url = "drop"
    df = pd.read_csv(data_url)
    df.to_csv("data.csv", index = False)
    
    ## categories
    url = "http://export.arxiv.org/oai2?verb=ListSets"
    u = urllib.request.urlopen(url, data = None)
    f = io.TextIOWrapper(u,encoding='utf-8')
    text = f.read()
    soup = BeautifulSoup(text, 'xml')
    all_cat = [sp.text for sp in soup.findAll("setSpec")]

    f = open("all_cat_v01.txt", "w")
    f.write(",".join(all_cat))
    f.close()
else:
    df = pd.read_csv("data.csv")
    catfile = open("all_cat_v01.txt", "r")
    all_cat = catfile.read().split(",")

In [17]:
df.dropna(inplace = True)

In [18]:
df.drop("Unnamed: 0", axis = 1, inplace = True) # Drop the "Unnamed: 0" | this was from index
df.head()

Unnamed: 0,doi,date,title,authors,category
0,oai:arXiv.org:0704.0002,2007-03-30,Sparsity-certifying Graph Decompositions,Streinu Ileana;Theran Louis,cs
1,oai:arXiv.org:0704.0046,2007-04-01,A limit relation for entropy and channel capac...,Csiszar I.;Hiai F.;Petz D.,cs
2,oai:arXiv.org:0704.0047,2007-04-01,Intelligent location of simultaneously active ...,Kosel T.;Grabec I.,cs
3,oai:arXiv.org:0704.0050,2007-04-01,Intelligent location of simultaneously active ...,Kosel T.;Grabec I.,cs
4,oai:arXiv.org:0704.0062,2007-03-31,On-line Viterbi Algorithm and Its Relationship...,Šrámek Rastislav;Brejová Broňa;Vinař Tomáš,cs


In [26]:
# Collecting all the aurhors
au_lst = []
for paper_authors in df["authors"].values:
    for author in paper_authors.split(";"):
        au_lst.append(author)
        
# Get all the unique authors       
au_lst = list(set(au_lst))
au_lst.sort()

au_dict = {author:index for (index, author) in enumerate(au_lst)}
cat_dict = {cat:index for (index, cat) in enumerate(all_cat)}

# Creating the matrix
n = len(au_dict)
p = len(all_cat)
credit_matrix = np.zeros((n, p))

In [42]:
for index, row in df[["authors", "category"]].iterrows():
    
    author_list = row["authors"].split(";")
    contribute = 1.0/len(author_list)
    
    for author in author_list:
        try:
            credit_matrix[ au_dict[author], cat_dict[row["category"]] ] += contribute
        except KeyError as e:
            print(e)

# Calculating stuff
author_activity = credit_matrix / credit_matrix.sum(axis=1, keepdims=True)
author_weight_in_field = credit_matrix / credit_matrix.sum(axis=0, keepdims=True)
field_field_influence = np.transpose(author_activity).dot(author_weight_in_field)

proj1_df = pd.DataFrame(field_field_influence, columns = all_cat, index=all_cat)
proj1_df

  


Unnamed: 0,cs,econ,eess,math,physics,physics:astro-ph,physics:cond-mat,physics:gr-qc,physics:hep-ex,physics:hep-lat,...,physics:hep-th,physics:math-ph,physics:nlin,physics:nucl-ex,physics:nucl-th,physics:physics,physics:quant-ph,q-bio,q-fin,stat
cs,0.651584,0.047335,0.317491,0.045869,,0.00369485,0.009858,0.002849,0.004423,0.00348,...,0.002271,0.007294,0.030836,0.003093,0.002507,0.037569,0.02248,0.074944,0.047476,0.172155
econ,4.2e-05,0.326868,8.2e-05,5.1e-05,,3.298218e-07,1e-06,0.0,0.0,0.0,...,7e-06,1e-06,1e-05,0.0,0.0,9e-06,2e-06,5e-06,0.000232,0.001217
eess,0.000894,0.000261,0.331593,0.000116,,9.872947e-06,4.2e-05,4e-06,8e-06,3e-06,...,6e-06,1.6e-05,0.000117,2e-05,9e-06,0.000162,4.4e-05,0.000163,0.000132,0.000592
math,0.214964,0.267393,0.19315,0.819245,,0.01035818,0.040253,0.068572,0.008606,0.022474,...,0.11378,0.525138,0.217989,0.008413,0.014755,0.059113,0.094873,0.108034,0.258144,0.311399
physics,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
physics:astro-ph,0.005693,0.000572,0.005401,0.003405,,0.8081778,0.009184,0.150877,0.043321,0.010253,...,0.051997,0.008625,0.017397,0.04045,0.045354,0.049865,0.009356,0.007489,0.004893,0.008808
physics:cond-mat,0.015854,0.002056,0.023857,0.013813,,0.009585331,0.729312,0.017827,0.009375,0.061746,...,0.044935,0.051607,0.132771,0.019293,0.037222,0.141759,0.145637,0.151131,0.082522,0.012832
physics:gr-qc,0.001314,0.0,0.000602,0.006746,,0.04515004,0.005111,0.414458,0.004839,0.01127,...,0.114107,0.02943,0.00828,0.002684,0.0068,0.016139,0.023428,0.002872,0.003021,0.001287
physics:hep-ex,0.001032,0.0,0.000696,0.000429,,0.006562955,0.001361,0.00245,0.515325,0.01541,...,0.004482,0.000735,0.00068,0.099705,0.0244,0.018222,0.001522,0.00064,0.000926,0.00113
physics:hep-lat,0.000489,0.0,0.000166,0.000674,,0.0009354681,0.005398,0.003436,0.009281,0.468093,...,0.019784,0.003113,0.001545,0.009741,0.029398,0.001814,0.002572,0.001559,0.001616,0.00032


In [None]:
# # Creating the matrix
# n = len(au_dict)
# p = len(all_cat)
# credit_matrix = np.zeros((n, p))

# for index, row in df[["Authors", "Category"]].iterrows():
#     author_list = row["Authors"].split(";")
#     contribute = 1.0/len(au_lst)
#     for author in author_list:
#         try:
#             credit_matrix[ au_dict[author], cat_dict[row["Category"]] ] += contribute
#         except KeyError as e:
#             print(e)

# # Calculating stuff
# author_activity = credit_matrix / credit_matrix.sum(axis=1, keepdims=True)
# author_weight_in_field = credit_matrix / credit_matrix.sum(axis=0, keepdims=True)
# field_field_influence = np.transpose(author_activity).dot(author_weight_in_field)

# proj1_df = pd.DataFrame(field_field_influence, columns = all_cat, index=all_cat)
# proj1_df