In [3]:
import pandas as pd
import numpy as np
import json
import math
from collections import Counter
from nltk import ngrams
from copy import deepcopy
import sklearn
import ast
import gensim
from tensorflow import keras
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from scipy import spatial
import operator
from google.colab import drive
# Mount Google Drive (Colab only) so the /content/drive/MyDrive paths read by the cells below resolve.
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
#Load the training and test sets from the mounted Drive folder
train_path = '/content/drive/MyDrive/sml-authorship-attribution/Data/train.json'
test_path = '/content/drive/MyDrive/sml-authorship-attribution/Data/test.json'
train = pd.read_json(train_path)
test = pd.read_json(test_path)

In [5]:
#Stopwords are the tokens that occur more than 20000 times across all training abstracts
#(the threshold was arrived at by experimentation)
abstract_words = Counter(
    token for abstract in train['abstract'].values for token in abstract
).most_common()
abstract_stop_words = [token for token, count in abstract_words if count > 20000]
len(abstract_stop_words)

23

In [6]:
#Keep the original index as `id`, remember each paper's full author list, then
#explode so that every row holds exactly one author in `authors`
train = train.assign(id=train.index, coauthors=train['authors']).explode('authors')
#Co-authors of a row = the paper's full author list minus the row's own author
train['coauthors'] = [
    set(all_authors) - {author}
    for all_authors, author in zip(train['coauthors'], train['authors'])
]
#Keep only the rows whose author id is below 100
train = train[train['authors'] < 100]

In [7]:
#(rows, columns) left after exploding by author and keeping author ids below 100
train.shape

(8938, 7)

In [8]:
#Function to generate stopword graphs given a set of abstracts
def get_stopword_graph(texts, stop_words=None):
    """Build a weighted stopword proximity graph from tokenised texts.

    For every occurrence of a stopword ``i`` in a text and every *other*
    stopword ``j`` present in the same text, ``exp(-d)`` is added to
    ``graph[i][j]``, where ``d`` is the distance (in tokens) from that
    occurrence of ``i`` to the nearest occurrence of ``j``. Contributions are
    accumulated over all texts. The graph is then made symmetric and padded
    with zeros so that ``graph[i][j]`` exists for every pair of stopwords,
    including ``i == j``.

    Parameters
    ----------
    texts : iterable of sequences
        Tokenised texts. Tokens must be hashable.
    stop_words : iterable, optional
        The stopword vocabulary, in the order that decides which direction
        wins during symmetrisation. Duplicates are ignored. Defaults to the
        notebook-level ``abstract_stop_words``.

    Returns
    -------
    dict
        ``{stopword: {stopword: weight}}``; accumulated weights are numpy
        floats, absent edges are the integer ``0``.
    """
    if stop_words is None:
        stop_words = abstract_stop_words
    # Preserve order (it drives key insertion order below) but drop duplicates.
    stop_words = list(dict.fromkeys(stop_words))
    stop_word_set = set(stop_words)

    graph = {}
    for text in texts:
        # One pass over the text instead of one scan per (i, occurrence, j):
        # record the positions of every stopword that appears in it.
        positions = {}
        for pos, token in enumerate(text):
            if token in stop_word_set:
                positions.setdefault(token, []).append(pos)

        present = [word for word in stop_words if word in positions]
        for i in present:
            row = graph.setdefault(i, {})
            for j in present:
                if j == i:
                    continue
                j_positions = positions[j]
                for ind in positions[i]:
                    nearest = min(abs(pos - ind) for pos in j_positions)
                    weight = np.exp(-nearest)
                    if j in row:
                        row[j] += weight
                    else:
                        row[j] = weight

    # Symmetrise. Whenever graph[i][j] exists, graph[j][i] exists too (both
    # words occurred in the same text), so no key is ever added here.
    # NOTE(review): before this step graph[i][j] and graph[j][i] generally
    # differ; the value of whichever node was inserted first is kept for both
    # directions. Preserved as-is to keep existing results -- confirm that this
    # is intended rather than e.g. averaging the two directions.
    for i in graph:
        for j in graph[i]:
            graph[j][i] = graph[i][j]

    # Pad with explicit zeros so every stopword pair has an entry.
    for i in stop_words:
        row = graph.setdefault(i, {})
        for j in stop_words:
            row.setdefault(j, 0)
    return graph
#Normalise the weights of the graph
def normalise_graph(graph):
    """Scale every node's outgoing weights so that they sum to 1, in place.

    Rows whose weights sum to 0 are left untouched. The graph passed in is
    modified and the same object is returned.
    """
    for weights in graph.values():
        total = 0
        for weight in weights.values():
            total += weight
        if total == 0:
            continue
        for neighbour in weights:
            weights[neighbour] = weights[neighbour] / total
    return graph
#Calculate the symmetrised Kullback-Leibler divergence of two weight vectors (dicts with the same keys)
def kl_div(p,q):
    """Return the symmetrised KL divergence of two weight dicts.

    The result is the mean of sum(p*log(p/q)) and sum(q*log(q/p)), taken over
    the keys of ``p`` for which both ``p`` and ``q`` are non-zero. ``q`` is
    only looked up for keys where ``p`` is non-zero.
    """
    forward = 0
    backward = 0
    for key, p_val in p.items():
        if p_val == 0:
            continue
        q_val = q[key]
        if q_val == 0:
            continue
        forward += p_val * math.log(p_val / q_val)
        backward += q_val * math.log(q_val / p_val)
    return (forward + backward) / 2
#Calculate the Kullback-Leibler divergence of two graphs
def get_KL_divergence(G1,G2):
    """Return the summed symmetrised KL divergence between two stopword graphs.

    Edges that are zero in ``G2`` are zeroed (in both directions) in a copy of
    ``G1``, so that only edges present in ``G2`` are compared. Both graphs are
    then row-normalised and ``kl_div`` is summed over the nodes of ``G2``.

    Neither argument is modified. ``G1`` is expected to contain every node of
    ``G2``, and each row of ``G2`` an entry for every node of ``G2``.

    Parameters
    ----------
    G1, G2 : dict
        Graphs of the form ``{node: {node: weight}}``.

    Returns
    -------
    number
        Sum over the nodes of ``G2`` of ``kl_div(G1_row, G2_row)``.
    """
    G1_masked = deepcopy(G1)
    # normalise_graph rewrites its argument in place; normalise a copy so the
    # caller's G2 (reused for one comparison per candidate author) is left
    # untouched and every comparison sees the same distribution.
    G2_normalised = deepcopy(G2)
    for i in G2:
        for j in G2:
            if G2[i][j] == 0:
                G1_masked[i][j] = 0
                G1_masked[j][i] = 0
    normalise_graph(G1_masked)
    normalise_graph(G2_normalised)
    kl_divergence = 0
    for i in G2_normalised:
        kl_divergence += kl_div(G1_masked[i], G2_normalised[i])
    return kl_divergence

In [9]:
#Build one stopword graph per author. With is_train=True only a seeded 80% sample of
#each (author, year) group is used; otherwise every row of the author is used.
is_train = False
graph_papers = train.groupby(['authors','year']).sample(frac=0.8, random_state=1) if is_train else train
author_df = (
    graph_papers
    .groupby('authors')
    .agg(stopword_graph=('abstract', get_stopword_graph))
    .reset_index()
)

In [11]:
#Import Search space generated by the coauthor graph
test_auth = pd.read_csv('/content/drive/MyDrive/sml-authorship-attribution/test_probable_authors_df_2_test.csv')
candidate_columns = test_auth[['identifier','probable_auth']]
tmp_test = test.merge(candidate_columns, on=['identifier'], how='left')
#Papers with no match in the search-space file get an empty candidate list; the
#remaining cells hold a Python literal that is parsed into a list
tmp_test['probable_auth'] = (
    tmp_test['probable_auth']
    .fillna('[]')
    .apply(lambda literal: list(ast.literal_eval(literal)))
)

In [12]:
#For each paper in the test set, rank the authors in its search space by the divergence
#between the author's stopword graph and the paper's, and keep the closest ones
TOP_N_AUTHORS = 3  #number of authors written to the submission per paper

#Look every author's graph up once instead of filtering author_df per (paper, author)
author_graphs = dict(zip(author_df['authors'], author_df['stopword_graph']))
all_authors = list(author_graphs)

res = []
for i, (probable_authors, abstract) in enumerate(zip(tmp_test['probable_auth'], tmp_test['abstract'])):
    #Only authors with a graph can be scored (train keeps author ids below 100, so
    #e.g. id 100 never has one); unknown ids are skipped instead of raising
    potential_authors = [author for author in probable_authors if author in author_graphs]
    if len(potential_authors) == 0:
        #Empty search space: fall back to every author that has a graph
        potential_authors = all_authors
    paper_graph = get_stopword_graph([abstract])
    divergences = {
        author: get_KL_divergence(author_graphs[author], paper_graph)
        for author in potential_authors
    }
    #Smallest divergence first; sorted() is stable, so ties keep candidate order
    ranked = sorted(divergences.items(), key=operator.itemgetter(1))
    #NOTE(review): the row position is used as the submission ID; this equals the
    #paper's `identifier` only if test identifiers are 0..n-1 in row order -- confirm
    res.append({'identifier': i, 'authors': [author for author, _ in ranked[:TOP_N_AUTHORS]]})

res_df = pd.DataFrame(data=res, columns=['identifier','authors'])
res_df['Predict'] = [' '.join(map(str, authors)) for authors in res_df['authors']]
res_df['ID'] = res_df['identifier']
res_df[['ID','Predict']].to_csv('submission_classifier_1_hop.csv', index=False)