In [1]:
import pandas as pd
import numpy as np
from IPython.display import display, HTML


In [2]:
dataset = {}
dataset['paper'] = pd.read_csv('dataRev2/Paper.csv').fillna("")
dataset['author'] = pd.read_csv('dataRev2/Author.csv').fillna("")
dataset['conference'] = pd.read_csv('dataRev2/Conference.csv').fillna("")
dataset['journal'] = pd.read_csv('dataRev2/Journal.csv').fillna("")
dataset['paper_author'] = pd.read_csv('dataRev2/PaperAuthor.csv').fillna("")

In [67]:
def parse_paper_ids(paper_ids_string):
    return paper_ids_string.strip().split()

def parse_targetset(targetset):
    pair_list = []
    author_id_list = targetset['AuthorId']

    for i in range(len(author_id_list)):
        author_id = author_id_list[i]
        papers = targetset[targetset.AuthorId == author_id]['PaperIds'].unique()[0]
        papers = parse_paper_ids(papers)
        for j in range(len(papers)):
            paper_id = int(papers[j])
            pair_list.append( (author_id, paper_id) )
    return list(set(pair_list))

def generate_feature_list(author_paper_pairs, ap_to_feature_list):
    result_list = []

    temp_dict = {} # { (author, paper) => [f1, f2 ...] }
    for ap_pair in author_paper_pairs:
        temp_dict[ap_pair] = []

    for i in range(len(ap_to_feature_list)):
        feature_dict = ap_to_feature_list[i]
        for ap_pair in author_paper_pairs:
            feature = feature_dict[ap_pair]
            temp_dict[ap_pair].append(feature)

    for key in temp_dict.keys():
        result_list.append(key + tuple( temp_dict[key] ))

    return result_list

def lcs(X , Y):
    # find the length of the strings
    m = len(X)
    n = len(Y)
 
    # declaring the array for storing the dp values
    L = np.zeros((m+1,n+1))
 
    """Following steps build L[m+1][n+1] in bottom up fashion
    Note: L[i][j] contains length of LCS of X[0..i-1]
    and Y[0..j-1]"""
    for i in range(m+1):
        for j in range(n+1):
            if i == 0 or j == 0 :
                L[i][j] = 0
            elif X[i-1] == Y[j-1]:
                L[i][j] = L[i-1][j-1]+1
            else:
                L[i][j] = max(L[i-1][j] , L[i][j-1])
 
    # L[m][n] contains the length of LCS of X[0..n-1] & Y[0..m-1]
    return L[m][n]

def get_coauthor_aff(pa_data,pid):
    related_authors = pa_data[pa_data['PaperId'] == pid]['Affiliation']
    return related_authors.unique()

def kamil_feature_11(dataset, author_paper_pairs):
    feature_dict = {}
    pa_data = dataset["paper_author"].set_index('AuthorId')
    
    print("Welcome to Kamil's feature!")
    print("AP shape",len(author_paper_pairs))
    print(author_paper_pairs[:2])
    
    for ap in author_paper_pairs:
        #print("PA pair",ap)
        #paper_aff = pa_data.loc[(pa_data.loc[ap[0]]["PaperId"] == ap[1])]["Affiliation"].unique()
        #print("Paper aff--",paper_aff,"--")
        
        ta_aff = " ".join(pa_data.loc[ap[0]]["Affiliation"].unique())
        #print("Target aff--",ta_aff,"--")
        coa_aff = get_coauthor_aff(pa_data, ap[1])
        value = 0
        for i,aff in enumerate(coa_aff):
            if(aff != ""):
         #       print(i,"Co auth aff::",aff,"::")
                value += lcs(aff, ta_aff)
        feature_dict[ap] = value
        print(ap," -> ",value)
    return feature_dict

def get_features(dataset, targetset):
    author_paper_pairs = parse_targetset(targetset)

    # Keep the format of f# (dictionary): { (a1, p1): feature_value1, (a2, p2): feature_value2 ... }
    # Add your features here and add them to feature_list!
    kamil_f1 = kamil_feature_11(dataset, author_paper_pairs)
    kamil_list = [kamil_f1]
    
    thao_list,harry_list = [],[]
    feature_list = harry_list + kamil_list + thao_list

    result_list = generate_feature_list(author_paper_pairs, feature_list)
    return result_list

In [68]:
#sample_l = [(1539933, 1359549), (1455231, 467172), (1215636, 1791266), (205278, 1737961), (433821, 1901940)]
#521630,1 + 972575,1
#kamil_feature_11(dataset,sample_l)
#toy = dataset["paper_author"].set_index('AuthorId')
#toy.loc[972575]["PaperId"] == 1

AuthorId
972575     True
972575    False
972575    False
972575    False
972575    False
972575    False
Name: PaperId, dtype: bool

In [69]:
trainset = pd.read_csv('dataRev2/Train.csv')
train_confirmed = trainset[['AuthorId', 'ConfirmedPaperIds']].rename(columns = {'ConfirmedPaperIds':'PaperIds'})
#train_confirmed.head(5)
features_conf = get_features(dataset, train_confirmed.head(5))


Welcome to Kamil's feature!
AP shape 126
[(826, 118648), (826, 1236661)]


KeyboardInterrupt: 

In [32]:
train_confirmed = train_confirmed.set_index('AuthorId')
train_confirmed.head()

Unnamed: 0_level_0,PaperIds
AuthorId,Unnamed: 1_level_1
826,25733 47276 77012 79468 87141 101385 104556 11...
933,1739240
1118,49963 93433 341015 415282 488635 517119 521922...
2783,154377 212636 334024 350747 696269 704564 1241...
3105,603562 647574 792910 844605 863071 878798 1082...


In [None]:
def find_relation(author_id,paper_id):
    relation_list = pd.DataFrame()
    paper_author = dataset['paper_author']
    pa_only = dataset['paper_author'][['PaperId','AuthorId']]
    #find papers related to the author
    related_papers = pa_only[pa_only.AuthorId == author_id]['PaperId'].unique()
    print("# related papers:",related_papers.shape)
    
    #find authors related to the paper
    related_authors = paper_author[paper_author['PaperId'] == paper_id]['AuthorId'].unique()
    print("# related authors:",related_authors.shape)
    
    #remove target paper and target author from lists
    related_papers = np.delete(related_papers,np.argwhere(related_papers == paper_id))
    related_authors = np.delete(related_authors, np.argwhere(related_authors == author_id))
    
    print("# related papers:",related_papers.shape)
    print("# related authors:",related_authors.shape)
    cnt = 0
    for author in related_authors:
        for paper in related_papers:
            single_relation = pa_only[(pa_only['PaperId'] == paper) & (pa_only['AuthorId'] == author)]
            cnt += single_relation.size
            
    print(author_id,paper_id,"->",cnt)
    
    return cnt


In [None]:
%time find_relation(826,25733)