In [1]:
import pickle
import data_io
import pandas as pd
import numpy as np
from feature_set import *
from collections import defaultdict
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
# English stop-word set and Porter stemmer shared by the text helpers below.
stopword = set(stopwords.words('english'))
porter = PorterStemmer()

#from Thao_features import *
# Number of Paper.csv rows loaded while prototyping.
PAPER_SAMPLE_ROWS = 2000

dataset = {}
# nrows stops parsing after the sample instead of reading the whole file and
# slicing the first rows off afterwards.
dataset['paper'] = pd.read_csv('dataRev2/Paper.csv', nrows=PAPER_SAMPLE_ROWS)
trainset = pd.read_csv('dataRev2/Train.csv')
# Keep the author id and confirmed-paper columns; the latter is renamed to
# 'PaperIds', the column name parse_targetset reads.
train_confirmed = trainset[['AuthorId', 'ConfirmedPaperIds']].rename(columns = {'ConfirmedPaperIds':'PaperIds'})
targetset = train_confirmed
dataset['paper_author'] = pd.read_csv('dataRev2/PaperAuthor1.csv')



In [2]:
#for keyword of Papers
def filter_keyword(text):
    """Turn a paper's raw keyword string into a list of stemmed tokens.

    Every punctuation character is replaced by a space, the text is tokenised,
    non-alphabetic tokens are dropped, the remaining tokens are lower-cased,
    the literal token "keywords" is discarded and each survivor is stemmed.

    Args:
        text: raw keyword string.

    Returns:
        List of stemmed tokens, in order of appearance.
    """
    # One translation table does the same job as replacing each punctuation
    # character with a space in turn.
    punctuation_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    cleaned = text.translate(punctuation_to_space)

    stems = []
    for token in word_tokenize(cleaned):
        if not token.isalpha():
            continue
        lowered = token.lower()
        if lowered == "keywords":
            continue
        stems.append(porter.stem(lowered))
    return stems

# for title of papers
def tokenize(text):
    """Turn a paper title into a list of stemmed, stop-word-free tokens.

    The text is tokenised, non-alphabetic tokens are dropped, the rest are
    lower-cased, tokens found in the module-level ``stopword`` set are removed
    and the survivors are stemmed.

    Args:
        text: raw title string.

    Returns:
        List of stemmed tokens, in order of appearance.
    """
    stems = []
    for token in word_tokenize(text):
        if not token.isalpha():
            continue
        lowered = token.lower()
        if lowered in stopword:
            continue
        stems.append(porter.stem(lowered))
    return stems

#return the keywords of each paper
def paper_keywords(data, min_df=5):
    """Map every paper id to the frequent stemmed terms of its title and keywords.

    The 'Keyword' column is normalised with ``filter_keyword`` and the 'Title'
    column with ``tokenize``; the two token lists are merged and de-duplicated
    per paper. A term is kept only if it is in the vocabulary that
    ``CountVectorizer(min_df=min_df)`` learns from those merged documents,
    i.e. it occurs in at least ``min_df`` papers.

    Args:
        data: dict whose 'paper' entry is a DataFrame with 'Id', 'Title' and
            'Keyword' columns.
        min_df: minimum number of papers a term must appear in to be kept.

    Returns:
        defaultdict(list) mapping paper id -> list of retained terms.
    """
    # set_index returns a new frame, so the caller's DataFrame is not modified.
    paper = data['paper'].set_index("Id")
    keyword_stems = paper['Keyword'].fillna("").map(filter_keyword)
    title_stems = paper['Title'].fillna("").map(tokenize)

    # Unique terms per paper; sorted so the result does not depend on set
    # iteration order.
    merged_terms = [sorted(set(kw) | set(tok)) for kw, tok in zip(keyword_stems, title_stems)]
    documents = [' '.join(terms) for terms in merged_terms]

    # Only the learned vocabulary is needed, so fit without materialising the
    # dense document-term matrix.
    count = CountVectorizer(min_df=min_df)
    count.fit(documents)
    vocab = set(count.vocabulary_)  # set gives O(1) membership tests

    paper_keyword = defaultdict(list)
    for paper_id, terms in zip(paper.index, merged_terms):
        paper_keyword[paper_id] = [term for term in terms if term in vocab]
    return paper_keyword

#how similar two documents are based on keywords
def common_word(word1, word2):
    """Count how many items of ``word1`` also occur in ``word2``.

    Repeated items in ``word1`` are counted once per occurrence.

    Args:
        word1: iterable of words to look up.
        word2: container tested with ``in`` for each word.

    Returns:
        Number of matches as an int; 0 when there are none.
    """
    # The counter must start at 0: it was previously never initialised, so the
    # function raised UnboundLocalError instead of returning a count.
    sim = 0
    for word in word1:
        if word in word2:
            sim += 1
    return sim


def target_paper_and_papers_of_target_author_by_keywords(dataset,author_paper_pairs):
    """Score each (author, paper) pair by keyword overlap with the author's papers.

    For every pair, the target paper's keyword list is compared with
    ``common_word`` against the keyword list of each paper recorded for that
    author in ``dataset['paper_author']``; the per-paper counts are summed.

    Args:
        dataset: dict with a 'paper' DataFrame (consumed by ``paper_keywords``)
            and a 'paper_author' DataFrame with 'AuthorId' and 'PaperId'.
        author_paper_pairs: iterable of (author_id, paper_id) tuples.

    Returns:
        defaultdict(int) mapping each pair to its summed overlap count.
    """
    paper_sim = defaultdict(int)
    paper_author = dataset['paper_author']
    keyword = paper_keywords(dataset)
    pairs = list(author_paper_pairs)

    # Build the author -> papers index once instead of re-scanning the whole
    # paper_author frame with a boolean mask for every pair.
    wanted_authors = list({pair[0] for pair in pairs})
    rows = paper_author.loc[paper_author["AuthorId"].isin(wanted_authors), ["AuthorId", "PaperId"]]
    papers_by_author = defaultdict(list)
    for author_id, paper_id in zip(rows["AuthorId"], rows["PaperId"]):
        papers_by_author[author_id].append(paper_id)

    for pair in pairs:
        target_words = keyword[pair[1]]
        # NOTE(review): when the target paper is itself listed for the author
        # it is compared with itself — confirm that this is intended.
        paper_sim[pair] = sum(
            common_word(target_words, keyword[paper_id])
            for paper_id in papers_by_author.get(pair[0], ())
        )
    return paper_sim

def parse_paper_ids(paper_ids_string):
    """Split a whitespace-separated id string into a list of id strings.

    Leading and trailing whitespace is ignored; an empty or blank string
    yields an empty list.
    """
    trimmed = paper_ids_string.strip()
    id_tokens = trimmed.split()
    return id_tokens

def parse_targetset(targetset):
    """Expand the target table into unique (author_id, paper_id) pairs.

    Each row carries an author id and a whitespace-separated string of paper
    ids. If an author occurs in several rows, only the first row's paper
    string is used. Rows whose 'PaperIds' value is missing (not a string)
    contribute no pairs.

    Args:
        targetset: DataFrame with 'AuthorId' and 'PaperIds' columns; any index
            is accepted.

    Returns:
        List of unique (author_id, int paper_id) tuples, in no particular
        order.
    """
    seen_authors = set()
    pairs = set()
    # Iterate positionally over the two columns: label lookups such as
    # author_ids[i] fail on frames whose index is not 0..n-1, and re-filtering
    # the frame for every row is quadratic.
    for author_id, papers in zip(targetset['AuthorId'], targetset['PaperIds']):
        if author_id in seen_authors:
            continue
        seen_authors.add(author_id)
        if not isinstance(papers, str):
            continue  # missing value (NaN): nothing to expand
        # Same whitespace split that parse_paper_ids performs.
        for paper in papers.split():
            pairs.add((author_id, int(paper)))
    return list(pairs)

In [3]:
# Expand the target table into a list of unique (author_id, paper_id) tuples.
author_paper_pairs = parse_targetset(targetset)


In [4]:
# Compute the keyword-overlap feature for the first 7 pairs only.
thao_f3 = target_paper_and_papers_of_target_author_by_keywords(dataset,author_paper_pairs[:7])

In [5]:
# Display the pair -> score mapping computed in the previous cell.
thao_f3

defaultdict(int,
            {(205278, 1737961): 0,
             (433821, 1901940): 0,
             (1215636, 1791266): 0,
             (1455231, 467172): 0,
             (1539933, 1359549): 0,
             (1589984, 2059890): 0,
             (1794805, 1802876): 0})

In [26]:
def process_aff(text):
    """Normalise an affiliation string into a list of stemmed, informative tokens.

    Punctuation is replaced by spaces, the text is tokenised, non-alphabetic
    tokens are dropped, the rest are lower-cased, English stop words and a
    fixed list of generic institutional terms are removed, and the survivors
    are stemmed.

    Args:
        text: raw affiliation string.

    Returns:
        List of stemmed tokens, in order of appearance.
    """
    # Generic institutional words that carry no identifying information.
    # "faculty" is added next to the original misspelling "falcuty", which
    # never matched any real token.
    generic_terms = {"institute", "univ", "university", "college", "department",
                     "science", "technology", "de", "engineer", "lab", "dept",
                     "falcuty", "faculty"}

    for mark in string.punctuation:
        text = text.replace(mark, ' ')

    kept = []
    for token in word_tokenize(text):
        if not token.isalpha():
            continue
        word = token.lower()
        # Test against the stop-word set directly; converting it to a list for
        # every token made each lookup a linear scan.
        if word in stopword or word in generic_terms:
            continue
        kept.append(porter.stem(word))
    return kept
def author_affiliation(data):
    """Map every author id to the processed tokens of all their affiliations.

    All affiliation strings recorded for an author in ``data['paper_author']``
    are joined with a single space and passed through ``process_aff``.

    Args:
        data: dict whose 'paper_author' entry is a DataFrame with 'AuthorId'
            and 'Affiliation' columns. The DataFrame is not modified.

    Returns:
        defaultdict(str) mapping author id -> list of affiliation tokens.
        Note that the default factory is ``str``, so an unknown author yields
        '' rather than an empty list.
    """
    pa = data["paper_author"]
    affiliation = defaultdict(str)

    # Fill missing values on a derived Series so the caller's frame is left
    # untouched, and join with a space: summing the strings concatenated them
    # with no separator, fusing the last word of one row with the first word
    # of the next.
    combined = (
        pa["Affiliation"].fillna("").astype(str)
        .groupby(pa["AuthorId"])
        .agg(" ".join)
    )
    for author_id, text in combined.items():
        affiliation[author_id] = process_aff(text)
    return affiliation

# more efficient approach 
def target_author_and_coauthor_of_target_paper_by_affiliation(dataset,author_paper_pairs):
    """Score each (author, paper) pair by affiliation overlap with the paper's authors.

    For every pair, the target author's affiliation tokens are compared with
    ``common_word`` against the tokens of each author recorded for the target
    paper in ``dataset['paper_author']``; the per-author counts are summed.

    Args:
        dataset: dict whose 'paper_author' entry is a DataFrame with
            'AuthorId', 'PaperId' and 'Affiliation' columns.
        author_paper_pairs: iterable of (author_id, paper_id) tuples.

    Returns:
        defaultdict(int) mapping each pair to its summed overlap count.
    """
    pa = dataset["paper_author"]
    aff = author_affiliation(dataset)
    author_sim = defaultdict(int)
    pairs = list(author_paper_pairs)

    # Build the paper -> authors index once instead of re-scanning the whole
    # paper_author frame with a boolean mask for every pair.
    wanted_papers = list({pair[1] for pair in pairs})
    rows = pa.loc[pa["PaperId"].isin(wanted_papers), ["PaperId", "AuthorId"]]
    authors_by_paper = defaultdict(list)
    for paper_id, author_id in zip(rows["PaperId"], rows["AuthorId"]):
        authors_by_paper[paper_id].append(author_id)

    for pair in pairs:
        target_tokens = aff[pair[0]]
        # NOTE(review): the target author is normally among the paper's own
        # authors and is therefore compared with itself — confirm intended.
        author_sim[pair] = sum(
            common_word(target_tokens, aff[coauthor])
            for coauthor in authors_by_paper.get(pair[1], ())
        )
    return author_sim


In [29]:
# Build the author -> affiliation-token mapping and display all of it.
author_affiliation(dataset)

defaultdict(str,
            {1229483: [],
             188421: ['programa',
              'de',
              'engenharia',
              'de',
              'sistema',
              'e',
              'copp',
              'universidad',
              'feder',
              'rio',
              'de',
              'janeiro',
              'brazil'],
             1245191: ['microsoft', 'research'],
             290824: [],
             1105929: ['faculti',
              'pharmaci',
              'tehran',
              'medic',
              'scienc',
              'tehran',
              'iran'],
             122892: [],
             1466383: [],
             757776: [],
             1024537: ['physic',
              'california',
              'lo',
              'angel',
              'hilgard',
              'ave',
              'lo',
              'angel',
              'ca',
              'usa'],
             2261013: [],
             1835034: ['dept', 'comput', 'columbia', 'usa

In [27]:
# Scratch cell: runs the same steps as the body of author_affiliation at cell
# level, leaving `affiliation`, `pa`, `pa_1` and `author` as notebook globals.
pa = dataset["paper_author"]
if True:
    affiliation = defaultdict(str)
    
    # Writes the filled column back into dataset["paper_author"] in place.
    pa['Affiliation'] = pa['Affiliation'].fillna("")
    pa_1 = pd.DataFrame(pd.pivot_table(pa, values = "Affiliation",index = ["AuthorId"], aggfunc = "sum"))
    author = list(pa_1.index)
    for i in author:
        #print (type(pa_1.loc[i,"Affiliation"]))
        # NOTE(review): `process` is not defined in the visible cells —
        # presumably an earlier name for process_aff or something supplied by
        # `from feature_set import *`; confirm it exists on a fresh kernel.
        affiliation[i]= process(str(pa_1.loc[i,"Affiliation"]))


In [28]:
# Display the whole mapping built by the scratch cell above.
affiliation

defaultdict(str,
            {1229483: [],
             188421: ['programa',
              'de',
              'engenharia',
              'de',
              'sistema',
              'e',
              'copp',
              'universidad',
              'feder',
              'rio',
              'de',
              'janeiro',
              'brazil'],
             1245191: ['microsoft', 'research'],
             290824: [],
             1105929: ['faculti',
              'pharmaci',
              'tehran',
              'medic',
              'scienc',
              'tehran',
              'iran'],
             122892: [],
             1466383: [],
             757776: [],
             1024537: ['physic',
              'california',
              'lo',
              'angel',
              'hilgard',
              'ave',
              'lo',
              'angel',
              'ca',
              'usa'],
             2261013: [],
             1835034: ['dept', 'comput', 'columbia', 'usa