In [21]:
import pickle
import data_io
import pandas as pd
import numpy as np
from feature_set import *
from collections import defaultdict
import string
import time 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
stopword = set(stopwords.words('english'))
porter = PorterStemmer()

#from Thao_features import *
dataset = {}
dataset['paper'] = pd.read_csv('dataRev2/Paper.csv')[:2000]
trainset = pd.read_csv('dataRev2/Train.csv')
train_confirmed = trainset[['AuthorId', 'ConfirmedPaperIds']].rename(columns = {'ConfirmedPaperIds':'PaperIds'})
targetset = train_confirmed
dataset['paper_author'] = pd.read_csv('dataRev2/PaperAuthor1.csv')



In [27]:
#for keyword of Papers
def filter_keyword(text):
    for i in string.punctuation: text = text.replace(i,' ')
    words = word_tokenize(text) #split words
    words = [w.lower() for w in words if w.isalpha()] #get rid of punctuation
    words = [w for w in words if w not in  ["keywords"] ]
    stemmed = [porter.stem(w) for w in words]
    return stemmed

# for title of papers
def tokenize(text):
    words = word_tokenize(text) #split words
    words = [w.lower() for w in words if w.isalpha()] #get rid of punctuation
    words =[w for w in words if  not w in stopword]
    stemmed = [porter.stem(w) for w in words]
    return stemmed

#return the keywords of each paper
def paper_keywords(data):
    paper = data['paper']
    paperid = list(paper["Id"])
    paper_keyword = defaultdict(list)

    paper= paper.set_index("Id")
    
    paper['Keyword']= paper['Keyword'].fillna("")
    paper['Title']= paper['Title'].fillna("")

    title = list(paper["Title"])
    cnt = 0
    start_time = time.time()
    titleTokens = []

    print("Start title!!!")
    for t in title:
        cnt += 1
        if (cnt % 100000 == 0):
            print("Count: ", cnt)
            print("Time: ", time.time() - start_time)
        titleTokens.append(tokenize(t))
    paper['Token'] = titleTokens


    #paper['Token'] = paper.Title.map(tokenize)
    print ("Start keyword!!!")

    keywords = list(paper['Keyword'])
    cnt2 = 0
    keywordTokens = []
    for k in keywords:
        cnt2 += 1
        if (cnt2 % 100000 == 0):
            print("Count: ", cnt)
            print("Time: ", time.time() - start_time)
        keywordTokens.append(filter_keyword(k))

    paper['Keyword_pro'] = keywordTokens

    print("Start concatenation!!!")

    #TODO: change all "apply", "map" functions to explicit for loop. Don't use "for loop" in the list because it causes memory limit error.
    #concatenate keyword and token
    keyToken = []
    for i in paperid:
        keyToken.append(list(set(paper.loc[i, 'Keyword_pro'] + paper.loc[i, 'Token'])))
    paper['Key_token'] = keyToken
    for i in paperid:
        paper_keyword[i] = paper.loc[i,'Key_token']

    pickle.dump(paper_keyword, open(data_io.get_paths()["paper_title_tokens"], 'wb'))
    return paper_keyword

#how similar two documents are based on keywords
def common_word(word1, word2):
    for i in word1:
        if i in word2:
            sim += 1
    return sim


def target_paper_and_papers_of_target_author_by_keywords(dataset,author_paper_pairs):
    paper_sim = defaultdict(int)
    trainset = dataset['paper_author']
    keyword = paper_keywords(dataset)
    
    for i in author_paper_pairs:
        trained_paper= list(trainset.loc[trainset["AuthorId"]== i[0], "PaperId"])
        paper_sim[i] = sum(common_word(keyword[i[1]],keyword[j]) for j in trained_paper)
    return paper_sim

def parse_paper_ids(paper_ids_string):
    return paper_ids_string.strip().split()

def parse_targetset(targetset):
    pair_list = []
    author_id_list = targetset['AuthorId']

    for i in range(len(author_id_list)):
        author_id = author_id_list[i]
        papers = targetset[targetset.AuthorId == author_id]['PaperIds'].unique()[0]
        papers = parse_paper_ids(papers)
        for j in range(len(papers)):
            paper_id = int(papers[j])
            pair_list.append( (author_id, paper_id) )
    return list(set(pair_list))

In [3]:
author_paper_pairs = parse_targetset(targetset)


In [4]:
thao_f3 = target_paper_and_papers_of_target_author_by_keywords(dataset,author_paper_pairs[:7])

In [5]:
thao_f3

defaultdict(int,
            {(205278, 1737961): 0,
             (433821, 1901940): 0,
             (1215636, 1791266): 0,
             (1455231, 467172): 0,
             (1539933, 1359549): 0,
             (1589984, 2059890): 0,
             (1794805, 1802876): 0})

In [3]:
def process_aff(text):
    for i in string.punctuation: text = text.replace(i,' ')
    words = word_tokenize(text) #split words
    words = [w.lower() for w in words if w.isalpha()] #get rid of punctuation
    words = [w for w in words if w not in list(stopword)]
    words = [w for w in words if w not in  ["institute","univ",'university',"college","department","science","technology",
                                            "de","engineer","lab","dept","falcuty",] ]
    stemmed = [porter.stem(w) for w in words]
    return stemmed
def author_affiliation(data):
    pa = data["paper_author"]
    affiliation = defaultdict(str)
    
    pa['Affiliation'] = pa['Affiliation'].fillna("")
    pa_1 = pd.DataFrame(pd.pivot_table(pa, values = "Affiliation",index = ["AuthorId"], aggfunc = "sum"))
    author = list(pa_1.index)
    for i in author:
        affiliation[i]= process_aff(pa_1.loc[i,"Affiliation"])
    return affiliation

# more efficient approach 
def target_author_and_coauthor_of_target_paper_by_affiliation(dataset,author_paper_pairs):
    pa = dataset["paper_author"]
    aff = author_affiliation(dataset)
    author_sim = defaultdict(int)
    for i in author_paper_pairs:
        coauthor= list(pa.loc[pa["PaperId"]== i[1], "AuthorId"])
        author_sim[i] = sum(common_word(aff[i[0]],aff[j]) for j in coauthor)
    return author_sim


In [29]:
author_affiliation(dataset)

defaultdict(str,
            {1229483: [],
             188421: ['programa',
              'de',
              'engenharia',
              'de',
              'sistema',
              'e',
              'copp',
              'universidad',
              'feder',
              'rio',
              'de',
              'janeiro',
              'brazil'],
             1245191: ['microsoft', 'research'],
             290824: [],
             1105929: ['faculti',
              'pharmaci',
              'tehran',
              'medic',
              'scienc',
              'tehran',
              'iran'],
             122892: [],
             1466383: [],
             757776: [],
             1024537: ['physic',
              'california',
              'lo',
              'angel',
              'hilgard',
              'ave',
              'lo',
              'angel',
              'ca',
              'usa'],
             2261013: [],
             1835034: ['dept', 'comput', 'columbia', 'usa

In [27]:
pa = dataset["paper_author"]
if True:
    affiliation = defaultdict(str)
    
    pa['Affiliation'] = pa['Affiliation'].fillna("")
    pa_1 = pd.DataFrame(pd.pivot_table(pa, values = "Affiliation",index = ["AuthorId"], aggfunc = "sum"))
    author = list(pa_1.index)
    for i in author:
        #print (type(pa_1.loc[i,"Affiliation"]))
        affiliation[i]= process(str(pa_1.loc[i,"Affiliation"]))


In [7]:
if True:
    pa['Affiliation'] = pa['Affiliation'].fillna("")
    pa_1 = pd.DataFrame(pd.pivot_table(pa, values = "Affiliation",index = ["AuthorId"], aggfunc = "sum"))
    author = list(pa_1.index)
    pa_1["Affiliation"]


In [12]:
list(pa_1.loc[pa_1["Affiliation"]!= "","Affiliation"])

['Chonbuk National University Hospital, Jeonju, Korea',
 'Department of Translational Oncology, National Center for Tumor Diseases and German Cancer Research Center (DKFZ)',
 'Istituto di Ricerche Farmacologiche â€˜Mario Negriâ€™, Via La Masa 19, 20156 Milano, ItalyMario Negri Institute Pharmacology Research',
 'Human Computer Interaction Institute and Institute for Complex Engineered Systems|Carnegie Mellon University',
 'Coordinated Science Laboratory|Department of Electrical and Computer Engineering|University of Illinois at Urbana-Champaign',
 'Department of Mathematics, MIT, Cambridge, MA 02139, USA',
 'Coordinated Science Laboratory|University of Illinois at Urbana-Champaign',
 'Institute Instrumentation Center and Centre of Nanotechnology, Indian Institute of Technology Roorkee, Roorkee-247667, India',
 'Information Security Institution|Sichuan University',
 'Department of Translational Oncology, National Center for Tumor Diseases and German Cancer Research Center (DKFZ)',
 'Dep

In [15]:
list(pa.index())

TypeError: 'RangeIndex' object is not callable

In [25]:
dict_paper = paper_keywords(dataset)

Start title!!!
Start keyword!!!
Start concatenation!!!


In [26]:
dict_paper

defaultdict(list,
            {1: ['mobil',
              'video',
              'captur',
              'real',
              'time',
              'video',
              'stitch',
              'stitch',
              'video',
              'stream',
              'mobil',
              'phone'],
             2: ['nonloc',
              'diffus',
              'asymptot',
              'behaviour',
              'nonloc',
              'equat'],
             3: ['area', 'effect', 'cepaea'],
             4: ['multipl',
              'patern',
              'natur',
              'popul',
              'salamand',
              'sperm',
              'storag'],
             5: ['complex', 'find', 'short', 'resolut', 'proof'],
             6: ['softwar',
              'system',
              'larg',
              'dynam',
              'map',
              'base',
              'network',
              'geograph',
              'databas'],
             7: ['natur', 'histori', 'psychoneu

In [18]:
dataset['paper']

Unnamed: 0,Id,Title,Year,ConferenceId,JournalId,Keyword
0,1,Stitching videos streamed by mobile phones in ...,2009,167,0,mobile video capturing|real-time|video stitching
1,2,A nonlocal convection–diffusion equation,2007,0,7234,Nonlocal diffusion; Convection–diffusion; Asym...
2,3,Area Effects in Cepaea,1963,0,16867,
3,4,Multiple paternity in a natural population of ...,2005,0,6130,
4,5,Complexity of Finding Short Resolution Proofs,1997,158,0,
5,6,A software system for large dynamic maps based...,2001,341,0,
6,7,Natural History of the Psychoneuroses,1959,0,17639,
7,8,145 GROWTH HORMONE RECEPTORS AND THE ONSET OF ...,1985,0,4084,
8,9,Hypermethylation of the <I>TPEF/HPP1</I> Gene ...,2005,0,3943,"Keywords: methylation, epigenetic, metastasis,..."
9,10,RUC Short-Range Ensemble Forecast System,0,0,0,
