In [1]:
import pickle
import data_io
import pandas as pd
import numpy as np
from feature_set import *
from collections import defaultdict
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
# NLTK helpers shared by the text-processing functions defined further down.
stopword = set(stopwords.words('english'))
porter = PorterStemmer()

#from Thao_features import *

# Load the input tables. Only the first 2000 rows of Paper.csv are kept
# (presumably to keep experimentation fast -- TODO confirm before a full run).
dataset = {'paper': pd.read_csv('dataRev2/Paper.csv').iloc[:2000]}
trainset = pd.read_csv('dataRev2/Train.csv')

# The target set is the confirmed-paper column of the training table, renamed
# to the generic 'PaperIds' column name.
train_confirmed = trainset[['AuthorId', 'ConfirmedPaperIds']].rename(
    columns={'ConfirmedPaperIds': 'PaperIds'}
)
targetset = train_confirmed
dataset['paper_author'] = pd.read_csv('dataRev2/PaperAuthor1.csv')

In [2]:
def filter_keyword(text):
    """Turn a paper's raw keyword string into a list of stemmed words.

    Every character in ``string.punctuation`` is replaced by a space, the
    result is tokenized with NLTK, and only purely alphabetic tokens are kept.
    Tokens are lower-cased, the literal word "keywords" is dropped, and the
    remaining words are Porter-stemmed.

    Returns the stemmed words in their original order (duplicates kept).
    """
    cleaned = text
    for mark in string.punctuation:
        cleaned = cleaned.replace(mark, ' ')

    kept = []
    for token in word_tokenize(cleaned):
        if not token.isalpha():
            continue  # numbers and mixed tokens are discarded
        lowered = token.lower()
        if lowered == "keywords":
            continue  # field label that leaks into the keyword text
        kept.append(porter.stem(lowered))
    return kept

def tokenize(text):
    """Turn a paper title into a list of stemmed, stop-word-free words.

    The title is tokenized with NLTK; purely alphabetic tokens are kept,
    lower-cased, filtered against the English stop-word set and
    Porter-stemmed.

    ``text`` may be a UTF-8 encoded byte string or an already-decoded text
    string. Returns the stemmed words in their original order.
    """
    # Decode only byte strings. Calling .decode() unconditionally raises
    # AttributeError on Python 3 str, and on Python 2 unicode it triggers an
    # implicit ASCII encode that fails for non-ASCII titles.
    if isinstance(text, bytes):
        text = text.decode("utf8")

    words = word_tokenize(text)
    words = [w.lower() for w in words if w.isalpha()]  # drop punctuation/numbers
    words = [w for w in words if w not in stopword]
    return [porter.stem(w) for w in words]

def paper_keywords(data, min_df=5):
    """Map every paper id to the frequent words of its title and keywords.

    Parameters
    ----------
    data : dict
        Must contain ``data['paper']``, a DataFrame with the columns
        ``Id``, ``Title`` and ``Keyword``. The frame is not modified.
    min_df : int, optional
        Passed to ``CountVectorizer``: a word is kept only if it occurs in
        at least ``min_df`` papers (document frequency >= min_df).

    Returns
    -------
    collections.defaultdict(list)
        ``paper id -> list of words`` of that paper that belong to the
        fitted vocabulary. Unknown ids yield an empty list.
    """
    # set_index returns a new frame, so data['paper'] itself stays untouched.
    paper = data['paper'].set_index("Id")

    title_tokens = paper['Title'].fillna("").map(tokenize)
    keyword_tokens = paper['Keyword'].fillna("").map(filter_keyword)

    # One space-joined "document" per paper containing each distinct
    # title/keyword stem exactly once.
    key_token = [
        ' '.join(set(kw) | set(tok))
        for kw, tok in zip(keyword_tokens, title_tokens)
    ]

    # Only the vocabulary is needed, so fit without materialising the
    # (papers x vocabulary) count matrix.
    count = CountVectorizer(min_df=min_df)
    count.fit(key_token)
    vocab = set(count.vocabulary_)  # set: O(1) membership tests below

    paper_keyword = defaultdict(list)
    for paper_id, doc in zip(paper.index, key_token):
        paper_keyword[paper_id] = [w for w in doc.split() if w in vocab]
    return paper_keyword

def paper_common_word(data, id1, id2, paper_keyword=None):
    """Count the words of paper ``id1`` that also occur in paper ``id2``.

    Parameters
    ----------
    data : dict
        Dataset passed to ``paper_keywords`` when no table is supplied.
    id1, id2 : hashable
        Paper ids to compare.
    paper_keyword : mapping, optional
        Precomputed ``paper id -> list of words`` table, as returned by
        ``paper_keywords``. Pass it when comparing many pairs: building the
        table refits the vectorizer over every paper, so recomputing it on
        each call is very expensive. When omitted it is computed from
        ``data``, as before.

    Returns
    -------
    int
        Number of entries in ``id1``'s word list that appear in ``id2``'s
        word list. Ids missing from the table count as having no words.
    """
    if paper_keyword is None:
        paper_keyword = paper_keywords(data)

    # .get() avoids inserting missing ids into a caller-supplied defaultdict.
    words1 = paper_keyword.get(id1, [])
    words2 = set(paper_keyword.get(id2, []))
    return sum(1 for word in words1 if word in words2)


def target_paper_and_papers_of_target_author_by_keywords(dataset,author_paper_pairs):
    """Keyword-overlap feature between a target paper and its author's papers.

    For every ``(author_id, paper_id)`` pair, sums the number of words of the
    target paper that also occur in each paper listed for that author in
    ``dataset['paper_author']`` (columns ``AuthorId`` and ``PaperId``).

    Parameters
    ----------
    dataset : dict
        Must contain ``'paper'`` (read by ``paper_keywords``) and
        ``'paper_author'``.
    author_paper_pairs : iterable of tuple
        ``(author_id, paper_id)`` pairs; each pair is used as a dict key.

    Returns
    -------
    collections.defaultdict(int)
        ``pair -> summed word overlap``.
    """
    paper_sim = defaultdict(int)
    paper_author = dataset['paper_author']

    # Build the keyword table once. Recomputing it for every compared pair of
    # papers refits the vectorizer each time, which is prohibitively slow.
    paper_keyword = paper_keywords(dataset)

    papers_by_author = {}  # author id -> list of that author's paper ids
    word_sets = {}         # paper id -> set of its words

    for pair in author_paper_pairs:
        author_id, paper_id = pair[0], pair[1]

        if author_id not in papers_by_author:
            papers_by_author[author_id] = list(
                paper_author.loc[paper_author["AuthorId"] == author_id, "PaperId"]
            )

        target_words = paper_keyword.get(paper_id, [])
        total = 0
        for other_id in papers_by_author[author_id]:
            if other_id not in word_sets:
                word_sets[other_id] = set(paper_keyword.get(other_id, []))
            other_words = word_sets[other_id]
            total += sum(1 for word in target_words if word in other_words)
        paper_sim[pair] = total
    return paper_sim

def parse_paper_ids(paper_ids_string):
    """Split a whitespace-separated string of paper ids into id strings.

    A missing value -- ``None`` or the float NaN that pandas produces for an
    empty CSV cell -- yields an empty list instead of raising AttributeError.

    Returns a list of str (the ids are not converted to int here).
    """
    if paper_ids_string is None:
        return []
    # NaN is the only float that is not equal to itself.
    if isinstance(paper_ids_string, float) and paper_ids_string != paper_ids_string:
        return []
    return paper_ids_string.strip().split()

def parse_targetset(targetset):
    """Expand the target table into unique ``(author_id, paper_id)`` pairs.

    Parameters
    ----------
    targetset : pandas.DataFrame
        Has an ``AuthorId`` column and a ``PaperIds`` column holding
        whitespace-separated paper ids.

    Returns
    -------
    list of tuple
        Distinct ``(author_id, int(paper_id))`` pairs, in arbitrary order.

    Notes
    -----
    If an author appears in several rows, only the ``PaperIds`` of the first
    such row is used, matching the previous behaviour.
    """
    pairs = set()
    seen_authors = set()

    # Iterate positionally over the two columns. Label-based lookups such as
    # series[i] fail when the frame's index is not 0..n-1 (e.g. after
    # filtering), and re-filtering the frame for every row is quadratic.
    for author_id, paper_ids in zip(targetset['AuthorId'], targetset['PaperIds']):
        if author_id in seen_authors:
            continue
        seen_authors.add(author_id)
        for paper_id in parse_paper_ids(paper_ids):
            pairs.add((author_id, int(paper_id)))
    return list(pairs)

In [3]:
# Expand the target table into a list of distinct (author_id, paper_id) pairs.
author_paper_pairs = parse_targetset(targetset)

In [None]:
# NOTE(review): no function with this exact name is defined in the cells
# above; the local definition is
# `target_paper_and_papers_of_target_author_by_keywords` (no "confirmed").
# Unless `from feature_set import *` provides this name, this cell raises
# NameError -- confirm which function is intended.
thao_f3 = target_paper_and_confirmed_papers_of_target_author_by_keywords(dataset,author_paper_pairs)