In [4]:
import pandas as pd
import numpy
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
stopword = set(stopwords.words('english'))
porter = PorterStemmer()
paper = pd.read_csv(r"C:\Users\Admin\Downloads\KDD-Cup\dataRev\Paper.csv")[:2000]
paper= paper.set_index("Id")
paper['Keyword']= paper['Keyword'].fillna("")
paper['Title']= paper['Title'].fillna("")
title = list(paper["Title"])
paper.head()

Unnamed: 0_level_0,Title,Year,ConferenceId,JournalId,Keyword
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Stitching videos streamed by mobile phones in ...,2009,167,0,mobile video capturing|real-time|video stitching
2,A nonlocal convection–diffusion equation,2007,0,7234,Nonlocal diffusion; Convection–diffusion; Asym...
3,Area Effects in Cepaea,1963,0,16867,
4,Multiple paternity in a natural population of ...,2005,0,6130,
5,Complexity of Finding Short Resolution Proofs,1997,158,0,


In [5]:
word_tokenize(paper.loc[1,'Keyword'])

['mobile', 'video', 'capturing|real-time|video', 'stitching']

In [6]:
#preprocessing keywords
def filter_keyword(text):
    for i in string.punctuation: text = text.replace(i,' ')
    words = word_tokenize(text) #split words
    words = [w.lower() for w in words if w.isalpha()] #get rid of punctuation
    words = [w for w in words if w not in  ["keywords"] ]
    stemmed = [porter.stem(w) for w in words]
    return stemmed
paper['Keyword_pro'] = paper['Keyword'].map(filter_keyword)
    

In [7]:


def tokenize(text):
    #text = text.split() # get single words 
    #table = maketrans('','',string.punctuation)
    #stripped = [w.translate(table).lower() for w in text]#get rid of all punctuation
    words = word_tokenize(text) #split words
    words = [w.lower() for w in words if w.isalpha()] #get rid of punctuation
    words =[w for w in words if  not w in stopword]
    stemmed = [porter.stem(w) for w in words]
    return stemmed
paper['Token'] = paper.Title.map(tokenize)


In [8]:
#concatenate keyword and token
paper['Key_token'] = paper[['Keyword_pro','Token']].apply((lambda x: ' '.join(list(set([i for z in x for i in z])))), axis =1)

In [9]:
token = list(paper['Key_token'])
count = CountVectorizer(min_df = 5)
tfidf = TfidfTransformer()
count_token =count.fit_transform(token).toarray() #2000*527
#tfid_token = tfidf.fit_transform(count_token)
vocab = list(count.vocabulary_.keys())



In [10]:
#list of common words in each title of each document
paper['Common word'] = paper['Key_token'].map(lambda x: [i for i in x.split() if i in vocab])
paper.loc[:5,'Common word']
    

Id
1    [video, mobil, real, stream, time]
2                       [diffus, equat]
3                        [area, effect]
4       [natur, storag, popul, multipl]
5              [find, resolut, complex]
Name: Common word, dtype: object

In [11]:
paper.head()

Unnamed: 0_level_0,Title,Year,ConferenceId,JournalId,Keyword,Keyword_pro,Token,Key_token,Common word
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,Stitching videos streamed by mobile phones in ...,2009,167,0,mobile video capturing|real-time|video stitching,"[mobil, video, captur, real, time, video, stitch]","[stitch, video, stream, mobil, phone]",video phone mobil real stream time captur stitch,"[video, mobil, real, stream, time]"
2,A nonlocal convection–diffusion equation,2007,0,7234,Nonlocal diffusion; Convection–diffusion; Asym...,"[nonloc, diffus, asymptot, behaviour]","[nonloc, equat]",asymptot diffus behaviour nonloc equat,"[diffus, equat]"
3,Area Effects in Cepaea,1963,0,16867,,[],"[area, effect, cepaea]",area effect cepaea,"[area, effect]"
4,Multiple paternity in a natural population of ...,2005,0,6130,,[],"[multipl, patern, natur, popul, salamand, sper...",sperm salamand natur storag popul multipl patern,"[natur, storag, popul, multipl]"
5,Complexity of Finding Short Resolution Proofs,1997,158,0,,[],"[complex, find, short, resolut, proof]",short proof find resolut complex,"[find, resolut, complex]"


In [12]:
#similarity between two document based on their keywords
def paper_common_word(id1, id2):
    sim = 0
    word1 = paper.loc[id1,'Common word']
    word2 = paper.loc[id2, 'Common word']
    for i in word1:
        if i in word2:
            sim += 1
    return sim

In [42]:
years = list(set(paper['Year']))
print (years)

[0, 1920, 1922, 1926, 1934, 1946, 1949, 1952, 1953, 1954, 1956, 1957, 1959, 1960, 1961, 1962, 1963, 1964, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1918, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 1889, -1]


In [49]:
#similarity between two documents based on their pulished year
def paper_year(id1, id2):
    year1 = paper.loc[id1,'Year']
    year2 = paper.loc[id2, 'Year']
    if (year1 == 0 | year2 == 0 | year1 == -1 | year2 == -1 ): return 0
    return 1/(1+ abs(year1 - year2 ) )# can be adjusted to find the best formula

In [52]:
word_weight =1
year_weight = 1
journal_weight =1
conference_weight =1
def paper_sim(id1, id2):
    word_sim = paper_common_word(id1, id2)
    year_sim = paper_year(id1, id2)
    journal_sim = paper.loc[id1,'JournalId'] == paper.loc[id2, 'JournalId']
    conference_sim = paper.loc[id1,'ConferenceId'] == paper.loc[id2, 'ConferenceId']
    return word_sim* word_weight + year_weight* year_sim +journal_sim * journal_weight + conference_sim * conference_weight


In [53]:
paper_sim(5,45)

0.20000000000000001