In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
information_train = pd.read_csv('train/information_train.csv',delimiter='\t')
train = pd.read_csv('train/train.csv')

In [3]:
set_14_train = information_train[information_train.set==14]
set_14_train.reset_index(drop=True, inplace=True)
set_14_train = set_14_train.drop(['full_Text','author_str','set'], 1)
set_14_train.head()

Unnamed: 0,abstract,article_title,pmid,pub_date
0,"We describe the Phase II HapMap, which charact...",A second generation human haplotype map of ove...,17943122,2009-05-21
1,Here we present the first diploid genome seque...,The diploid genome sequence of an Asian indivi...,18987735,2009-07-21
2,Following recent success in genome-wide associ...,Deletion polymorphism upstream of IRGM associa...,19165925,2009-07-30
3,Bowtie: a new ultrafast memory-efficient tool ...,Ultrafast and memory-efficient alignment of sh...,19261174,2009-12-01
4,Motivation: A new protocol for sequencing the ...,TopHat: discovering splice junctions with RNA-Seq,19289445,2009-05-01


In [12]:
tfidf_vectorizer_title = TfidfVectorizer(analyzer='word',ngram_range=(1,3),stop_words='english',norm='l2',max_features=50)
X_title = tfidf_vectorizer_title.fit_transform(set_14_train['article_title'])

In [9]:
tfidf_vectorizer_abstract = TfidfVectorizer(analyzer='word',ngram_range=(1,2),stop_words='english',norm='l2',max_features=200)
X = tfidf_vectorizer_abstract.fit_transform(set_14_train['abstract'])

In [13]:
X_title[0]

<1x50 sparse matrix of type '<class 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [14]:
def merging_data_frame(particular_row_series,df_remaining_rows):
    
    duplicate_frequency =df_remaining_rows.shape[0] - 1
    df_remaining_rows = df_remaining_rows.rename(columns={'abstract': 'abstract_1', 'article_title': 'article_title_1','pmid': 'pmid_1','pub_date': 'pub_date_1'})
    df_particular_row = pd.DataFrame(particular_row_series).transpose()
    df_particular_row = df_particular_row.append([df_particular_row]*duplicate_frequency,ignore_index=True)
    

    df_particular_row.reset_index(drop=True, inplace=True)
    df_remaining_rows.reset_index(drop=True, inplace=True)

    result = pd.concat([df_particular_row, df_remaining_rows], axis=1)
    master_frames.append(result)
    

In [15]:
master_frames = []
for index,row in set_14_train.iterrows():
    merging_data_frame(set_14_train.iloc[index],set_14_train.drop(set_14_train.index[[index]]))
combined_df = pd.concat(master_frames,ignore_index=True)

combined_df.pub_date = pd.to_datetime(combined_df.pub_date)
combined_df.pub_date_1 = pd.to_datetime(combined_df.pub_date_1)

combined_df['differnce_days'] = combined_df['pub_date'] - combined_df['pub_date_1']
combined_df['differnce_days'] = combined_df['differnce_days'] / (np.timedelta64(1, 'D'))

combined_df = combined_df[combined_df > 0].dropna()
combined_df.reset_index(drop=True,inplace=True)

combined_df.head()


Unnamed: 0,abstract,article_title,pmid,pub_date,abstract_1,article_title_1,pmid_1,pub_date_1,differnce_days
0,"We describe the Phase II HapMap, which charact...",A second generation human haplotype map of ove...,17943122,2009-05-21,Motivation: A new protocol for sequencing the ...,TopHat: discovering splice junctions with RNA-Seq,19289445,2009-05-01,20.0
1,"We describe the Phase II HapMap, which charact...",A second generation human haplotype map of ove...,17943122,2009-05-21,The recycling of the amyloid precursor protein...,The neuronal sortilin-related receptor SORL1 i...,17220890,2009-01-16,125.0
2,"We describe the Phase II HapMap, which charact...",A second generation human haplotype map of ove...,17943122,2009-05-21,There is increasing evidence that genome-wide ...,"Genome-wide association study of 14,000 cases ...",17554300,2009-05-20,1.0
3,"We describe the Phase II HapMap, which charact...",A second generation human haplotype map of ove...,17943122,2009-05-21,The Wellcome Trust Case Control Consortium (WT...,Robust associations of four new chromosome reg...,17554260,2007-06-21,700.0
4,"We describe the Phase II HapMap, which charact...",A second generation human haplotype map of ove...,17943122,2009-05-21,TAR DNA-binding protein 43 (TDP-43) is a major...,TDP-43 in Familial and Sporadic Frontotemporal...,17591968,2007-07-01,690.0


In [16]:
def cosine_similarity_abstract(row):
    
    index_1 = set_14_train.pmid[set_14_train.pmid == row['pmid']].index.tolist()[0]
    index_2 = set_14_train.pmid[set_14_train.pmid == row['pmid_1']].index.tolist()[0]
    similarity_score = cosine_similarity(X[index_1],X[index_2])
    return similarity_score[0][0]
    

In [17]:
def cosine_similarity_title(row):
    
    index_1 = set_14_train.pmid[set_14_train.pmid == row['pmid']].index.tolist()[0]
    index_2 = set_14_train.pmid[set_14_train.pmid == row['pmid_1']].index.tolist()[0]
    similarity_score = cosine_similarity(X_title[index_1],X_title[index_2])
    return similarity_score[0][0]
    

In [18]:
combined_df['similarity_title'] = combined_df.apply(cosine_similarity_abstract,axis=1)
combined_df.head()

Unnamed: 0,abstract,article_title,pmid,pub_date,abstract_1,article_title_1,pmid_1,pub_date_1,differnce_days,similarity_title
0,"We describe the Phase II HapMap, which charact...",A second generation human haplotype map of ove...,17943122,2009-05-21,Motivation: A new protocol for sequencing the ...,TopHat: discovering splice junctions with RNA-Seq,19289445,2009-05-01,20.0,0.152535
1,"We describe the Phase II HapMap, which charact...",A second generation human haplotype map of ove...,17943122,2009-05-21,The recycling of the amyloid precursor protein...,The neuronal sortilin-related receptor SORL1 i...,17220890,2009-01-16,125.0,0.07212
2,"We describe the Phase II HapMap, which charact...",A second generation human haplotype map of ove...,17943122,2009-05-21,There is increasing evidence that genome-wide ...,"Genome-wide association study of 14,000 cases ...",17554300,2009-05-20,1.0,0.337733
3,"We describe the Phase II HapMap, which charact...",A second generation human haplotype map of ove...,17943122,2009-05-21,The Wellcome Trust Case Control Consortium (WT...,Robust associations of four new chromosome reg...,17554260,2007-06-21,700.0,0.138157
4,"We describe the Phase II HapMap, which charact...",A second generation human haplotype map of ove...,17943122,2009-05-21,TAR DNA-binding protein 43 (TDP-43) is a major...,TDP-43 in Familial and Sporadic Frontotemporal...,17591968,2007-07-01,690.0,0.073892


In [111]:
combined_df['similarity_abstract'] = combined_df.apply(cosine_similarity_title,axis=1)
combined_df.head()

Unnamed: 0,abstract,article_title,pmid,pub_date,abstract_1,article_title_1,pmid_1,pub_date_1,differnce_days,similarity_abstract
0,"We describe the Phase II HapMap, which charact...",A second generation human haplotype map of ove...,17943122,2009-05-21,Motivation: A new protocol for sequencing the ...,TopHat: discovering splice junctions with RNA-Seq,19289445,2009-05-01,20.0,0.015609
1,"We describe the Phase II HapMap, which charact...",A second generation human haplotype map of ove...,17943122,2009-05-21,The recycling of the amyloid precursor protein...,The neuronal sortilin-related receptor SORL1 i...,17220890,2009-01-16,125.0,0.011674
2,"We describe the Phase II HapMap, which charact...",A second generation human haplotype map of ove...,17943122,2009-05-21,There is increasing evidence that genome-wide ...,"Genome-wide association study of 14,000 cases ...",17554300,2009-05-20,1.0,0.049808
3,"We describe the Phase II HapMap, which charact...",A second generation human haplotype map of ove...,17943122,2009-05-21,The Wellcome Trust Case Control Consortium (WT...,Robust associations of four new chromosome reg...,17554260,2007-06-21,700.0,0.034388
4,"We describe the Phase II HapMap, which charact...",A second generation human haplotype map of ove...,17943122,2009-05-21,TAR DNA-binding protein 43 (TDP-43) is a major...,TDP-43 in Familial and Sporadic Frontotemporal...,17591968,2007-07-01,690.0,0.011453


In [141]:
def is_refernce(row):
    index_train = train.pmid[train.pmid == row['pmid']].index.tolist()[0]
    references_string = train.iloc[index_train]['ref_list']
    references_string_clean = references_string.replace('[','').replace(']','').replace('\'','').split(sep=',')
    references_list = list(map(int, references_string_clean))
    
    if row['pmid_1'] in references_list:
        return 1
    return 0
    

In [140]:
combined_df['is_reference'] = combined_df.apply(is_refernce,axis=1)
combined_df.head()

Unnamed: 0,abstract,article_title,pmid,pub_date,abstract_1,article_title_1,pmid_1,pub_date_1,differnce_days,similarity_abstract,is_reference
0,"We describe the Phase II HapMap, which charact...",A second generation human haplotype map of ove...,17943122,2009-05-21,Motivation: A new protocol for sequencing the ...,TopHat: discovering splice junctions with RNA-Seq,19289445,2009-05-01,20.0,0.015609,0
1,"We describe the Phase II HapMap, which charact...",A second generation human haplotype map of ove...,17943122,2009-05-21,The recycling of the amyloid precursor protein...,The neuronal sortilin-related receptor SORL1 i...,17220890,2009-01-16,125.0,0.011674,0
2,"We describe the Phase II HapMap, which charact...",A second generation human haplotype map of ove...,17943122,2009-05-21,There is increasing evidence that genome-wide ...,"Genome-wide association study of 14,000 cases ...",17554300,2009-05-20,1.0,0.049808,1
3,"We describe the Phase II HapMap, which charact...",A second generation human haplotype map of ove...,17943122,2009-05-21,The Wellcome Trust Case Control Consortium (WT...,Robust associations of four new chromosome reg...,17554260,2007-06-21,700.0,0.034388,0
4,"We describe the Phase II HapMap, which charact...",A second generation human haplotype map of ove...,17943122,2009-05-21,TAR DNA-binding protein 43 (TDP-43) is a major...,TDP-43 in Familial and Sporadic Frontotemporal...,17591968,2007-07-01,690.0,0.011453,0


In [144]:
combined_df_for_model = combined_df[['pmid','pmid_1','differnce_days','similarity_abstract','similarity_title','is_reference']]
combined_df_for_model.head()

Unnamed: 0,pmid,pmid_1,differnce_days,similarity_abstract,similarity_title,is_reference
0,17943122,19289445,20.0,0.015609,0.015609,0
1,17943122,17220890,125.0,0.011674,0.011674,0
2,17943122,17554300,1.0,0.049808,0.049808,1
3,17943122,17554260,700.0,0.034388,0.034388,0
4,17943122,17591968,690.0,0.011453,0.011453,0


In [None]:
predictors = combined_df_for_model.select_dtypes(['float64', 'int64', 'uint8']).columns.drop('is_reference')

import xgboost as xgb

y_train = combined_df_for_model.is_reference

xgb_params = {
   'eta': 0.3,
   'max_depth': 5,
   'subsample': 0.6,
   'colsample_bytree': 1,
   'lambda': 3,
   'objective': 'binary:logistic',
   'eval_metric': 'auc',
   'silent': 0
}

dtrain = xgb.DMatrix(data=combined_df_for_model[predictors], label= y_train)


num_rounds = 10000

model_cv = xgb.cv(xgb_params, dtrain, num_rounds, nfold=10, early_stopping_rounds=20, verbose_eval=1)
model = xgb.train(xgb_params, dtrain, num_boost_round = 44)
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

fig, ax = plt.subplots(1, 1, figsize=(16, 16))
xgb.plot_importance(model, height=0.5, ax=ax)

In [None]:
predictors = combined_df.select_dtypes(['float64', 'int64', 'uint8']).columns.drop('is_reference').drop('pmid_1')

import xgboost as xgb

y_train = combined_df.is_reference

xgb_params = {
   'eta': 0.3,
   'max_depth': 5,
   'subsample': 0.6,
   'colsample_bytree': 1,
   'lambda': 3,
   'objective': 'binary:logistic',
   'eval_metric': 'auc',
   'silent': 0
}

dtrain = xgb.DMatrix(data=combined_df[predictors], label= y_train)


num_rounds = 10000

model_cv = xgb.cv(xgb_params, dtrain, num_rounds, nfold=10, early_stopping_rounds=20, verbose_eval=1)
model = xgb.train(xgb_params, dtrain, num_boost_round = 44)
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

fig, ax = plt.subplots(1, 1, figsize=(16, 16))
xgb.plot_importance(model, height=0.5, ax=ax)\\\\\\\\\\\\\\\\\\\\\\\\\\\\