In [141]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Two input tables: `train` is comma-separated, `information_train` is tab-separated.
train = pd.read_csv('train/train.csv', sep=',')
information_train = pd.read_csv('train/information_train.csv', sep='\t')

In [20]:
# Fit one TF-IDF vocabulary per text field over every row of information_train.
# Titles: word 1- to 3-grams, at most 50 features.
tfidf_vectorizer_title = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 3),
    stop_words='english',
    norm='l2',
    max_features=50,
)
title_vectors = tfidf_vectorizer_title.fit_transform(information_train['article_title'])

# Abstracts: word 1- to 2-grams, at most 200 features.
tfidf_vectorizer_abstract = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 2),
    stop_words='english',
    norm='l2',
    max_features=200,
)
abstract_vectors = tfidf_vectorizer_abstract.fit_transform(information_train['abstract'])

In [7]:
# Distinct values of the `set` column, in value_counts() order (most frequent first).
train_set_numbers = list(information_train['set'].value_counts().index)
# test_set_numbers = list(information_test.set.value_counts().index)

In [28]:
def get_set_wise_data(set_number, max_rows=20, information=None):
    """Return the article rows that belong to one set.

    Parameters
    ----------
    set_number
        Value of the ``set`` column to select.
    max_rows : int or None, default 20
        Keep at most this many leading rows of the set; ``None`` keeps all of
        them. The default reproduces the previous hard-coded cap.
    information : pandas.DataFrame, optional
        Table to read from. Defaults to the module-level ``information_train``.

    Returns
    -------
    pandas.DataFrame
        A new frame with columns ``abstract``, ``article_title``, ``pmid`` and
        ``pub_date`` and a fresh 0..n-1 index. The source table is not modified.
    """
    if information is None:
        information = information_train

    wanted_columns = ['abstract', 'article_title', 'pmid', 'pub_date']
    information_for_this_set = information.loc[information['set'] == set_number, wanted_columns]
    if max_rows is not None:
        information_for_this_set = information_for_this_set.head(max_rows)

    # Non-inplace reset: returns a new object instead of mutating a filtered slice.
    return information_for_this_set.reset_index(drop=True)

In [29]:
# Map each set number to its (capped) article frame.
data_set_wise_train = {
    each_set: get_set_wise_data(each_set)
    for each_set in train_set_numbers
}

In [30]:
# Preview the first five articles of set 16.
data_set_wise_train[16].head(n=5)

Unnamed: 0,abstract,article_title,pmid,pub_date
0,1996 is polio awareness year. This paper revie...,Poliomyelitis.,8944203,1996-11-01
1,The aim of this study was to determine the app...,General primer-mediated polymerase chain react...,1370845,1992-01-01
2,We describe a rapid method for extraction and ...,Rapid diagnosis of enterovirus infection by ma...,8380182,1993-01-01
3,"We have developed a simple, rapid, and reliabl...",Rapid and simple method for purification of nu...,1691208,1990-03-01
4,Enteroviruses were specifically detected in cr...,Specific detection of enteroviruses in clinica...,2155917,1990-02-01


In [31]:
def merging_data_frame(particular_row_series, df_remaining_rows):
    """Pair one article row with every row of another frame, side by side.

    Parameters
    ----------
    particular_row_series : pandas.Series
        A single row; its index labels become the left-hand columns.
    df_remaining_rows : pandas.DataFrame
        Rows to pair with. Its ``abstract``, ``article_title``, ``pmid`` and
        ``pub_date`` columns are renamed with a ``_1`` suffix.

    Returns
    -------
    pandas.DataFrame
        One output row per row of ``df_remaining_rows``: the repeated left-hand
        row followed by the renamed right-hand columns, indexed 0..n-1. An
        empty ``df_remaining_rows`` yields an empty frame with all columns.
    """
    renamed_rows = df_remaining_rows.rename(columns={
        'abstract': 'abstract_1',
        'article_title': 'article_title_1',
        'pmid': 'pmid_1',
        'pub_date': 'pub_date_1',
    }).reset_index(drop=True)

    # Repeat the single row by positional selection; DataFrame.append no longer
    # exists in pandas 2.x, and this also handles zero partner rows cleanly.
    single_row_frame = pd.DataFrame(particular_row_series).transpose()
    repeated_row_frame = single_row_frame.iloc[[0] * len(renamed_rows)].reset_index(drop=True)

    return pd.concat([repeated_row_frame, renamed_rows], axis=1)
    

In [32]:
# For every set, build all ordered (article, other article) pairs and keep the
# pairs in which the first article was published strictly after the second.
combined_data_train = {}
for each_set in train_set_numbers:
    this_set_data = data_set_wise_train[each_set]
    # this_set_data has a 0..n-1 index, so positions and labels coincide.
    master_frames = [
        merging_data_frame(
            this_set_data.iloc[position],
            this_set_data.drop(this_set_data.index[[position]]),
        )
        for position in range(len(this_set_data))
    ]

    combined_df = pd.concat(master_frames, ignore_index=True)
    combined_df['pub_date'] = pd.to_datetime(combined_df['pub_date'])
    combined_df['pub_date_1'] = pd.to_datetime(combined_df['pub_date_1'])
    # Publication gap in (fractional) days.
    combined_df['differnce_days'] = (
        (combined_df['pub_date'] - combined_df['pub_date_1']) / np.timedelta64(1, 'D')
    )
    # Filter on the numeric gap only: comparing the whole frame to 0 also
    # compares the text and datetime columns, which raises TypeError on
    # current pandas. dropna() still discards pairs with any missing field.
    combined_df = combined_df[combined_df['differnce_days'] > 0].dropna()
    combined_df = combined_df.reset_index(drop=True)

    combined_data_train[each_set] = combined_df


In [43]:
def _vector_row_for_pmid(pmid):
    """Return the positional row of ``pmid`` within ``information_train``.

    The TF-IDF matrices were fitted on ``information_train`` in row order, so
    the position (not the index label) is what addresses their rows.

    Raises IndexError if the pmid does not occur in ``information_train``.
    """
    positions = np.flatnonzero((information_train['pmid'] == pmid).values)
    if positions.size == 0:
        raise IndexError('pmid {!r} not found in information_train'.format(pmid))
    # First match wins when a pmid occurs more than once.
    return int(positions[0])


def _pair_similarity(row, vectors):
    """Cosine similarity between the ``vectors`` rows of ``row['pmid']`` and ``row['pmid_1']``."""
    first = _vector_row_for_pmid(row['pmid'])
    second = _vector_row_for_pmid(row['pmid_1'])
    similarity_score = cosine_similarity(vectors[first], vectors[second])
    return similarity_score[0][0]


def cosine_similarity_abstract(row):
    """Cosine similarity of the abstract TF-IDF vectors of the two articles in ``row``.

    ``row`` must provide ``pmid`` and ``pmid_1``; both are looked up in
    ``information_train`` and compared using ``abstract_vectors``.
    """
    return _pair_similarity(row, abstract_vectors)


def cosine_similarity_title(row):
    """Cosine similarity of the title TF-IDF vectors of the two articles in ``row``.

    ``row`` must provide ``pmid`` and ``pmid_1``; both are looked up in
    ``information_train`` and compared using ``title_vectors``.
    """
    return _pair_similarity(row, title_vectors)

In [44]:
# Attach both text-similarity features to every per-set pair frame.
for each_set in train_set_numbers:
    pair_frame = combined_data_train[each_set]
    pair_frame['title_similarity'] = pair_frame.apply(cosine_similarity_title, axis=1)
    pair_frame['abstract_similarity'] = pair_frame.apply(cosine_similarity_abstract, axis=1)

In [47]:
# Preview two article pairs of set 16 with their similarity features.
combined_data_train[16].head(n=2)

Unnamed: 0,abstract,article_title,pmid,pub_date,abstract_1,article_title_1,pmid_1,pub_date_1,differnce_days,title_similarity,abstract_similarity
0,1996 is polio awareness year. This paper revie...,Poliomyelitis.,8944203,1996-11-01,The aim of this study was to determine the app...,General primer-mediated polymerase chain react...,1370845,1992-01-01,1766.0,0.0,0.063952
1,1996 is polio awareness year. This paper revie...,Poliomyelitis.,8944203,1996-11-01,We describe a rapid method for extraction and ...,Rapid diagnosis of enterovirus infection by ma...,8380182,1993-01-01,1400.0,0.0,0.113354


In [48]:
def is_refernce(row, reference_table=None):
    """Return 1 if ``row['pmid_1']`` is among the references of ``row['pmid']``, else 0.

    Parameters
    ----------
    row
        Mapping-like object (e.g. a DataFrame row) with ``pmid`` and ``pmid_1``.
    reference_table : pandas.DataFrame, optional
        Table with ``pmid`` and ``ref_list`` columns. Defaults to the
        module-level ``train`` frame.

    ``ref_list`` is parsed as a bracketed, comma-separated list of integer ids
    whose items may be single-quoted. An empty list (``'[]'``) or a non-string
    value (such as a missing cell) is treated as "no references".

    Raises IndexError if ``row['pmid']`` does not occur in the table.
    """
    if reference_table is None:
        reference_table = train

    # Select by boolean mask so the lookup does not depend on the table's index labels.
    matching_lists = reference_table.loc[reference_table['pmid'] == row['pmid'], 'ref_list']
    references_string = matching_lists.iloc[0]
    if not isinstance(references_string, str):
        return 0

    tokens = references_string.replace('[', '').replace(']', '').replace('\'', '').split(',')
    # Skip blank tokens so an empty list does not reach int('').
    references = {int(token) for token in tokens if token.strip()}

    return 1 if row['pmid_1'] in references else 0

In [49]:
# Label every pair: 1 when the second article is in the first article's reference list.
for each_set in train_set_numbers:
    pair_frame = combined_data_train[each_set]
    pair_frame['is_reference'] = pair_frame.apply(is_refernce, axis=1)

In [50]:
# Preview the labelled pairs of set 16.
combined_data_train[16].head(n=5)

Unnamed: 0,abstract,article_title,pmid,pub_date,abstract_1,article_title_1,pmid_1,pub_date_1,differnce_days,title_similarity,abstract_similarity,is_reference
0,1996 is polio awareness year. This paper revie...,Poliomyelitis.,8944203,1996-11-01,The aim of this study was to determine the app...,General primer-mediated polymerase chain react...,1370845,1992-01-01,1766.0,0.0,0.063952,1
1,1996 is polio awareness year. This paper revie...,Poliomyelitis.,8944203,1996-11-01,We describe a rapid method for extraction and ...,Rapid diagnosis of enterovirus infection by ma...,8380182,1993-01-01,1400.0,0.0,0.113354,1
2,1996 is polio awareness year. This paper revie...,Poliomyelitis.,8944203,1996-11-01,"We have developed a simple, rapid, and reliabl...",Rapid and simple method for purification of nu...,1691208,1990-03-01,2437.0,0.0,0.0,0
3,1996 is polio awareness year. This paper revie...,Poliomyelitis.,8944203,1996-11-01,Enteroviruses were specifically detected in cr...,Specific detection of enteroviruses in clinica...,2155917,1990-02-01,2465.0,0.0,0.050657,0
4,1996 is polio awareness year. This paper revie...,Poliomyelitis.,8944203,1996-11-01,Enteroviruses are among the most common causes...,Enzymatic RNA amplification of the enteroviruses.,2157735,1990-03-01,2437.0,0.0,0.0,0


In [144]:
# Numeric columns of the set-16 pair frame, minus the label and the partner id.
numeric_columns = combined_data_train[16].select_dtypes(include=['float64', 'int64', 'uint8']).columns
predictors = numeric_columns.drop(['is_reference', 'pmid_1'])
predictors

Index(['differnce_days', 'title_similarity', 'abstract_similarity'], dtype='object')

In [None]:
predictors = combined_data_train[16].select_dtypes(['float64', 'int64', 'uint8']).columns.drop('is_reference').drop('pmid_1')


y_train = combined_data_train[16].is_reference

xgb_params = {
    'eta': 0.3,
    'max_depth': 5,
    'subsample': 0.6,
    'colsample_bytree': 1,
    'lambda': 3,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'silent': 0
}

dtrain = xgb.DMatrix(data=combined_data_train[16][predictors], label= y_train)


num_rounds = 10000

model_cv = xgb.cv(xgb_params, dtrain, num_rounds, nfold=10, early_stopping_rounds=20, verbose_eval=1)
model = xgb.train(xgb_params, dtrain, num_boost_round = 44)
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

fig, ax = plt.subplots(1, 1, figsize=(16, 16))
xgb.plot_importance(model, height=0.5, ax=ax)

In [155]:
def building_models(set_number):
    """Fit a logistic regression on one set's article pairs and pickle it.

    Features are ``title_similarity``, ``abstract_similarity`` and
    ``differnce_days``; the target is ``is_reference``, all read from
    ``combined_data_train[set_number]``. The fitted model is written to
    ``model_<set_number>_.sav`` in the current working directory.

    Returns
    -------
    LogisticRegression
        The fitted model, so callers can use it without re-loading the pickle.
    """
    this_set_data = combined_data_train[set_number]
    logisticRegr = LogisticRegression()

    x_train = this_set_data[['title_similarity', 'abstract_similarity', 'differnce_days']]
    y_train = this_set_data['is_reference']
    logisticRegr.fit(x_train, y_train)

    filename = 'model_' + str(set_number) + '_.sav'
    # Context manager closes the handle even if pickling fails.
    with open(filename, 'wb') as model_file:
        pickle.dump(logisticRegr, model_file)

    return logisticRegr

    

In [142]:
# Round-trip a fitted model through pickle and report its accuracy.
# NOTE(review): no cell visible here assigns `logisticRegr` or `x_train` at top
# level (building_models keeps both local), so this cell appears to depend on
# leftover kernel state — confirm it survives Restart & Run All.
filename = 'finalized_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(logisticRegr, model_file)

# pickle.load can execute arbitrary code; only load files this notebook wrote.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
result = loaded_model.score(x_train, y_train)
print(result)

0.824468085106


In [124]:
def get_references_list(set_number):
    """List, per article of a set, the partner articles labelled as its references.

    Reads ``combined_data_train[set_number]``, keeps the pairs whose
    ``is_reference`` equals 1 and groups the ``pmid_1`` values by ``pmid``.

    Returns a DataFrame with one row per ``pmid`` and a ``references`` column
    holding the list of its ``pmid_1`` values.
    """
    pairs = combined_data_train[set_number]
    referenced_pairs = pairs.loc[pairs.is_reference == 1, ['pmid', 'pmid_1']]
    references_by_pmid = referenced_pairs.groupby('pmid')['pmid_1'].apply(list)

    return pd.DataFrame({
        'pmid': references_by_pmid.index,
        'references': references_by_pmid.values,
    })

In [126]:
# Reference listing for every training set, keyed by set number.
aggregate_references_list = {
    each_set: get_references_list(each_set)
    for each_set in train_set_numbers
}

In [127]:
# Preview the reference listing of set 14.
aggregate_references_list[14].head(n=5)

Unnamed: 0,pmid,references
0,17003490,[1713021]
1,17554300,[17554260]
2,17579875,"[17591968, 16718704, 17003490]"
3,17591968,[17003490]
4,17943122,[17554300]


In [125]:
# Ad-hoc reference listing for set 16: pmid -> list of pmid_1 labelled as references.
sample_df = (
    combined_data_train[16]
    .loc[lambda pairs: pairs.is_reference == 1, ['pmid', 'pmid_1']]
    .groupby('pmid')['pmid_1']
    .apply(list)
)
result = pd.DataFrame({'pmid': sample_df.index, 'references': sample_df.values})
result.head()