In [1]:
import numpy as np
import pandas as pd
import gensim
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cosine
from nltk.corpus import stopwords

### Loading Word2Vec Model

In [2]:
%%time
model = gensim.models.KeyedVectors.load_word2vec_format('Data/PubMed-w2v.bin', 
                                                         binary=True)

FileNotFoundError: [Errno 2] No such file or directory: 'Data/PubMed-w2v.bin'

In [10]:
vocab = model.vocab.keys()
print(len(vocab))

2351706


### Loading Required Files & Tables

In [15]:
nct_ids_by_year = pickle.load(open("Data/ids_by_year_fda_reg_with_pos.pkl","rb"))

In [None]:
studies = pd.read_csv('Data/Studies.csv',
                      index_col=0)

In [12]:
studies_sel = studies[studies['is_fda_regulated_drug']==True].copy()
print(studies_sel.shape)

(13646, 64)


In [13]:
studies_sel['study_first_submitted_date'] = pd.to_datetime(studies_sel['study_first_submitted_date'],
                                                           format='%Y-%m-%d')

In [14]:
studies_sel = studies_sel[studies_sel['study_first_submitted_date'].dt.year.isin(range(2014,2019))]
print(studies_sel.shape)

(11484, 64)


### Word2Vec Similarity Matrix Helper Functions

In [24]:
stop_list = stopwords.words('english')
stop_list.extend(("a", "an","the","with","ii","iii","iv","non",
                  "studies","use","study","multiple", "single", "double",
                  "this","these","those","care","effect","health","patients",
                  "trial","treatment","versus","clinical","clinic","controlled","control"))

In [17]:
def nlp_pipeline(series,stop_list):
    series = series.apply(lambda x: x.strip().lower().split(' '))
    series = series.apply(lambda x: [elem.strip(string.punctuation) for elem in x])
    series = series.apply(lambda x: [elem.strip() for elem in x if (elem not in stop_list) and (elem!='')])
    return series

In [54]:
def gen_vec_from_list(str_list,model):
    vec_list=[]
    elem_list = []
    for elem in str_list:
        try:
            curr_vec = model[elem]
            vec_list.append(curr_vec)
            elem_list.append(elem)
        except:
            continue
    return pd.DataFrame(data=vec_list,
                        index=elem_list).apply(np.mean,
                                               axis=0).values

### word2vec based similarity from `official_title`

In [106]:
official_title_series = studies_sel['official_title'].copy()
official_title_series.shape

(11484,)

In [107]:
official_title_series.fillna('',inplace=True)

In [108]:
official_title_series = nlp_pipeline(series=official_title_series,
                                     stop_list=stop_list)

In [112]:
%%time
official_title_vec_series = official_title_series.apply(lambda x: gen_vec_from_list(str_list=x,
                                                                                    model=model))

CPU times: user 4min 21s, sys: 950 ms, total: 4min 22s
Wall time: 4min 23s


In [131]:
#handling null vectors
official_title_vec_series = official_title_vec_series.apply(lambda x: np.array(np.zeros(shape=(200,))) if len(x)==0 else x)
official_title_vec_series.shape

(11484,)

In [154]:
official_title_vec_df = pd.DataFrame.from_items(zip(official_title_vec_series.index, 
                                                    official_title_vec_series.values))
official_title_vec_df.shape

(200, 11484)

In [161]:
official_title_vec_df = official_title_vec_df.T

In [33]:
chosen_ids = studies_sel['nct_id'].tolist()

In [140]:
official_title_series.index = chosen_ids
official_title_vec_series.index = chosen_ids
official_title_vec_df.index = chosen_ids

In [29]:
id_list_ordered_by_year = {year:[elem[0] for elem in sorted(nct_ids_by_year[year].items(),
                                                            key=lambda kv:kv[1])] for year in range(2014,2019)}

In [30]:
year_list = list(range(2014,2019))

In [166]:
%%time

word2vec_off_tit_sim_mat_by_year = {}
for year in year_list:
    word2vec_off_tit_sim_mat_by_year[year] = pd.DataFrame(data=cosine_similarity(official_title_vec_df.loc[id_list_ordered_by_year[year]]),
                                                          index=id_list_ordered_by_year[year],
                                                          columns=id_list_ordered_by_year[year])

CPU times: user 814 ms, sys: 183 ms, total: 997 ms
Wall time: 641 ms


In [167]:
[word2vec_off_tit_sim_mat_by_year[year].shape for year in year_list]

[(709, 709), (963, 963), (1643, 1643), (4750, 4750), (3419, 3419)]

In [180]:
# pickle.dump(word2vec_off_tit_sim_mat_by_year,
#             open('Output/sim_mat_off_t_word2vec.pkl',"wb"),
#             protocol=2)

### word2vec based similarity from `keywords`

In [27]:
keywords = pd.read_csv('/Users/harsheelsoin/Documents/Capstone_Project_Goldman_Sachs/Data/data_by_tables/Keywords.csv',
                       index_col=0)
keywords.shape

(816715, 4)

In [67]:
keywords_sel = keywords[keywords['nct_id'].isin(chosen_ids)].copy()
keywords_sel.shape

(29631, 4)

In [69]:
keywords_sel = keywords_sel.groupby('nct_id').apply(lambda x: x['downcase_name'].tolist()).reset_index()
keywords_sel.shape

(6275, 2)

In [70]:
keywords_sel.columns = ['nct_id','keywords']

In [71]:
keywords_sel['keywords'] = keywords_sel['keywords'].apply(lambda x: ' '.join(x))

In [72]:
keywords_sel.set_index('nct_id',inplace=True)

In [73]:
keywords_sel = pd.Series(data=keywords_sel['keywords'].values,
                         index=keywords_sel.index)
keywords_sel.shape

(6275,)

In [75]:
keywords_sel = nlp_pipeline(series=keywords_sel,
                            stop_list=stop_list)

In [77]:
%%time
keywords_sel = keywords_sel.apply(lambda x: gen_vec_from_list(str_list=x,
                                                              model=model))

CPU times: user 2min 28s, sys: 1.06 s, total: 2min 29s
Wall time: 2min 32s


In [86]:
all_ids = []
for year in year_list:
    all_ids += id_list_ordered_by_year[year]

len(all_ids)

11484

In [110]:
keywords_sel_all_ids = pd.concat(objs=[pd.Series(data=all_ids,
                                                 index=all_ids),
                                       keywords_sel],
                                 axis=1)
keywords_sel_all_ids.shape

(11484, 2)

In [111]:
keywords_sel_all_ids.drop(labels=[0],
                          axis=1,
                          inplace=True)

In [112]:
#addressing null vectors
keywords_sel_all_ids[1] = keywords_sel_all_ids[1].apply(lambda x: np.array(np.zeros(shape=(200,))) if ((isinstance(x,float)) or (len(x)==0)) else x)


In [114]:
keywords_vec_df = pd.DataFrame.from_items(zip(keywords_sel_all_ids.index, 
                                                    keywords_sel_all_ids[1].values)).T
keywords_vec_df.shape

(11484, 200)

In [116]:
%%time

word2vec_keywords_sim_mat_by_year = {}
for year in year_list:
    word2vec_keywords_sim_mat_by_year[year] = pd.DataFrame(data=cosine_similarity(keywords_vec_df.loc[id_list_ordered_by_year[year]]),
                                                          index=id_list_ordered_by_year[year],
                                                          columns=id_list_ordered_by_year[year])

CPU times: user 812 ms, sys: 241 ms, total: 1.05 s
Wall time: 874 ms


In [121]:
# pickle.dump(word2vec_keywords_sim_mat_by_year,
#             open("Output/sim_mat_keyword_word2vec.pkl","wb"),
#             protocol=2)

### word2vec based similarity from `detailed descriptions`

In [123]:
detailed_descriptions = pd.read_csv('/Users/harsheelsoin/Documents/Capstone_Project_Goldman_Sachs/Data/data_by_tables/Detailed_Descriptions.csv',
                                    index_col=0)
detailed_descriptions.shape

(183881, 3)

In [125]:
detailed_descriptions_sel = detailed_descriptions[detailed_descriptions['nct_id'].isin(chosen_ids)].copy()
detailed_descriptions_sel.shape

(7230, 3)

In [128]:
detailed_descriptions_sel = pd.Series(data=detailed_descriptions_sel['description'].values,
                                      index=detailed_descriptions_sel['nct_id'].tolist())
detailed_descriptions_sel.shape

(7230,)

In [129]:
detailed_descriptions_sel = nlp_pipeline(series=detailed_descriptions_sel,
                                         stop_list=stop_list)
detailed_descriptions_sel.shape

(7230,)

In [131]:
%%time
detailed_descriptions_sel = detailed_descriptions_sel.apply(lambda x: gen_vec_from_list(str_list=x,
                                                                                        model=model))

CPU times: user 3min 56s, sys: 4.77 s, total: 4min 1s
Wall time: 4min 12s


In [132]:
detailed_desc_sel_all_ids = pd.concat(objs=[pd.Series(data=all_ids,
                                                     index=all_ids),
                                       detailed_descriptions_sel],
                                      axis=1)
detailed_desc_sel_all_ids.shape

(11484, 2)

In [133]:
detailed_desc_sel_all_ids.drop(labels=[0],
                              axis=1,
                              inplace=True)

In [134]:
#addressing null vectors
detailed_desc_sel_all_ids[1] = detailed_desc_sel_all_ids[1].apply(lambda x: np.array(np.zeros(shape=(200,))) if ((isinstance(x,float)) or (len(x)==0)) else x)


In [135]:
detailed_desc_vec_df = pd.DataFrame.from_items(zip(detailed_desc_sel_all_ids.index, 
                                              detailed_desc_sel_all_ids[1].values)).T
detailed_desc_vec_df.shape

(11484, 200)

In [136]:
%%time

word2vec_detailed_desc_sim_mat_by_year = {}
for year in year_list:
    word2vec_detailed_desc_sim_mat_by_year[year] = pd.DataFrame(data=cosine_similarity(detailed_desc_vec_df.loc[id_list_ordered_by_year[year]]),
                                                          index=id_list_ordered_by_year[year],
                                                          columns=id_list_ordered_by_year[year])

CPU times: user 807 ms, sys: 243 ms, total: 1.05 s
Wall time: 1.09 s


In [137]:
# pickle.dump(word2vec_detailed_desc_sim_mat_by_year,
#             open("Output/sim_mat_d_desc_word2vec.pkl","wb"),
#             protocol=2)