In [1]:
import codecs
import itertools
import json
import os
from itertools import cycle, islice
from operator import itemgetter

import numpy as np
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.metrics.pairwise import cosine_similarity

import sif_embedding_wrapper
import utils

In [2]:
# from gensim.models.keyedvectors import KeyedVectors

# model = KeyedVectors.load_word2vec_format('/home/stirunag/pre-trained_word_embeddings/PubMed-and-PMC-w2v.bin', binary=True)
# model.save_word2vec_format('/home/stirunag/pre-trained_word_embeddings/PubMed-and-PMC-w2v.txt', binary=False)

# words, embs, weight4ind = sif_embedding_wrapper.load_embeddings("/home/stirunag/pre-trained_word_embeddings/PubMed-and-PMC-FS.txt", 
#                                                      '/home/stirunag/pre-trained_word_embeddings/wiki/enwiki_vocab_min200.txt')

# Load the GloVe word vectors together with the enwiki vocabulary file.
# NOTE(review): both paths are absolute and machine-specific; presumably the
# second file supplies the word statistics behind weight4ind -- confirm
# against sif_embedding_wrapper.load_embeddings.
embedding_file = "/home/stirunag/pre-trained_word_embeddings/glove/glove.6B.300d.txt"
vocab_file = '/home/stirunag/pre-trained_word_embeddings/wiki/enwiki_vocab_min200.txt'
words, embs, weight4ind = sif_embedding_wrapper.load_embeddings(embedding_file, vocab_file)

In [3]:
# Datasets directory, one level above the current working directory.
# '__file__' is a string literal here (notebooks do not define __file__), so
# os.path.dirname() returns '' and the path is resolved relative to the cwd.
data_path = os.path.abspath(os.path.join(os.path.dirname( '__file__' ), '..', 'Datasets'))+'/'

# Although the files are named .csv they are tab-delimited, and they have
# encoding problems, e.g.
#   UnicodeDecodeError: 'utf-8' codec can't decode byte 0xfc in position 2: invalid start byte
# so each file is opened through `codecs` with an explicit decoding-error policy
# before being handed to pandas.


def _load_euadr(name, label, errors='strict'):
    """Read one tab-delimited EUADR corpus file and tag every row with a class.

    Parameters
    ----------
    name : str
        File stem inside ``EUADR_Corpus_IBIgroup/`` (without ``.csv``).
    label : str
        Value written to the new ``CLASS`` column for every row.
    errors : str
        Decoding-error policy passed to ``codecs.open`` (``'strict'`` raises
        on invalid UTF-8, ``'ignore'`` drops the offending bytes).

    Returns
    -------
    pandas.DataFrame
        The parsed file with an added ``CLASS`` column.
    """
    path = data_path + 'EUADR_Corpus_IBIgroup/' + name + '.csv'
    # Plain 'r' instead of 'rU': the 'U' flag is rejected by Python 3.11+.
    # The context manager closes the handle once pandas has parsed the file.
    with codecs.open(path, 'r', 'UTF-8', errors=errors) as handle:
        frame = pd.read_csv(handle, sep='\t', na_filter=False)
    frame['CLASS'] = label
    return frame


EUADR_drug_target = _load_euadr('EUADR_drug_target', 'drug_gene')
# Only this file is read with errors='ignore', as in the original cell.
EUADR_target_disease = _load_euadr('EUADR_target_disease', 'gene_disease', errors='ignore')
EUADR_drug_disease = _load_euadr('EUADR_drug_disease', 'drug_disease')



In [4]:
# Stack the three EUADR corpora into one frame. pd.concat replaces the chained
# DataFrame.append calls, which no longer exist in pandas 2.x.
df = pd.concat(
    [EUADR_drug_target, EUADR_target_disease, EUADR_drug_disease],
    sort=False,
)
# reset_index() without drop=True gives a fresh 0..n-1 index and keeps the old
# per-file row labels as a leading 'index' column, exactly as before.
df = df.reset_index()

In [5]:
# Embed every EUADR sentence and store one embedding per row in df["vector"].
doc_embeddings = sif_embedding_wrapper.sentences2vecs(
    df["SENTENCE"], embs, words, weight4ind
)
# The Series gets a default 0..n-1 index and is aligned to df's index on assignment.
df["vector"] = pd.Series([*doc_embeddings])


In [6]:
# Map each row label of df to its true relation class.
ground_truth = dict(zip(df.index, df['CLASS']))

# ground_truth

# inv_map = {}
# for k, v in ground_truth.items():
#     inv_map[v] = inv_map.get(v, [])
#     inv_map[v].append(k)
    
# inv_map  

In [7]:
# Distinct relation classes, in order of first appearance in df.
categories = df["CLASS"].unique().tolist()
categories

['drug_gene', 'gene_disease', 'drug_disease']

In [8]:
# Fit an LDA topic model (one topic per relation class) on the sentences so
# that the top-performing sentences of each topic can be picked out later.

# --- tunable settings -------------------------------------------------------
min_text_length = 80      # sentences shorter than this (in characters) are skipped
max_iter = 150
batch_size = 512
learning_offset = 300.
n_topics = len(categories)
n_features = 50000

docs = df

# Row labels of sentences that are too short, and the frame without them.
too_short = docs["SENTENCE"].map(len) < min_text_length
unclassifiable = list(docs[too_short].index)
filtered = docs[~docs.index.isin(unclassifiable)]
ids = list(filtered.index)[0:10]

# TF-IDF features over the remaining sentences.
# NOTE(review): LDA is usually fitted on raw term counts; TF-IDF is kept here
# because it is what the recorded results were produced with -- confirm intent.
tf_vectorizer = TfidfVectorizer(
    stop_words='english', max_df=0.95, min_df=0.1, max_features=n_features
)
tf = tf_vectorizer.fit_transform(list(filtered.loc[:, 'SENTENCE']))

lda = LatentDirichletAllocation(
    n_components=n_topics,
    max_iter=max_iter,
    batch_size=batch_size,
    learning_method='online',
    learning_offset=learning_offset,
    random_state=0,
)

# fit() returns the estimator itself, so this is fit followed by transform.
doc_topics = lda.fit(tf).transform(tf)

In [9]:
# Group documents under their dominant LDA topic, strongest first.
topic_leaders = {"topic_{}".format(t): [] for t in range(n_topics)}
doc_id = filtered.index

for row_pos, topic_probs in enumerate(doc_topics):
    # doc_topics rows follow the row order of `filtered`.
    doc_id = filtered.index[row_pos]
    best_topic = np.argmax(topic_probs)
    topic_leaders["topic_{}".format(best_topic)].append(
        {"doc_id": doc_id, "score": max(topic_probs)}
    )

# Stable in-place sort of each topic's members by descending score.
for topic_key in topic_leaders:
    topic_leaders[topic_key].sort(key=itemgetter('score'), reverse=True)
        
    

# topic_leaders

In [11]:
# Bucket every LDA-scored sentence by its ground-truth class and keep, per
# class, the ones whose dominant-topic score is strictly greater than 0.66.

sentences = {c: [] for c in categories}
selected_sentences = {c: [] for c in categories}
sentences_with_score = {c: [] for c in categories}

for ranked_docs in topic_leaders.values():
    for entry in ranked_docs:
        doc_key, topic_score = entry['doc_id'], entry['score']
        true_class = ground_truth[doc_key]
        sentences[true_class].append(doc_key)
        sentences_with_score[true_class].append(topic_score)
        if topic_score > 0.66:
            selected_sentences[true_class].append(doc_key)
    
# selected_sentences

In [12]:
# Get average/mean of the sentence vectors that represent our topics 
category_vecs = {}
for c in categories:
    vectors = np.asarray(list(df.loc[df.index.isin(selected_sentences[c])].vector))
    category_vecs[c] = np.mean(vectors, axis=0)

    
category_vecs    

{'drug_gene': array([-9.21763579e-02,  6.48885958e-02,  6.85458034e-02, -7.99785523e-04,
         3.01569100e-02,  3.40098047e-02,  3.58546743e-02, -6.11273501e-02,
         4.65268562e-02, -4.79792556e-01, -6.38321862e-02, -7.82524306e-02,
        -1.78122382e-01, -8.08226686e-02,  9.03406505e-02, -1.08390579e-01,
        -5.26091398e-02, -4.97958569e-02, -1.56377439e-01,  3.29611904e-02,
        -2.63228520e-01, -1.18416566e-01, -3.56999115e-02, -4.10079990e-02,
        -3.57352800e-03, -1.79631564e-03,  1.00609849e-01, -1.13228309e-01,
        -8.49533862e-02, -6.12611405e-03, -5.09281553e-02, -1.36621885e-03,
        -2.50830594e-02, -1.39602233e-01,  2.37592623e-01,  9.86697789e-02,
         7.96388733e-02,  3.36417889e-02, -2.40634884e-02,  3.43490317e-03,
        -2.54291986e-02,  2.20848544e-01, -8.73416769e-02, -3.24884805e-01,
        -7.41249685e-02, -3.34264984e-02,  6.33560895e-02, -7.05350135e-02,
         6.05506355e-02,  4.36142824e-02, -4.70595265e-02,  2.83882513e-02,

In [13]:
# Predict a class for each LDA-selected sentence by nearest class centroid
# (cosine similarity). Only rows listed in selected_sentences are scored, i.e.
# the same sentences the centroids were averaged from.

predictions = {}

# A set makes the membership test below O(1); with a list every row of df
# triggered a linear scan.
selected_idx = {
    doc for docs_in_class in selected_sentences.values() for doc in docs_in_class
}

for idx, row in df.iterrows():
    if idx not in selected_idx:
        continue
    row_vec = row["vector"].reshape(1, -1)
    # Similarities that are not strictly positive leave the label 'Unknown'.
    max_sim = 0
    winner = 'Unknown'
    for j in category_vecs:
        sim = cosine_similarity(row_vec, category_vecs[j].reshape(1, -1)).flatten()[0]
        if sim > max_sim:
            max_sim = sim
            winner = j
    predictions[idx] = winner
    

In [14]:
def get_accuracy_score(predictions, truth_dict):
    """Score predictions against ground truth and list the mistakes.

    Despite the name, the score is the weighted F1 (``f1_score`` with
    ``average='weighted'``), not accuracy.

    Parameters
    ----------
    predictions : dict
        Maps a row key to the predicted label.
    truth_dict : dict
        Maps a row key to the true label; must contain every key of
        ``predictions`` (a missing key raises ``KeyError``).

    Returns
    -------
    tuple
        ``(f1, mis_classified, mis_pred)``: the weighted F1, the keys whose
        prediction differs from the truth, and the wrong predictions as
        strings, in the same order as those keys.
    """
    keys = list(predictions)
    preds = [predictions[key] for key in keys]
    labels = [truth_dict[key] for key in keys]

    mis_classified = [key for key, guess, true in zip(keys, preds, labels) if guess != true]
    mis_pred = [str(predictions[key]) for key in mis_classified]

    return f1_score(labels, preds, average='weighted'), mis_classified, mis_pred


# miss_classified_df holds row labels (a list), not a DataFrame.
score, miss_classified_df, miss_pred = get_accuracy_score(predictions, ground_truth)
score

0.8450575089370786

In [15]:
# None means "no truncation"; the legacy value -1 is deprecated/rejected by
# current pandas.
pd.set_option('display.max_colwidth', None)

# Select by column name instead of position (12, 13) so the table does not
# depend on the column layout of the input files. df has a 0..n-1 index, so
# the collected row labels select the same rows as positions did.
# .copy() makes the new column assignment act on an independent frame.
miss_calssified_df = df.loc[miss_classified_df, ['SENTENCE', 'CLASS']].copy()
miss_calssified_df['Predicted-CLASS'] = miss_pred
miss_calssified_df

Unnamed: 0,SENTENCE,CLASS,Predicted-CLASS
11,The expression of ABCG2 may explain in part the ineffectiveness of doxorubicin as a single modality treatment for anaplastic thyroid cancer or for treatment of metastatic follicular thyroid cancer.,drug_gene,drug_disease
14,Oral administration of GSK221149A (5 mg/kg) was effective in inhibiting oxytocin-induced uterine contractions after single and multiple (4-day) dosing.,drug_gene,drug_disease
15,Intravenous administration of GSK221149A produced a dose-dependent decrease in oxytocin-induced uterine contractions in anesthetized rats with an ID(50) = 0.27 +/- 0.60 mg/kg (corresponding plasma concentrations were 88 ng/ml).,drug_gene,drug_disease
34,"We found for the first time, to our knowledge, that regulation of TRPV6 gene expression by SCFA may be a molecular mechanism involved in the promotion of calcium absorption by FOS in rats.",drug_gene,gene_disease
161,"Furthermore, peripheral administration of SHA 68 in mice (50 mg/kg i.p.) is able to antagonize NPS-induced horizontal and vertical activity as well as stereotypic behavior.",drug_gene,gene_disease
181,These results suggest that FLT3/TKD+ and FLT3/WT cases should not be differentiated when considering patients for treatment with FLT3 inhibitors.,drug_gene,drug_disease
184,These results suggest that FLT3/TKD+ and FLT3/WT cases should not be differentiated when considering patients for treatment with FLT3 inhibitors.,drug_gene,drug_disease
189,Decreased plasma soluble RAGE in patients with hypercholesterolemia: effects of statins.,drug_gene,drug_disease
192,This increase in CXCR4 levels on CML progenitor cells was likewise found in samples from CML patients treated with imatinib or IFN-alpha.,drug_gene,drug_disease
194,This increase in CXCR4 levels on CML progenitor cells was likewise found in samples from CML patients treated with imatinib or IFN-alpha.,drug_gene,drug_disease


In [16]:
# Results directory, one level above the working directory (same '__file__'
# string-literal convention as data_path), with a trailing slash.
results_dir = os.path.join(os.path.dirname( '__file__' ), '..', 'Results')
result_path = os.path.abspath(results_dir) + '/'

# Persist the misclassified training sentences.
train_misses_csv = result_path + 'miss_predictions_train.csv'
miss_calssified_df.to_csv(train_misses_csv)

In [17]:
# Generalisation on the unseen GAD dataset.

# Tab-delimited file; undecodable bytes are dropped (errors='ignore').
# Plain 'r' replaces 'rU' (the 'U' flag is rejected by Python 3.11+), and the
# context manager closes the handle once pandas has parsed the file.
gad_csv = data_path+'GAD_Corpus_IBIgroup/'+'GAD_Y_N'+'.csv'
with codecs.open(gad_csv, 'r', 'UTF-8', errors='ignore') as doc_t_d:
    GAD_target_disease = pd.read_csv(doc_t_d, sep='\t', na_filter = False)

# Every GAD row is labelled gene_disease.
# NOTE(review): the file name suggests it also carries Y/N association flags;
# confirm that labelling all rows gene_disease is intended.
GAD_target_disease['CLASS'] = 'gene_disease'


# GAD_target_disease.head(10)


In [18]:
# Embed every GAD conclusion sentence and store one embedding per row.
doc_embeddings = sif_embedding_wrapper.sentences2vecs(
    GAD_target_disease["GAD_CONCLUSION"], embs, words, weight4ind
)
GAD_target_disease["vector"] = pd.Series([*doc_embeddings])

In [19]:
# True class per GAD row.
test_ground_truth = dict(zip(GAD_target_disease.index, GAD_target_disease['CLASS']))

# Nearest-centroid prediction; a class must beat a cosine similarity of 0.60,
# otherwise the sentence is labelled 'unknown'.
test_predictions = {}

for row_label, gad_row in GAD_target_disease.iterrows():
    sentence_vec = gad_row["vector"].reshape(1, -1)
    best_label, best_sim = 'unknown', 0.60
    for label, centroid in category_vecs.items():
        similarity = cosine_similarity(sentence_vec, centroid.reshape(1, -1)).flatten()[0]
        if similarity > best_sim:
            best_label, best_sim = label, similarity
    test_predictions[row_label] = best_label

In [20]:
# Reuses get_accuracy_score from the training-set evaluation cell earlier in
# this notebook; keeping a single definition avoids one copy silently
# shadowing another.
# Every true label here is 'gene_disease' (assigned when GAD was loaded), while
# predictions may also be other classes or 'unknown'.
score, miss_classified, miss_pred = get_accuracy_score(test_predictions, test_ground_truth)
score

  'recall', 'true', average, warn_for)


0.9193672839506173

In [21]:
# Misclassified GAD sentences with their true and predicted class.
# Columns are selected by name rather than by position (10, 11);
# GAD_target_disease keeps its default 0..n-1 index, so the collected row
# labels select the same rows as positions did. .copy() makes the new column
# assignment act on an independent frame.
miss_calssified_df_test = GAD_target_disease.loc[miss_classified, ['GAD_CONCLUSION', 'CLASS']].copy()
miss_calssified_df_test['Predicted-CLASS'] = miss_pred
miss_calssified_df_test

Unnamed: 0,GAD_CONCLUSION,CLASS,Predicted-CLASS
11,These results suggest that the GSTT1 null genotype may be associated with increased risk of stomach cancer.,gene_disease,drug_disease
14,"These results suggest that the GSTT1 null-genotype is associated with an increased risk of lung cancer, especially in younger individuals.",gene_disease,drug_disease
15,"These results support previous reports that the GSTM1 null genotype is associated with a modest increase in risk for lung cancer, particularly among heavy smokers, suggest no role for GSTT1 and the need for further study of GSTP1.",gene_disease,drug_disease
29,"These results support previous reports that the GSTM1 null genotype is associated with a modest increase in risk for lung cancer, particularly among heavy smokers, suggest no role for GSTT1 and the need for further study of GSTP1.",gene_disease,drug_disease
31,The results suggest that the GSTM1 null genotype is a risk factor for development of oral cancer among Indian tobacco habitues.,gene_disease,drug_disease
39,"Because our samples provided quite high power, these results indicate that GSK3B may not play a major role in Japanese schizophrenia.",gene_disease,unknown
41,Our results partially support the previous studies in other ethnic groups and indicate that the GRM3 gene may play an important role in the etiology of schizophrenia in the Han Chinese.,gene_disease,unknown
52,We conclude that GRIK2 does not play a major role in the pathogenesis of schizophrenia in the Japanese population.,gene_disease,unknown
53,We conclude that GRIK2 does not play a major role in the pathogenesis of schizophrenia in the Japanese population.,gene_disease,unknown
54,We conclude that GRIK1 does not play a major role in schizophrenia pathogenesis in the Japanese population.,gene_disease,unknown


In [22]:
# Persist the misclassified GAD sentences next to the training ones.
test_misses_csv = result_path + 'miss_predictions_test.csv'
miss_calssified_df_test.to_csv(test_misses_csv)

In [23]:
# Final model: merge the EUADR and GAD sentences, leaving out every row that
# the first-pass centroids misclassified.
# NOTE(review): miss_classified is rebound later by the EBI evaluation cell, so
# this cell must run before that one (or after a fresh Run All).

final_data_1 = df.loc[~df.index.isin(miss_classified_df), ['SENTENCE', 'CLASS']]

# rename() returns a new frame, so nothing is modified through a slice.
final_data_2 = (
    GAD_target_disease
    .loc[~GAD_target_disease.index.isin(miss_classified), ['GAD_CONCLUSION', 'CLASS']]
    .rename(columns={"GAD_CONCLUSION": "SENTENCE"})
)

# pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
# reset_index() keeps the old row labels as an 'index' column, as before.
final_data = pd.concat([final_data_1, final_data_2], sort=False).reset_index()


doc_embeddings = sif_embedding_wrapper.sentences2vecs(final_data["SENTENCE"], embs, words, weight4ind)
final_data["vector"] = pd.Series(list(doc_embeddings))

In [24]:
# Rebind ground_truth to the merged data: row label -> true class.
ground_truth = dict(zip(final_data.index, final_data['CLASS']))

# Inverse mapping: class -> list of row labels, in row order.
inv_map = {}
for row_label, class_name in ground_truth.items():
    inv_map.setdefault(class_name, []).append(row_label)
    
# inv_map  

In [25]:
# Class centroids for the final model: element-wise mean of all sentence
# vectors belonging to each class in final_data.

categories = final_data["CLASS"].unique().tolist()
print(categories)


category_vecs_ = {}
for label in categories:
    members = final_data.index.isin(inv_map[label])
    member_vectors = np.asarray(list(final_data.loc[members, "vector"]))
    category_vecs_[label] = np.mean(member_vectors, axis=0)


category_vecs_

['drug_gene', 'gene_disease', 'drug_disease']


{'drug_gene': array([-5.84117234e-02,  3.42540026e-02,  5.86275959e-02,  2.07618254e-03,
         2.33752728e-02,  1.72464896e-02,  1.78745927e-02, -4.66200741e-02,
         5.90745575e-02, -4.31095539e-01, -7.26349942e-02, -6.03674348e-02,
        -1.60335489e-01, -8.53735657e-02,  6.49005325e-02, -1.15476460e-01,
        -2.35345464e-02, -2.77327931e-02, -1.68155634e-01,  3.88388822e-02,
        -2.32686643e-01, -9.11115533e-02, -5.43350307e-02, -4.33539593e-02,
         1.04753265e-02,  1.45093727e-04,  1.13827275e-01, -8.90614002e-02,
        -6.02734619e-02,  1.90411707e-02, -5.22390855e-02, -3.82286398e-02,
        -6.66095452e-02, -1.24380990e-01,  2.64208687e-01,  7.58140137e-02,
         8.43522905e-02,  5.63492098e-02, -4.07970214e-02,  1.49501832e-02,
        -1.91310809e-02,  2.21602231e-01, -9.61367133e-02, -2.83895174e-01,
        -8.72709651e-02, -4.57258904e-02,  4.65556744e-02, -8.06998022e-02,
         7.16336345e-02,  1.86782703e-02, -3.22412416e-02,  2.62200102e-02,

In [26]:
# Test new sentence

test_sample = 'This study assessed associations between the CYP4F2 gene and myocardial infarction (MI), using a haplotype-based case-control study of 234 MI patients and 248 controls genotyped for 5 single-nucleotide polymorphisms (rs3093105, rs3093135, rs1558139, rs2108622, rs3093200).'
# test_sample = 'Assessment of 1177 human immunodeficiency virus (HIV) resistance genotypes at an HIV/AIDS clinic showed a decrease in the incidence of the K65R mutation, from 15.2% of isolates during the period 2002-2004 to 2.7% of isolates during the period 2005-2006 (P < .001), despite elevated and stable rates of tenofovir use.'
# test_sample = 'Doxorubicin-induced DNA damage was also specifically abolished by the proteasome inhibitors bortezomib and MG132 and much reduced in top2beta(-/-) mouse embryonic fibroblasts (MEF) compared with TOP2beta(+/+) MEFs, suggesting the involvement of proteasome and DNA topoisomerase IIbeta (Top2beta).'
# test_sample = 'SLC9A6 at Xq26.3 (Gilfillan et al., 2008)X-linked mental retardation'
# test_sample = 'DLBCL was identified by a microenvironment gene expression signature and is associated with increased expression of inflammatory mediators, such as multiple components of the T-cell receptor (TCR), molecules associated with T/NK-cell activation and the complement cascade, downstream targets of IFNγ'

# Cosine similarity of the sample sentence to each first-pass class centroid.
test_embedding = sif_embedding_wrapper.sentences2vecs([test_sample], embs, words, weight4ind)
query_vec = test_embedding.reshape(1, -1)

sim = {
    label: cosine_similarity(query_vec, centroid.reshape(1, -1)).flatten()[0]
    for label, centroid in category_vecs.items()
}

sim
    

{'drug_gene': 0.5332289395940306,
 'gene_disease': 0.6619145197549249,
 'drug_disease': 0.5203751751376042}

In [27]:
# Test new sentence

test_sample = 'This study assessed associations between the CYP4F2 gene and myocardial infarction (MI), using a haplotype-based case-control study of 234 MI patients and 248 controls genotyped for 5 single-nucleotide polymorphisms (rs3093105, rs3093135, rs1558139, rs2108622, rs3093200).'
# test_sample = 'Assessment of 1177 human immunodeficiency virus (HIV) resistance genotypes at an HIV/AIDS clinic showed a decrease in the incidence of the K65R mutation, from 15.2% of isolates during the period 2002-2004 to 2.7% of isolates during the period 2005-2006 (P < .001), despite elevated and stable rates of tenofovir use.'
# test_sample = 'Doxorubicin-induced DNA damage was also specifically abolished by the proteasome inhibitors bortezomib and MG132 and much reduced in top2beta(-/-) mouse embryonic fibroblasts (MEF) compared with TOP2beta(+/+) MEFs, suggesting the involvement of proteasome and DNA topoisomerase IIbeta (Top2beta).'
# test_sample = 'SLC9A6 at Xq26.3 (Gilfillan et al., 2008)X-linked mental retardation'
# test_sample = 'DLBCL was identified by a microenvironment gene expression signature and is associated with increased expression of inflammatory mediators, such as multiple components of the T-cell receptor (TCR), molecules associated with T/NK-cell activation and the complement cascade, downstream targets of IFNγ'

# Cosine similarity of the sample sentence to each final-model class centroid.
test_embedding = sif_embedding_wrapper.sentences2vecs([test_sample], embs, words, weight4ind)
query_vec = test_embedding.reshape(1, -1)

sim = {
    label: cosine_similarity(query_vec, centroid.reshape(1, -1)).flatten()[0]
    for label, centroid in category_vecs_.items()
}

sim
    

{'drug_gene': 0.5644978737188249,
 'gene_disease': 0.5471364768402079,
 'drug_disease': 0.5181398727083739}

In [59]:
# Testing on the 30 papers dataset.
# (`json` is imported with the other modules in the first cell.)

# JSON text is UTF-8; state it explicitly so that non-ASCII characters in the
# sentences do not depend on the platform's default encoding.
with open(data_path+'EBI Standard/'+'rel_data.json', encoding='utf-8') as json_file:
    data = json.load(json_file)

# Flatten {key: [ {'sent': ..., 'rel': ...}, ... ]} into one record per sentence.
EBI_standard = [
    {'SENTENCE': each_sent['sent'], 'CLASS': each_sent['rel']}
    for each_point in data
    for each_sent in data[each_point]
]

# Pin the column order so the frame and the exported CSV look the same on
# every pandas version (older versions sorted dict keys, newer ones do not).
EBI_standard_temp = pd.DataFrame(EBI_standard, columns=['CLASS', 'SENTENCE'])

# Collapse the relation codes: YGD -> gene_disease, NGD and AMB -> other.
EBI_standard_temp['CLASS'] = EBI_standard_temp['CLASS'].apply(lambda x: x.replace('YGD', 'gene_disease').replace('NGD', 'other').replace('AMB', 'other'))
EBI_standard_temp.to_csv(data_path+'EBI_bronze_standard.csv')

Unnamed: 0,CLASS,SENTENCE
0,other,"As most tobacco control programs in Bangladesh target mainly cigarette or bidi smoking, coordinated programs are needed that will also include SLT use within the tobacco control policy and prevention strategies."
1,other,"SLT can cause oral cancer [4], [5] and nicotine addiction [6] and is associated with several other health conditions including oral pain [7], cardiovascular diseases [8], hypertension [9], diabetes [10], loss in bone density [11], and problems during pregnancy and following childbirth [12]."
2,other,"Also, currently most anti-tobacco campaigns in Bangladesh focuses on cigarette or bidi smoking without any information about SLT use."
3,other,This increased likelihood of SLT use is related to the social acceptance of SLT use by the older people and a greater appeal of cigarette among the younger generations who may be taking up smoking instead of SLT use.
4,other,"Use of SLT may provide a supplementation or substitution of nicotine intake for cigarette smokers [31] encouraging their continued use; this scenario could be particularly true in situations in which smoking is not socially acceptable, but SLT use is."
5,other,"As most tobacco control programs target mainly cigarette or bidi smoking, coordinated programs are needed that will also include SLT use within the tobacco control policy and prevention strategies."
6,other,"Interleukin (IL)-1β, IL-4, IL-6, IL-10, IL-12, IL-13, IL-17, interferon (IFN)-γ, chemokine C-C motif ligand 5 (CCL5) and tumor necrosis factor (TNF)-α were tested in tear samples and sera of keratoconus and control individuals by multiplex immuno-bead assays."
7,gene_disease,"These differences include increased IL-6, and decreased IL-12, TNF-α, IFN-γ, IL-4, IL-13 and CCL5 in keratoconus compared to control tear fluids."
8,gene_disease,"The decreases in IL-12, TNF-α and CCL5 were statistically significant, while the IL-13 decrease was statistically significant in the severe keratoconus group only."
9,gene_disease,"IL-17 could not be detected by multiplex immuno-bead assay, but showed an increase in keratoconus by conventional ELISA on a limited number of pooled tear samples."


In [62]:
# Embed every EBI sentence and store one embedding per row.
doc_embeddings = sif_embedding_wrapper.sentences2vecs(
    EBI_standard_temp["SENTENCE"], embs, words, weight4ind
)
EBI_standard_temp["vector"] = pd.Series([*doc_embeddings])


# True class per EBI row.
EBI_test_ground_truth = dict(zip(EBI_standard_temp.index, EBI_standard_temp['CLASS']))

# Nearest-centroid prediction; a class must beat a cosine similarity of 0.60,
# otherwise the sentence is labelled 'other'.
# NOTE(review): this scores against category_vecs (first-pass centroids), not
# category_vecs_ from the final-model cell -- confirm which one is intended.
EBI_test_predictions = {}

for row_label, ebi_row in EBI_standard_temp.iterrows():
    sentence_vec = ebi_row["vector"].reshape(1, -1)
    best_label, best_sim = 'other', 0.60
    for label, centroid in category_vecs.items():
        similarity = cosine_similarity(sentence_vec, centroid.reshape(1, -1)).flatten()[0]
        if similarity > best_sim:
            best_label, best_sim = label, similarity
    EBI_test_predictions[row_label] = best_label
    

# Reuses get_accuracy_score from the training-set evaluation cell earlier in
# this notebook; keeping a single definition avoids one copy silently
# shadowing another.
# Note: this rebinds score, miss_classified and miss_pred, which earlier cells
# (GAD evaluation, final-model assembly) also use.
score, miss_classified, miss_pred = get_accuracy_score(EBI_test_predictions, EBI_test_ground_truth)
score

  'recall', 'true', average, warn_for)


0.3678353131126745

In [63]:
# Misclassified EBI sentences with their true and predicted class.
# Columns are selected by name rather than by position (0, 1), whose meaning
# depends on how pandas ordered the columns when the frame was built.
# EBI_standard_temp has a default 0..n-1 index, so the collected row labels
# select the same rows as positions did. .copy() makes the new column
# assignment act on an independent frame.
miss_calssified_EBI_test = EBI_standard_temp.loc[miss_classified, ['CLASS', 'SENTENCE']].copy()
miss_calssified_EBI_test['Predicted-CLASS'] = miss_pred
miss_calssified_EBI_test

Unnamed: 0,CLASS,SENTENCE,Predicted-CLASS
1,other,"SLT can cause oral cancer [4], [5] and nicotine addiction [6] and is associated with several other health conditions including oral pain [7], cardiovascular diseases [8], hypertension [9], diabetes [10], loss in bone density [11], and problems during pregnancy and following childbirth [12].",drug_disease
6,other,"Interleukin (IL)-1β, IL-4, IL-6, IL-10, IL-12, IL-13, IL-17, interferon (IFN)-γ, chemokine C-C motif ligand 5 (CCL5) and tumor necrosis factor (TNF)-α were tested in tear samples and sera of keratoconus and control individuals by multiplex immuno-bead assays.",drug_gene
7,gene_disease,"These differences include increased IL-6, and decreased IL-12, TNF-α, IFN-γ, IL-4, IL-13 and CCL5 in keratoconus compared to control tear fluids.",drug_gene
9,gene_disease,"IL-17 could not be detected by multiplex immuno-bead assay, but showed an increase in keratoconus by conventional ELISA on a limited number of pooled tear samples.",other
10,gene_disease,"Our findings confirm increased IL-6, but dispute earlier reports of increased TNF-α, and suggest a cytokine imbalance in keratoconus disrupting corneal homeostasis.",drug_gene
11,gene_disease,"Moreover, an increase in IL-17 suggests tissue degenerative processes at work, contributing to the thinning and weakening of the corneal connective tissue in keratoconus.",drug_disease
12,gene_disease,"Recent studies have suggested pro-inflammatory factors as key to keratoconus pathogenesis based on their findings of elevated interleukin (IL)-6, tumor necrosis factor (TNF)-α and matrix metalloproteinase (MMP)-9 in the tear fluid of keratoconus patients [8], [9].",drug_gene
13,gene_disease,"Despite these initial findings of specific changes in inflammatory cytokines, there are no studies that have examined a range of cytokines to determine whether keratoconus is associated with an imbalance in the repertoire of cytokines that regulate inflammatory and immune responses driven by subsets of T-helper cells, TH1, TH2, and TH17 in the corneal environment.",drug_gene
14,gene_disease,"To begin addressing this question, we quantified TH1 cytokines (IL-12, IFN-γ and TNF-α), TH2 cytokines (IL-4, IL-10 and IL-13), the TH17 representative cytokine IL-17, and other inflammatory cytokines/chemokines (IL-1β, IL-6, and RANTES or CCL5) in tear fluids and serum samples of keratoconus patients and control subjects.",drug_gene
15,gene_disease,"Overall, we found IL-6 levels were 3 fold elevated (based on mean values) in keratoconus samples compared to control subjects (Table 2 and Figure 1), confirming a previous report of increased IL-6 in keratoconus [8].",other
