In [6]:
import codecs
import itertools
import json
import os
from itertools import cycle, islice
from operator import itemgetter

import numpy as np
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.metrics.pairwise import cosine_similarity

import sif_embedding_wrapper
import utils

In [1]:
from gensim.models.keyedvectors import KeyedVectors
# Location of the pre-trained 200-d word2vec binary. Set the OTAR_W2V_MODEL_PATH
# environment variable to point at a local copy; the original author's path is
# kept as the default so existing set-ups keep working.
model_path = os.environ.get('OTAR_W2V_MODEL_PATH', '/home/stirunag/models/model_OTAR_200d-3mc-10it.bin')
model = KeyedVectors.load_word2vec_format(model_path, binary=True)

In [4]:
# Vocabulary of the embedding model as a set, for O(1) membership tests.
# gensim >= 4 renamed `KeyedVectors.index2word` to `index_to_key`; support both.
index2word_set = set(model.index_to_key if hasattr(model, 'index_to_key') else model.index2word)

def avg_feature_vector(sentence, model, num_features, index2word_set):
    """Average the embeddings of the in-vocabulary words of a sentence.

    Parameters
    ----------
    sentence : str
        Text that is tokenised with ``str.split()`` (whitespace).
    model : mapping
        Object indexable by word (``model[word]``) returning that word's vector.
    num_features : int
        Length of the returned vector.
    index2word_set : container
        Words for which ``model`` has a vector; other tokens are ignored.

    Returns
    -------
    numpy.ndarray
        Mean of the matching word vectors, or a float32 zero vector of length
        ``num_features`` when no token of the sentence is in the vocabulary.
    """
    known_vectors = [model[token] for token in sentence.split() if token in index2word_set]
    total = np.zeros((num_features, ), dtype='float32')
    if not known_vectors:
        return total
    for vector in known_vectors:
        total = total + vector
    return total / len(known_vectors)



def WeInVar_feature_vector(sentence, model, num_features, index2word_set):
    """Build an inverse-variance weighted sentence embedding.

    Every in-vocabulary word vector is divided by the variance of its own
    components and the results are summed; the sum is then divided by
    ``sum(1 / variance) + number_of_words_used``.

    Parameters
    ----------
    sentence : str
        Text that is tokenised with ``str.split()`` (whitespace).
    model : mapping
        Object indexable by word (``model[word]``) returning that word's vector.
    num_features : int
        Length of the returned vector.
    index2word_set : container
        Words for which ``model`` has a vector; other tokens are ignored.

    Returns
    -------
    numpy.ndarray
        The weighted embedding, or a float32 zero vector of length
        ``num_features`` when no usable word is found.
    """
    words = sentence.split()
    feature_vec = np.zeros((num_features, ), dtype='float32')
    n_words = 0
    var_combined = 0.0
    for word in words:
        if word not in index2word_set:
            continue
        word_vec = model[word]
        # Compute the variance once per word instead of twice.
        word_var = np.var(word_vec)
        if not word_var > 0:
            # A constant (zero-variance) or NaN vector would make the inverse
            # variance infinite/NaN and poison the whole sentence embedding.
            continue
        n_words += 1
        feature_vec = np.add(feature_vec, np.divide(word_vec, word_var))
        var_combined = var_combined + np.divide(1, word_var)
    if (n_words > 0):
        feature_vec = np.divide(feature_vec, var_combined+n_words)
    return feature_vec



In [7]:
# Dataset directory. os.path.dirname('__file__') is applied to a string literal and
# evaluates to '', so this resolves to '../Datasets' relative to the current working
# directory.
data_path = os.path.abspath(os.path.join(os.path.dirname( '__file__' ), '..', 'Datasets'))+'/'

# Although the files are named .csv they are tab-delimited. They also have encoding
# problems, e.g.
# UnicodeDecodeError: 'utf-8' codec can't decode byte 0xfc in position 2: invalid start byte
# so they are decoded explicitly as UTF-8 before being handed to pandas.


def _read_euadr_tsv(name, relation, errors='strict'):
    """Read one EUADR corpus file and tag every row with its relation class.

    name     -- file name (without '.csv') inside 'EUADR_Corpus_IBIgroup/'.
    relation -- value stored in the new 'CLASS' column.
    errors   -- decoding error policy passed to open().
    """
    path = data_path+'EUADR_Corpus_IBIgroup/'+name+'.csv'
    # Built-in open() in a with-block replaces codecs.open(..., 'rU'): the 'U' mode
    # was removed in Python 3.11 and the handle was never closed. newline='' leaves
    # line-ending handling to pandas, as the byte-oriented codecs reader did.
    with open(path, 'r', encoding='UTF-8', errors=errors, newline='') as handle:
        frame = pd.read_csv(handle, sep='\t', na_filter = False)
    frame['CLASS'] = relation
    return frame


EUADR_drug_target = _read_euadr_tsv('EUADR_drug_target', 'drug_gene')
# Only this file is read with undecodable bytes dropped.
EUADR_target_disease = _read_euadr_tsv('EUADR_target_disease', 'gene_disease', errors='ignore')
EUADR_drug_disease = _read_euadr_tsv('EUADR_drug_disease', 'drug_disease')



In [11]:
# Stack the three EUADR corpora. pd.concat replaces the chained DataFrame.append
# calls (removed in pandas 2.0). reset_index() keeps the old per-file row number as
# an 'index' column, which later positional column selections rely on.
df = pd.concat([EUADR_drug_target, EUADR_target_disease, EUADR_drug_disease]).reset_index()

# Alternative sentence embeddings that were tried (kept for reference):
# doc_embeddings_sif = sif_embedding_wrapper.sentences2vecs(df["SENTENCE"], embs, words, weight4ind)
# df["sif"] = pd.Series(list(doc_embeddings_sif))

# doc_embeddings_avg = avg_feature_vector(df["SENTENCE"], model, 200, index2word_set)
# df["avg"] = pd.Series(list(doc_embeddings_avg))

# One inverse-variance weighted 200-d embedding per sentence.
doc_embeddings_var = df["SENTENCE"].apply(lambda x: WeInVar_feature_vector(x, model, 200, index2word_set))
df["vector"] = pd.Series(list(doc_embeddings_var))

In [12]:
# Gold relation class of every row, keyed by the row's index label.
ground_truth = df['CLASS'].to_dict()

# Distinct relation classes, in order of first appearance in the frame.
categories = df["CLASS"].unique().tolist()
categories    

['drug_gene', 'gene_disease', 'drug_disease']

In [13]:
# Use LDA to get the topics and assign them to classes, so that the
# top-performing sentences of each class can be found.

min_text_length = 80
max_iter = 150
batch_size = 512
learning_offset = 300.
n_topics = len(categories)  # one topic per relation class

docs = df

# Sentences shorter than min_text_length characters are left out of the topic model.
too_short = docs["SENTENCE"].map(len) < min_text_length
unclassifiable = list(docs[too_short].index)
filtered = docs[~docs.index.isin(unclassifiable)]
ids = list(filtered.index)[0:10]

n_features = 50000
tf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.95, min_df=0.1, max_features=n_features)
tf = tf_vectorizer.fit_transform(filtered['SENTENCE'].tolist())

lda = LatentDirichletAllocation(
    n_components=n_topics, max_iter=max_iter, batch_size=batch_size,
    learning_method='online', learning_offset=learning_offset, random_state=0)

# Fit on the TF-IDF matrix, then get the per-document topic distribution.
lda.fit(tf)
doc_topics = lda.transform(tf)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [14]:
# For every LDA topic, collect the documents whose most probable topic it is,
# together with that probability, ordered from most to least confident.
topic_leaders = {"topic_{}".format(i): [] for i in range(n_topics)}
doc_id = filtered.index

for idx, probs in enumerate(doc_topics):
    topic, score = np.argmax(probs), max(probs)
    doc_id = filtered.index[idx]  # row label in the original frame
    leader = {"doc_id": doc_id, "score": score}
    topic_leaders["topic_{}".format(topic)].append(leader)

for i in range(n_topics):
    topic_leaders["topic_{}".format(i)].sort(key=itemgetter('score'), reverse=True)
        
    

# topic_leaders

In [15]:
# Bucket every topic leader under its gold class, and keep as "selected" only
# those whose dominant-topic score is above 0.66.

sentences = {c: [] for c in categories}
selected_sentences = {c: [] for c in categories}
sentences_with_score = {c: [] for c in categories}

for leaders in topic_leaders.values():
    for each_doc in leaders:
        leader_id, leader_score = each_doc['doc_id'], each_doc['score']
        gt = ground_truth[leader_id]
        sentences[gt].append(leader_id)
        sentences_with_score[gt].append(leader_score)
        if leader_score > 0.66:
            selected_sentences[gt].append(leader_id)
    
# selected_sentences

In [16]:
# Class centroid: the element-wise mean of the sentence vectors of the
# selected sentences of each class.
category_vecs = {
    c: np.mean(np.asarray(list(df.loc[df.index.isin(selected_sentences[c]), "vector"])), axis=0)
    for c in categories
}

category_vecs    

{'drug_disease': array([ 8.63343403e-02,  4.09570523e-02,  6.60067871e-02,  5.60073107e-02,
         4.39673215e-02,  5.84295914e-02, -6.87599182e-02, -8.06879252e-03,
         6.37306571e-02, -2.01130390e-01, -6.48503676e-02, -3.32981795e-02,
        -1.13030158e-01,  1.66694090e-01, -1.10361330e-01, -2.93077379e-02,
         7.92240351e-02,  8.15214869e-03,  1.86929293e-02,  2.39240471e-02,
        -8.93710926e-02,  1.40408566e-02, -8.64200965e-02, -6.73879078e-03,
         4.68398333e-02, -5.56215681e-02, -1.19740978e-01,  2.44502388e-02,
         5.04814796e-02,  5.02331629e-02,  1.06763035e-01,  1.03823297e-01,
        -2.56669633e-02, -6.97518839e-03,  2.22598156e-03, -7.46672004e-02,
         6.00506552e-02, -4.06144299e-02,  9.46495235e-02,  9.88385919e-03,
         1.67212216e-04, -1.56298485e-02, -7.34789222e-02,  4.01882362e-03,
        -7.26767704e-02,  5.02941124e-02,  9.82116461e-02,  9.28582773e-02,
        -9.54248086e-02,  9.70574096e-02, -2.36599445e-02,  7.43992552e-

In [17]:
# Predict a class for each selected sentence: the class whose centroid has the
# highest (strictly positive) cosine similarity with the sentence vector,
# or 'Unknown' when no similarity is above 0.

predictions = {}

selected_idx = [j for i in selected_sentences.values() for j in i]
# Set copy for O(1) membership tests; testing against the list made the loop
# below quadratic in the number of rows.
selected_lookup = set(selected_idx)

for idx, row in df.iterrows():
    if idx not in selected_lookup:
        continue
    max_sim = 0
    winner = 'Unknown'
    for j in category_vecs:
        sim = cosine_similarity(row["vector"].reshape(1, -1), category_vecs[j].reshape(1, -1)).flatten()[0]
        if sim > max_sim:
            max_sim = sim
            winner = j
    predictions[idx] = winner
    

In [18]:
def get_accuracy_score(predictions, truth_dict):
    """Score predictions against gold labels and list the mistakes.

    Parameters
    ----------
    predictions : dict
        Maps a row id to its predicted label.
    truth_dict : dict
        Maps a row id to its gold label; must contain every key of
        ``predictions``.

    Returns
    -------
    tuple
        ``(weighted F1 score, ids of misclassified rows, their predicted
        labels as strings)``; the two lists are aligned and follow the
        iteration order of ``predictions``.
    """
    keys = list(predictions)
    preds = [predictions[key] for key in keys]
    labels = [truth_dict[key] for key in keys]

    mistakes = [(key, str(pred)) for key, pred, gold in zip(keys, preds, labels) if pred != gold]
    mis_classified = [key for key, _ in mistakes]
    mis_pred = [pred for _, pred in mistakes]

    return f1_score(labels, preds, average='weighted'), mis_classified, mis_pred


# Weighted F1 on the selected training sentences. Despite its name,
# `miss_classified_df` is the list of misclassified row ids returned by
# get_accuracy_score, not a DataFrame; `miss_pred` holds the matching predictions.
score, miss_classified_df, miss_pred = get_accuracy_score(predictions, ground_truth)
score
# Scores recorded from earlier runs:
# SIF score: 0.86787541612152574
# WeInVar: 0.8720215455345159

0.8720215455345159

In [21]:
# Show full cell contents. None means "no limit" in pandas >= 1.0; the old
# sentinel -1 is rejected by recent pandas and is only used as a fallback for
# versions that accept integers exclusively.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

# Rows that were misclassified, restricted to the columns at positions 12 and 13.
# .copy() makes the slice an independent frame so that adding a column below
# does not trigger SettingWithCopyWarning.
miss_calssified_df = df.iloc[miss_classified_df , [12,13]].copy()
miss_calssified_df['Predicted-CLASS'] = miss_pred
# miss_calssified_df

# Results directory: '../Results' relative to the current working directory
# (os.path.dirname('__file__') is applied to a string literal and yields '').
result_path = os.path.abspath(os.path.join(os.path.dirname( '__file__' ), '..', 'Results'))+'/'
miss_calssified_df.to_csv(result_path+'miss_predictions_train_weinvar.csv')

In [30]:
# Generalisation on the unseen GAD dataset

# Built-in open() in a with-block replaces codecs.open(..., 'rU'): the 'U' mode was
# removed in Python 3.11 and the handle was never closed. Undecodable bytes are
# dropped (errors='ignore'); newline='' leaves line-ending handling to pandas.
gad_file = data_path+'GAD_Corpus_IBIgroup/'+'GAD_Y_N'+'.csv'
with open(gad_file, 'r', encoding='UTF-8', errors='ignore', newline='') as doc_t_d:
    GAD_target_disease = pd.read_csv(doc_t_d, sep='\t', na_filter = False)
GAD_target_disease['CLASS'] = 'gene_disease'


# GAD_target_disease.head(10)
# One inverse-variance weighted 200-d embedding per GAD conclusion sentence.
doc_embeddings_var = GAD_target_disease["GAD_CONCLUSION"].apply(lambda x: WeInVar_feature_vector(x, model, 200, index2word_set))
GAD_target_disease["vector"] = pd.Series(list(doc_embeddings_var))

In [33]:
# Gold label of every GAD row, keyed by row index label.
GAD_test_ground_truth = GAD_target_disease['CLASS'].to_dict()

# Try to predict the label of unknown sentences: pick the class centroid with
# the highest cosine similarity, provided it is above 0.30; otherwise 'unknown'.

GAD_test_predictions = {}

for idx, sentence_vec in GAD_target_disease["vector"].items():
    winner, max_sim = 'unknown', 0.30
    for j, centroid in category_vecs.items():
        sim = cosine_similarity(sentence_vec.reshape(1, -1), centroid.reshape(1, -1)).flatten()[0]
        if sim > max_sim:
            winner, max_sim = j, sim
    GAD_test_predictions[idx] = winner


GAD_score, GAD_miss_classified, GAD_miss_pred = get_accuracy_score(GAD_test_predictions, GAD_test_ground_truth)
# Scores recorded from earlier runs: 0.88071928071928085 (SIF), 0.9307119679328116
GAD_score

  'recall', 'true', average, warn_for)


0.9307119679328116

In [17]:
# Misclassified GAD rows, restricted to the columns at positions 10 and 11.
# .copy() makes the slice an independent frame so that adding a column below
# does not trigger SettingWithCopyWarning.
miss_calssified_GAD_test = GAD_target_disease.iloc[GAD_miss_classified , [10,11]].copy()
miss_calssified_GAD_test['Predicted-CLASS'] = GAD_miss_pred
# miss_calssified_df_test
miss_calssified_GAD_test.to_csv(result_path+'miss_predictions_test.csv')

In [18]:
# Test new sentence

# test_sample = 'This study assessed associations between the CYP4F2 gene and myocardial infarction (MI), using a haplotype-based case-control study of 234 MI patients and 248 controls genotyped for 5 single-nucleotide polymorphisms (rs3093105, rs3093135, rs1558139, rs2108622, rs3093200).'
# test_sample = 'Assessment of 1177 human immunodeficiency virus (HIV) resistance genotypes at an HIV/AIDS clinic showed a decrease in the incidence of the K65R mutation, from 15.2% of isolates during the period 2002-2004 to 2.7% of isolates during the period 2005-2006 (P < .001), despite elevated and stable rates of tenofovir use.'
# test_sample = 'Doxorubicin-induced DNA damage was also specifically abolished by the proteasome inhibitors bortezomib and MG132 and much reduced in top2beta(-/-) mouse embryonic fibroblasts (MEF) compared with TOP2beta(+/+) MEFs, suggesting the involvement of proteasome and DNA topoisomerase IIbeta (Top2beta).'
# test_sample = 'SLC9A6 at Xq26.3 (Gilfillan et al., 2008)X-linked mental retardation'
test_sample = 'DLBCL was identified by a microenvironment gene expression signature and is associated with increased expression of inflammatory mediators, such as multiple components of the T-cell receptor (TCR), molecules associated with T/NK-cell activation and the complement cascade, downstream targets of IFNγ'

# NOTE(review): `embs`, `words` and `weight4ind` are not assigned in any cell
# visible in this notebook -- confirm where the SIF inputs are created.
test_embedding = sif_embedding_wrapper.sentences2vecs([test_sample], embs, words, weight4ind)

# Cosine similarity of the sample's SIF embedding with every class centroid.
sim = {
    j: cosine_similarity(test_embedding.reshape(1, -1), category_vecs[j].reshape(1, -1)).flatten()[0]
    for j in category_vecs
}

sim    
    

{'drug_disease': 0.24992276297691893,
 'drug_gene': 0.72044882673548982,
 'gene_disease': 0.65276057566855705}

In [37]:
# WeinVar Test new sentence

# test_sample = 'This study assessed associations between the CYP4F2 gene and myocardial infarction (MI), using a haplotype-based case-control study of 234 MI patients and 248 controls genotyped for 5 single-nucleotide polymorphisms (rs3093105, rs3093135, rs1558139, rs2108622, rs3093200).'
# test_sample = 'Assessment of 1177 human immunodeficiency virus (HIV) resistance genotypes at an HIV/AIDS clinic showed a decrease in the incidence of the K65R mutation, from 15.2% of isolates during the period 2002-2004 to 2.7% of isolates during the period 2005-2006 (P < .001), despite elevated and stable rates of tenofovir use.'
# test_sample = 'Doxorubicin-induced DNA damage was also specifically abolished by the proteasome inhibitors bortezomib and MG132 and much reduced in top2beta(-/-) mouse embryonic fibroblasts (MEF) compared with TOP2beta(+/+) MEFs, suggesting the involvement of proteasome and DNA topoisomerase IIbeta (Top2beta).'
# test_sample = 'SLC9A6 at Xq26.3 (Gilfillan et al., 2008)X-linked mental retardation'
# test_sample = 'DLBCL was identified by a microenvironment gene expression signature and is associated with increased expression of inflammatory mediators, such as multiple components of the T-cell receptor (TCR), molecules associated with T/NK-cell activation and the complement cascade, downstream targets of IFNγ'

# `test_sample` is the sentence assigned in the preceding SIF test cell; every
# assignment in this cell is commented out.
test_embedding =  WeInVar_feature_vector(test_sample, model, 200, index2word_set)

# Cosine similarity of the sample's WeInVar embedding with every class centroid.
sim = {
    j: cosine_similarity(test_embedding.reshape(1, -1), category_vecs[j].reshape(1, -1)).flatten()[0]
    for j in category_vecs
}

sim    
    

{'drug_disease': 0.4425118, 'drug_gene': 0.7502236, 'gene_disease': 0.49914157}

In [38]:
# Testing on the 30 papers dataset

# JSON is read with an explicit UTF-8 encoding rather than the platform default.
# (`json` is imported with the other modules at the top of the notebook.)
with open(data_path+'EBI Standard/'+'rel_data_bronze.json', encoding='utf-8') as json_file:
    data = json.load(json_file)

# Flatten {key: [ {sent, rel}, ... ]} into one record per sentence.
EBI_standard = [
    {'SENTENCE': each_sent['sent'], 'CLASS': each_sent['rel']}
    for each_point in data
    for each_sent in data[each_point]
]

EBI_standard_temp = pd.DataFrame(EBI_standard)

# Rewrite the relation codes: YGD -> gene_disease, NGD and AMB -> other.
EBI_standard_temp['CLASS'] = EBI_standard_temp['CLASS'].apply(lambda x: x.replace('YGD', 'gene_disease').replace('NGD', 'other').replace('AMB', 'other'))
EBI_standard_temp.to_csv(data_path+'EBI_bronze_standard.csv')

In [45]:
# One inverse-variance weighted 200-d embedding per EBI sentence.
doc_embeddings = EBI_standard_temp["SENTENCE"].apply(lambda x: WeInVar_feature_vector(x, model, 200, index2word_set))
EBI_standard_temp["vector"] = pd.Series(list(doc_embeddings))


# Gold label of every EBI row, keyed by row index label.
EBI_test_ground_truth = EBI_standard_temp['CLASS'].to_dict()

# Try to predict the label of unknown sentences: pick the class centroid with
# the highest cosine similarity, provided it is above 0.30; otherwise 'other'.

EBI_test_predictions = {}

for idx, sentence_vec in EBI_standard_temp["vector"].items():
    winner, max_sim = 'other', 0.30
    for j, centroid in category_vecs.items():
        sim = cosine_similarity(sentence_vec.reshape(1, -1), centroid.reshape(1, -1)).flatten()[0]
        if sim > max_sim:
            winner, max_sim = j, sim
    EBI_test_predictions[idx] = winner


EBI_score, EBI_miss_classified, EBI_miss_pred = get_accuracy_score(EBI_test_predictions, EBI_test_ground_truth)
EBI_score    

  'recall', 'true', average, warn_for)


0.5860732009925559

In [42]:
# Misclassified EBI rows, restricted to the first two columns.
# .copy() makes the slice an independent frame so that adding a column below
# does not trigger SettingWithCopyWarning.
miss_calssified_EBI_test = EBI_standard_temp.iloc[EBI_miss_classified , [0,1]].copy()
miss_calssified_EBI_test['Predicted-CLASS'] = EBI_miss_pred
miss_calssified_EBI_test

Unnamed: 0,CLASS,SENTENCE,Predicted-CLASS
0,other,"As most tobacco control programs in Bangladesh target mainly cigarette or bidi smoking, coordinated programs are needed that will also include SLT use within the tobacco control policy and prevention strategies.",gene_disease
1,other,"SLT can cause oral cancer [4], [5] and nicotine addiction [6] and is associated with several other health conditions including oral pain [7], cardiovascular diseases [8], hypertension [9], diabetes [10], loss in bone density [11], and problems during pregnancy and following childbirth [12].",drug_disease
3,other,This increased likelihood of SLT use is related to the social acceptance of SLT use by the older people and a greater appeal of cigarette among the younger generations who may be taking up smoking instead of SLT use.,drug_disease
4,other,"Use of SLT may provide a supplementation or substitution of nicotine intake for cigarette smokers [31] encouraging their continued use; this scenario could be particularly true in situations in which smoking is not socially acceptable, but SLT use is.",drug_disease
5,other,"As most tobacco control programs target mainly cigarette or bidi smoking, coordinated programs are needed that will also include SLT use within the tobacco control policy and prevention strategies.",gene_disease
6,other,"Interleukin (IL)-1β, IL-4, IL-6, IL-10, IL-12, IL-13, IL-17, interferon (IFN)-γ, chemokine C-C motif ligand 5 (CCL5) and tumor necrosis factor (TNF)-α were tested in tear samples and sera of keratoconus and control individuals by multiplex immuno-bead assays.",gene_disease
8,gene_disease,"The decreases in IL-12, TNF-α and CCL5 were statistically significant, while the IL-13 decrease was statistically significant in the severe keratoconus group only.",drug_disease
9,gene_disease,"IL-17 could not be detected by multiplex immuno-bead assay, but showed an increase in keratoconus by conventional ELISA on a limited number of pooled tear samples.",other
17,gene_disease,"The decrease in TNF-α seen here, contradicts an earlier observation of increased TNF-α in keratoconus.",other
21,gene_disease,The multiplex immuno-bead assay could not detect IL-17 in control or keratoconus tear samples.,other


In [46]:
# Get PolySearch dataset (tab-separated; column names are supplied via ColNames)
ColNames = ['entity_1', 'entity_2', 'Association', 'PMID','Sentence']

# Built-in open() in a with-block replaces codecs.open(..., 'rU'): the 'U' mode was
# removed in Python 3.11 and the handle was never closed. newline='' leaves
# line-ending handling to pandas.
poly_file = data_path+'PolySearch/'+'p1_disease_gene_testset.simple'+'.tsv'
with open(poly_file, 'r', encoding='UTF-8', newline='') as Poly_doc_d_t:
    Poly_target_disease = pd.read_csv(Poly_doc_d_t, sep='\t', na_filter = False, names = ColNames)
Poly_target_disease['CLASS'] = 'gene_disease'

# One inverse-variance weighted 200-d embedding per PolySearch text.
doc_embeddings = Poly_target_disease["Sentence"].apply(lambda x: WeInVar_feature_vector(x, model, 200, index2word_set))
Poly_target_disease["vector"] = pd.Series(list(doc_embeddings))

In [47]:
# Gold label of every PolySearch row, keyed by row index label.
Poly_test_ground_truth = Poly_target_disease['CLASS'].to_dict()

# Try to predict the label of unknown sentences: pick the class centroid with
# the highest cosine similarity, provided it is above 0.30; otherwise 'unknown'.

Poly_test_predictions = {}

for idx, sentence_vec in Poly_target_disease["vector"].items():
    winner, max_sim = 'unknown', 0.30
    for j, centroid in category_vecs.items():
        sim = cosine_similarity(sentence_vec.reshape(1, -1), centroid.reshape(1, -1)).flatten()[0]
        if sim > max_sim:
            winner, max_sim = j, sim
    Poly_test_predictions[idx] = winner

In [49]:
def get_accuracy_score(predictions, truth_dict):
    """Return the weighted F1 score plus the misclassified ids and predictions.

    NOTE: this redefines the function of the same name from the earlier
    evaluation cell; both compute the same result.

    predictions -- dict mapping a row id to its predicted label.
    truth_dict  -- dict mapping a row id to its gold label; must contain
                   every key of ``predictions``.

    Returns ``(f1, mis_classified, mis_pred)`` where the two lists are aligned
    and ``mis_pred`` holds the wrong predictions converted to ``str``.
    """
    preds, labels = [], []
    mis_classified, mis_pred = [], []

    for row_id, predicted in predictions.items():
        expected = truth_dict[row_id]
        preds.append(predicted)
        labels.append(expected)
        if predicted != expected:
            mis_classified.append(row_id)
            mis_pred.append(str(predicted))

    weighted_f1 = f1_score(labels, preds, average='weighted')
    return weighted_f1, mis_classified, mis_pred


# Weighted F1 on the PolySearch set. NOTE: this rebinds `score` and `miss_pred`,
# which an earlier cell assigned from the EUADR training evaluation.
score, miss_classified, miss_pred = get_accuracy_score(Poly_test_predictions, Poly_test_ground_truth)
score    # 0.84768211920529801


  'recall', 'true', average, warn_for)


0.8136363636363636

In [51]:
# Misclassified PolySearch rows with their text and gold class.
# .copy() makes the selection an independent frame so that adding a column
# below does not trigger SettingWithCopyWarning.
miss_calssified_Poly_test = Poly_target_disease.loc[miss_classified,['Sentence', 'CLASS']].copy()
miss_calssified_Poly_test['Predicted-CLASS'] = miss_pred
miss_calssified_Poly_test

Unnamed: 0,Sentence,CLASS,Predicted-CLASS
0,"Manning K, Al-Dhalimy M, Finegold M, Grompe M: In vivo suppressor mutations correct a murine model of hereditary tyrosinemia type I. Proc Natl Acad Sci U S A. 1999 Oct 12;96(21):11928-33. Hereditary tyrosinemia type I and alkaptonuria are disorders of tyrosine catabolism caused by deficiency of fumarylacetoacetate hydrolase (FAH) and homogentisic acid dioxygenase (HGD), respectively. Tyrosinemia is a severe childhood disease that affects the liver and kidneys, but alkaptonuria is a more benign adult disorder in comparison. Because HGD is upstream of FAH in the tyrosine pathway, mice doubly mutant in both enzymes were found to be protected from the liver and renal damage of tyrosinemia as hypothesized. Mice mutant at the tyrosinemic locus but heterozygous for alkaptonuria spontaneously developed clonal nodules of functionally normal hepatocytes that were able to rescue the livers of some mice with this genotype. This phenotypic rescue was a result of an inactivating mutation of the wild-type homogentisic acid dioxygenase gene, thus presenting an example of an in vivo suppressor mutation in a mammalian model.",gene_disease,unknown
1,"Suwannarat P, O'Brien K, Perry MB, Sebring N, Bernardini I, Kaiser-Kupfer MI, Rubin BI, Tsilou E, Gerber LH, Gahl WA: Use of nitisinone in patients with alkaptonuria. Metabolism. 2005 Jun;54(6):719-28. Alkaptonuria, a rare autosomal recessive disorder caused by mutations in the HGD gene and deficiency of homogentisate 1,2 dioxygenase, is characterized by ochronosis, arthritis, and daily excretion of gram quantities of homogentisic acid (HGA). Nitisinone, an inhibitor of the enzyme 4-hydroxyphenylpyruvate dioxygenase, can drastically reduce urinary excretion of HGA in individuals with alkaptonuria. We investigated the safety and the HGA-depleting efficacy of nitisinone in an open-label, single-center study of 9 alkaptonuria patients (5 women, 4 men; 35-69 years of age) over the course of 3 to 4 months. Each patient received nitisinone in incremental doses, 0.35 mg bid followed by 1.05 mg bid, and remained on this dosage and a regular diet for 3 months. Nitisinone reduced urinary HGA levels from an average of 4.0 +/- 1.8 (SD) g/day to 0.2 +/- 0.2 g/day ( P tyrosine concentration, initially 68 +/- 18 mmicro mol/L, rose to 760 +/- 181 micro mol/L ( P protein-restricted diet (40 g/day), and their mean plasma tyrosine level fell from 755 +/- 167 to 603 +/- 114 mu mol/L. Six of the 7 patients who received nitisinone for more than 1 week reported decreased pain in their affected joints. Weekly ophthalmologic examinations showed no signs of corneal toxicity. Adverse events included the passing of kidney stones, the recognition of symptoms related to aortic stenosis, and elevation of liver transaminase levels. We conclude that low-dose nitisinone effectively reduced urinary HGA levels in patients with alkaptonuria. Future long-term clinical trials are planned to determine the benefits of nitisinone in preventing joint deterioration and providing pain relief, and its long-term side effects.",gene_disease,drug_disease
2,"Gaucher A, Netter P, Fuare G, Raffoux C, Chanson B, Baumgartner J, Psurel J, Streiff F: [HLA-B27 antigen and alkaptonuria] . Rev Rhum Mal Osteoartic. 1977 Apr;44(4):273-7. Study of urinary homogentisic acid and a determinantion of group HLA were carried out for 36 members of a family spread over three generations with three cases of ochronotic rheumatism in the second generation. Alkaptonuria was discovered in seven other subjects, six of them members of the third generation: urinary elimination was poor, less than 0.60 g/24 hours. There is a certain degree of consanguinity in the family studied here and these findings do not therefore rule out a recessive autosomal transmission of the alkaptonuria. They do however lead to the consideration that alkaptonuria may sometimes be found in heterozygotic subjects. A genetic relationship between HLA complex and alkaptonuria can only be claimed with difficulty from this familial study, but the high frequency of B 27 antigen (29 out of 36 members carring it) leaves room for the hypothesis that the B 27 gene, or more precisely a gene associated with the B 27 gene, plays a part in the development of ochronotic rheumatism.",gene_disease,drug_disease
3,"Forslind K, Wollheim FA, Akesson B, Rydholm U: Alkaptonuria and ochronosis in three siblings. Clin Exp Rheumatol. 1988 Jul-Sep;6(3):289-92. Ascorbic acid treatment monitored by urinary HGA excretion. Patients with alkaptonuria lack homogentisate 1,2-dioxygenase leading to retention of homogentistic acid (HGA) in body fluids and eventually to tissue deposition of oxidation products, giving rise to the clinical picture of ochronosis. Ascorbic acid is a known inhibitor of the enzyme which catalyses the oxidation of homogentisic acid (HGA) to the polymer with affinity for collagen and was used in the treatment of three siblings with alkaptonuria. Ascorbic acid 500 mg bid was administered for 12 months. Two of the siblings tolerated the treatment, and in one the symptoms improved, whereas in the other they worsened. Plasma and urinary levels of HGA were monitored with a new HPLC method. Ascorbic acid is not effective in the treatment of symptomatic ochronosis.",gene_disease,drug_disease
5,"Zhou H, Monack DM, Kayagaki N, Wertz I, Yin J, Wolf B, Dixit VM: Yersinia virulence factor YopJ acts as a deubiquitinase to inhibit NF-kappa B activation. J Exp Med. 2005 Nov 21;202(10):1327-32. The bacterial pathogens of the genus Yersinia, the causative agents of plague, septicemia, and gastrointestinal syndromes, use a type III secretion system to inject virulence factors into host target cells. One virulence factor, YopJ, is essential for the death of infected macrophages and can block host proinflammatory responses by inhibiting both the nuclear factor kappaB (NF-kappaB) and mitogen-activated protein kinase pathways, which might be important for evasion of the host immune response and aid in establishing a systemic infection. Here, we show that YopJ is a promiscuous deubiquitinating enzyme that negatively regulates signaling by removing ubiquitin moieties from critical proteins, such as TRAF2, TRAF6, and IkappaBalpha. In contrast to the cylindromatosis tumor suppressor CYLD, which attenuates NF-kappaB signaling by selectively removing K63-linked polyubiquitin chains that activate IkappaB kinase, YopJ also cleaves K48-linked chains and thereby inhibits proteasomal degradation of IkappaBalpha. YopJ, but not a catalytically inactive YopJ mutant, promoted deubiquitination of cellular proteins and cleaved both K48- and K63-linked polyubiquitin. Moreover, an in vitro assay was established to demonstrate directly the deubiquitinating activity of purified YopJ.",gene_disease,drug_gene
7,"Kovalenko A, Chable-Bessia C, Cantarella G, Israel A, Wallach D, Courtois G: The tumour suppressor CYLD negatively regulates NF-kappaB signalling by deubiquitination. Nature. 2003 Aug 14;424(6950):801-5. NF-kappaB transcription factors have key roles in inflammation, immune response, oncogenesis and protection against apoptosis. In most cells, these factors are kept inactive in the cytoplasm through association with IkappaB inhibitors. After stimulation by various reagents, IkappaB is phosphorylated by the IkappaB kinase (IKK) complex and degraded by the proteasome, allowing NF-kappaB to translocate to the nucleus and activate its target genes. Here we report that CYLD, a tumour suppressor that is mutated in familial cylindromatosis, interacts with NEMO, the regulatory subunit of IKK. CYLD also interacts directly with tumour-necrosis factor receptor (TNFR)-associated factor 2 (TRAF2), an adaptor molecule involved in signalling by members of the family of TNF/nerve growth factor receptors. CYLD has deubiquitinating activity that is directed towards non-K48-linked polyubiquitin chains, and negatively modulates TRAF-mediated activation of IKK, strengthening the notion that ubiquitination is involved in IKK activation by TRAFs and suggesting that CYLD functions in this process. Truncations of CYLD found in cylindromatosis result in reduced enzymatic activity, indicating a link between impaired deubiquitination of CYLD substrates and human pathophysiology.",gene_disease,drug_gene
8,"Zhou H, Monack DM, Kayagaki N, Wertz I, Yin J, Wolf B, Dixit VM: Yersinia virulence factor YopJ acts as a deubiquitinase to inhibit NF-kappa B activation. J Exp Med. 2005 Nov 21;202(10):1327-32. The bacterial pathogens of the genus Yersinia, the causative agents of plague, septicemia, and gastrointestinal syndromes, use a type III secretion system to inject virulence factors into host target cells. One virulence factor, YopJ, is essential for the death of infected macrophages and can block host proinflammatory responses by inhibiting both the nuclear factor kappaB (NF-kappaB) and mitogen-activated protein kinase pathways, which might be important for evasion of the host immune response and aid in establishing a systemic infection. Here, we show that YopJ is a promiscuous deubiquitinating enzyme that negatively regulates signaling by removing ubiquitin moieties from critical proteins, such as TRAF2, TRAF6, and IkappaBalpha. In contrast to the cylindromatosis tumor suppressor CYLD, which attenuates NF-kappaB signaling by selectively removing K63-linked polyubiquitin chains that activate IkappaB kinase, YopJ also cleaves K48-linked chains and thereby inhibits proteasomal degradation of IkappaBalpha. YopJ, but not a catalytically inactive YopJ mutant, promoted deubiquitination of cellular proteins and cleaved both K48- and K63-linked polyubiquitin. Moreover, an in vitro assay was established to demonstrate directly the deubiquitinating activity of purified YopJ.",gene_disease,drug_gene
11,"Stegmeier F, Sowa ME, Nalepa G, Gygi SP, Harper JW, Elledge SJ: The tumor suppressor CYLD regulates entry into mitosis. Proc Natl Acad Sci U S A. 2007 May 22;104(21):8869-74. Epub 2007 May 10. Mutations in the cylindromatosis (CYLD) gene cause benign tumors of skin appendages, referred to as cylindromas. The CYLD gene encodes a deubiquitinating enzyme that removes Lys-63-linked ubiquitin chains from I kappa B kinase signaling components and thereby inhibits NF-kappaB pathway activation. The dysregulation of NF-kappaB activity has been proposed to promote cell transformation in part by increasing apoptosis resistance, but it is not clear whether this is CYLD's only or predominant tumor-suppressing function. Here, we show that CYLD is also required for timely entry into mitosis. Consistent with a cell-cycle regulatory function, CYLD localizes to microtubules in interphase and the midbody during telophase, and its protein levels decrease as cells exit from mitosis. We identified the protein kinase Plk1 as a potential target of CYLD in the regulation of mitotic entry, based on their physical interaction and similar loss-of-function and overexpression phenotypes. Our findings raise the possibility that, as with other genes regulating tumorigenesis, CYLD has not only tumor-suppressing (apoptosis regulation) but also tumor-promoting activities (enhancer of mitotic entry). We propose that this additional function of CYLD could provide an explanation for the benign nature of most cylindroma lesions.",gene_disease,drug_gene
17,"Wall NR, Shi Y: Small RNA: can RNA interference be exploited for therapy? . Lancet. 2003 Oct 25;362(9393):1401-3. CONTEXT: RNA interference (RNAi) is the sequence-specific gene-silencing induced by double-stranded RNA (dsRNA), and gives information about gene function quickly, easily, and inexpensively. The use of RNAi for genetic-based therapies is widely studied, especially in viral infections, cancers, and inherited genetic disorders. RNAi has been used to make tissue-specific knockdown mice for studying gene function in a whole animal. Combined with genomics data, RNAi-directed gene-silencing could allow functional determination of any gene expressed in a cell or pathway. The term RNAi came from the discovery that the injection of dsRNAs into Caenorhabditis elegans interferes with the expression of specific genes containing a complementary region to the delivered dsRNA. Although stalled for a time by the non-gene-specific interferon response elicited by dsRNA molecules longer than about 30 nucleotides in mammalian cells, Tom Tuschl's group found that transfection of synthetic 21-nucleotide small-interfering RNA (siRNA) duplexes were highly selective and sequence-specific inhibitors of endogenous genes. STARTING POINT: siRNA expression has been studied with siRNA from plasmid and viral vectors that efficiently deliver siRNAs into both dividing and non-dividing cells, stem cells, zygotes, and their differentiated progeny. A collection of RNA interference vectors that suppress 50 human de-ubiquitinating enzymes allowed Thijn Brummelkamp and colleagues to study this gene family and to identify de-ubiquitinating enzymes in cancer-relevant pathways (Nature 2003; 424: 797-801). These researchers found that the familial cylindromatosis tumour suppressor gene (CYLD), previously of unknown function, could enhance the activation of the transcription factor NF-kappaB, leading to increased resistance to apoptosis. 
They have now started to investigate the use of CYLD inhibitors in clinical trials. WHERE NEXT: The ability to efficiently and stably produce and deliver sufficient amounts of siRNA to the proper target tissues require refinement before this new technology can be tried clinically. Initial in-vivo studies reported effective transgene suppression in adult mice by chemically synthesised siRNAs. More recently many researchers have used plasmid and viral vectors for transcription of short-hairpin RNAs, both in vitro and in vivo. With these expression systems, gene expression was more stably inhibited than with the transient knockdown recorded with chemically synthesised siRNA. Human trials exploiting these latest findings are likely to soon follow.",gene_disease,drug_gene
18,"Kovalenko A, Chable-Bessia C, Cantarella G, Israel A, Wallach D, Courtois G: The tumour suppressor CYLD negatively regulates NF-kappaB signalling by deubiquitination. Nature. 2003 Aug 14;424(6950):801-5. NF-kappaB transcription factors have key roles in inflammation, immune response, oncogenesis and protection against apoptosis. In most cells, these factors are kept inactive in the cytoplasm through association with IkappaB inhibitors. After stimulation by various reagents, IkappaB is phosphorylated by the IkappaB kinase (IKK) complex and degraded by the proteasome, allowing NF-kappaB to translocate to the nucleus and activate its target genes. Here we report that CYLD, a tumour suppressor that is mutated in familial cylindromatosis, interacts with NEMO, the regulatory subunit of IKK. CYLD also interacts directly with tumour-necrosis factor receptor (TNFR)-associated factor 2 (TRAF2), an adaptor molecule involved in signalling by members of the family of TNF/nerve growth factor receptors. CYLD has deubiquitinating activity that is directed towards non-K48-linked polyubiquitin chains, and negatively modulates TRAF-mediated activation of IKK, strengthening the notion that ubiquitination is involved in IKK activation by TRAFs and suggesting that CYLD functions in this process. Truncations of CYLD found in cylindromatosis result in reduced enzymatic activity, indicating a link between impaired deubiquitination of CYLD substrates and human pathophysiology.",gene_disease,drug_gene
