In [1]:
from pykeen.triples import TriplesFactory

## Checking out training data

In [2]:
training = TriplesFactory(path="kg/train.hrt.txt")

In [3]:
# NOTE(review): despite the name, this is the label -> id mapping taken from
# `entity_to_id`, i.e. it appears to cover every entity (heads and tails share
# one id space), not only heads — confirm and consider renaming.
head_to_id = training.entity_to_id

In [4]:
relation_to_id = training.relation_to_id

In [5]:
len(head_to_id)

133369

In [6]:
len(relation_to_id)

4

In [7]:
training.get_most_frequent_relations(n=4)

{'GENE_DISEASE_ot_genetic_association',
 'GENE_DISEASE_ot_rna_expression',
 'VARIANT_DISEASE_associated',
 'VARIANT_GENE_variant'}

In [8]:
training.num_triples

398964

In [9]:
# Training triples as a DataFrame, used below for per-relation counts.
# NOTE(review): this is the *training* split; the other splits are named
# `valid_df` / `test_df`, so `df` here means "train_df".
df = training.tensor_to_df(training.mapped_triples)

In [10]:
df.relation_label.value_counts()

VARIANT_DISEASE_associated             127074
GENE_DISEASE_ot_rna_expression         123668
VARIANT_GENE_variant                    84356
GENE_DISEASE_ot_genetic_association     63866
Name: relation_label, dtype: int64

The relation classes in the training data are imbalanced, which is to be expected. However, handling this imbalance explicitly might help improve model accuracy.

## Checking out validation and test data

In [12]:
# Build the validation and test factories on top of the training vocabularies
# so that entity/relation ids are consistent across all three splits.
# Triples that mention entities or relations unseen in training are dropped
# by the mapping (see the warning printed below this cell).
valid = TriplesFactory(
    path="kg/valid.hrt.txt",
    entity_to_id=training.entity_to_id,
    relation_to_id=training.relation_to_id,
)
testing = TriplesFactory(
    path="kg/test.hrt.txt",
    entity_to_id=training.entity_to_id,
    relation_to_id=training.relation_to_id,
)

You're trying to map triples with 4465 entities and 0 relations that are not in the training set. These triples will be excluded from the mapping.
In total 4460 from 49755 triples were filtered out
You're trying to map triples with 4453 entities and 0 relations that are not in the training set. These triples will be excluded from the mapping.
In total 4451 from 49815 triples were filtered out


In [13]:
valid.num_entities

39526

In [14]:
valid.num_triples

45295

In [15]:
valid_df = valid.tensor_to_df(valid.mapped_triples)

In [17]:
valid_df.head()

Unnamed: 0,head_id,head_label,relation_id,relation_label,tail_id,tail_label
0,47,A2M,1,GENE_DISEASE_ot_rna_expression,37505,hepatitis_C_virus_induced_hepatocellular_carci...
1,47,A2M,1,GENE_DISEASE_ot_rna_expression,37663,lung_adenocarcinoma
2,52,A4GALT,0,GENE_DISEASE_ot_genetic_association,37566,intelligence
3,57,AADAC,1,GENE_DISEASE_ot_rna_expression,37003,aldosterone-producing_adenoma
4,61,AADACP1,1,GENE_DISEASE_ot_rna_expression,133208,squamous_cell_lung_carcinoma


In [18]:
valid_df.relation_label.value_counts()

GENE_DISEASE_ot_rna_expression         14441
VARIANT_DISEASE_associated             13983
VARIANT_GENE_variant                    8855
GENE_DISEASE_ot_genetic_association     8016
Name: relation_label, dtype: int64

In [20]:
testing.num_entities

39624

In [21]:
testing.num_triples

45364

In [22]:
test_df = testing.tensor_to_df(testing.mapped_triples)

In [23]:
test_df.head()

Unnamed: 0,head_id,head_label,relation_id,relation_label,tail_id,tail_label
0,46,A1CF,1,GENE_DISEASE_ot_rna_expression,37130,breast_carcinoma
1,47,A2M,0,GENE_DISEASE_ot_genetic_association,36975,adolescent_idiopathic_scoliosis
2,47,A2M,1,GENE_DISEASE_ot_rna_expression,20255,Malignant_Mesothelioma
3,49,A2ML1,1,GENE_DISEASE_ot_rna_expression,37369,esophageal_squamous_cell_carcinoma
4,52,A4GALT,0,GENE_DISEASE_ot_genetic_association,37492,hematocrit


In [24]:
test_df.relation_label.value_counts()

GENE_DISEASE_ot_rna_expression         14428
VARIANT_DISEASE_associated             14301
VARIANT_GENE_variant                    8800
GENE_DISEASE_ot_genetic_association     7835
Name: relation_label, dtype: int64

In [42]:
def get_label_ditribution(df_shape, value_counts, label):
    """Convert absolute label counts into rounded fractions of a total.

    Args:
        df_shape: Total number of rows the counts are divided by.
        value_counts: Mapping-like object of ``label value -> count``
            (anything ``dict()`` accepts, e.g. a ``value_counts()`` result).
        label: Prefix prepended to every key of the result (``"<label>_<value>"``).

    Returns:
        dict mapping ``"<label>_<value>"`` to ``count / df_shape`` rounded to
        4 decimal places, in the iteration order of ``value_counts``.
    """
    counts_by_value = dict(value_counts)
    fractions = {}
    for value, count in counts_by_value.items():
        fractions[f"{label}_{value}"] = round(count / df_shape, 4)
    return fractions

In [43]:
get_label_ditribution(test_df.shape[0], test_df.relation_label.value_counts(), "test")

{'test_GENE_DISEASE_ot_rna_expression': 0.318,
 'test_VARIANT_DISEASE_associated': 0.3152,
 'test_VARIANT_GENE_variant': 0.194,
 'test_GENE_DISEASE_ot_genetic_association': 0.1727}

In [44]:
get_label_ditribution(valid_df.shape[0], valid_df.relation_label.value_counts(), "valid")

{'valid_GENE_DISEASE_ot_rna_expression': 0.3188,
 'valid_VARIANT_DISEASE_associated': 0.3087,
 'valid_VARIANT_GENE_variant': 0.1955,
 'valid_GENE_DISEASE_ot_genetic_association': 0.177}

In [46]:
get_label_ditribution(df.shape[0], df.relation_label.value_counts(), "train")

{'train_VARIANT_DISEASE_associated': 0.3185,
 'train_GENE_DISEASE_ot_rna_expression': 0.31,
 'train_VARIANT_GENE_variant': 0.2114,
 'train_GENE_DISEASE_ot_genetic_association': 0.1601}

- The relation distributions of the validation and test sets are similar, which indicates that the validation error should closely estimate the generalization error.
- The shares of the two biggest classes differ only slightly between the training and validation sets; if required, a stratified train-test split could help align them further.

#### Running the code and training the model using the config given in the base model

In [1]:
from model_training import KnowledgeGraphModel

In [2]:
kg = KnowledgeGraphModel(version_code=False, version_data=False)

  and should_run_async(code)
You're trying to map triples with 4465 entities and 0 relations that are not in the training set. These triples will be excluded from the mapping.
In total 4460 from 49755 triples were filtered out
You're trying to map triples with 4453 entities and 0 relations that are not in the training set. These triples will be excluded from the mapping.
In total 4451 from 49815 triples were filtered out


In [3]:
kg()

In [4]:
# Spot-check predictions: for each label passed as `tail`, print what
# `kg.predict` returns (per the output below, a table with columns
# head_id, head_label, score, in_training).
# NOTE(review): these labels look like gene symbols, which appear as *heads*
# of GENE_DISEASE_* triples in the previews above — confirm that querying
# them as tails (and without a relation) is intended.
tail_list = ['PHYHIPL', 'TTC9', 'BRCA1']
for tail in tail_list:
    print(f"{tail}: \n {kg.predict(tail=tail)}\n")

PHYHIPL: 
         head_id                 head_label     score  in_training
6168       6168                 AP000365.1  0.000217        False
54300     54300                rs121908549  0.000206        False
127808   127808                rs879255637  0.000206        False
35424     35424                      UQCC2  0.000203        False
37367     37367  erythropoetin_measurement  0.000173        False

TTC9: 
         head_id   head_label     score  in_training
62100     62100  rs139083043  0.000167        False
95731     95731  rs561736395  0.000135        False
72141     72141  rs199472887  0.000130        False
127917   127917  rs886037613  0.000111        False
110062   110062   rs73745348  0.000109        False

BRCA1: 
         head_id    head_label     score  in_training
7894       7894         BRCA1  0.000074        False
70989     70989   rs193302891  0.000058        False
54300     54300   rs121908549  0.000058        False
103113   103113    rs63750610  0.000054        Fal