In [None]:
import pandas as pd
train = pd.read_csv("data/train_og.csv")
test = pd.read_csv("data/test_og.csv")
train.columns

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Keeping important columns only

In [None]:
cols = ["Drug_high_status", "Disease_of_highest_status", "Drug_Status",
              "GENENAME", "FUNCTION", "BIOCLASS", "SEQUENCE", "Disease"]
X = train.loc[:, cols]
y = train.loc[:, "Target_Status"]
test = test.loc[:, cols]

# Ordinal Encoding on Drug_high_status and Drug_status

In [None]:
import pandas as pd

# Define the mapping for the statuses including "Registered"
def commoniser(text):
    if text in ['Withdrawn from market', 'Discontinued in Phase 4', 'Phase 4', 'Phase 4 Trial']:
        return 'Phase 4'
    elif text in ['Approved (orphan drug)', 'approved', 'NDA filed']:
        return 'Approved'
    elif text in ['Phase 3', 'Discontinued in Phase 3',]:
        return 'Phase 3'
    elif text in ['Phase 2', 'Phase 2 Trial', 'Phase 2a', 'Phase 2b', 'Phase 2/3',]:
     return 'Phase 2'
    elif text in ['Discontinued in Phase 2', 'Discontinued in Phase 2a', 'Discontinued in Phase 2b']:
     return 'Phase 2 Disc'
    elif text in ['Phase 1', 'Phase 1 Trial', 'Phase 1/2',]:
        return 'Phase 1'
    elif text in ['Discontinued in Phase 1', 'Discontinued in Phase 1/2',]:
        return 'Phase 1 Disc'
    elif text in ['Investigative', 'Preclinical', 'Clinical trial', 'Terminated', "Application submitted"]:
        return 'Pre-phase 1'
    elif text in ['Discontinued in Preregistration', 'Patented', 'Registered']:
        return 'pre-pre-fail'
    else:
       return text

def customEncoder(text):
    if text == 'Phase 4':
        return 8 
    elif text == 'Approved':
        return 7 
    elif text == 'Phase 3':
        return 6 
    elif text == "Phase 2":
        return 5 
    elif text == 'Phase 2 Disc':
        return 4 
    elif text == 'Phase 1':
        return 3 
    elif text == "Phase 1 Disc":
        return 2 
    elif text == "Pre-phase 1":
        return 1 
    elif text == 'pre-pre-fail':
        return 0 

In [None]:
X["Drug_high_status"] = X["Drug_high_status"].apply(commoniser)
X["Drug_Status"] = X["Drug_Status"].apply(commoniser)
test["Drug_high_status"] = test["Drug_high_status"].apply(commoniser)
test["Drug_Status"] = test["Drug_Status"].apply(commoniser)

In [None]:
X["Drug_high_status"] = X["Drug_high_status"].apply(customEncoder)
X["Drug_Status"] = X["Drug_Status"].apply(customEncoder)
test["Drug_high_status"] = test["Drug_high_status"].apply(customEncoder)
test["Drug_Status"] = test["Drug_Status"].apply(customEncoder)

# Preprocessing BIOCLASS and FUNCTION for Embeddings 
## (we eventually did make use of NLP, but not the greatest results)

In [None]:
import spacy
nlp = spacy.load('en_core_web_sm')

def preprocess_text(text):
    doc = nlp(text.lower())
    tokens = [token.lemma_ for token in doc if not token.is_stop and token.is_alpha]
    return ' '.join(tokens)

In [None]:
X["BIOCLASS"] = X["BIOCLASS"].apply(preprocess_text)
X["FUNCTION"] = X["FUNCTION"].apply(preprocess_text)
test["BIOCLASS"] = test["BIOCLASS"].apply(preprocess_text)
test["FUNCTION"] = test["FUNCTION"].apply(preprocess_text)

# Converting sequence

In [None]:
acids = {'C', 'G', 'S', 'H', 'V', 'W', 'L', 'T', 'P', 'F', 'Y', 'R', 'N', 'I', 'D', 'E', 'Q', 'A', 'M', 'K'}
def sequence_converter(text):
    val = 0
    for letter in acids:
        val += text.count(letter) * ord(letter)
        return val

In [None]:
X["Sequence_encoded"] = X["SEQUENCE"].apply(sequence_converter)
test["Sequence_encoded"] = test["SEQUENCE"].apply(sequence_converter)

In [None]:
X["SEQUENCE"] = X["SEQUENCE"].apply(len)
test["SEQUENCE"] = test["SEQUENCE"].apply(len)

In [2]:
X = X.join(X["DRUGNAME"])
test_again = pd.read_csv("test.csv")
test = test.join(test_again["DRUGNAME"])

Unnamed: 0,Drug_high_status,Drug_Status,SEQUENCE,Target_Status,sequence_encode,FUNCTION_encoded,BIOCLASS_encoded,Disease_encoded,Disease_of_highest_status_encoded,GENENAME_encoded,DRUGNAME_encoded,DRUGNAME
0,7,7,2221,Terminated,2436,6.760199,6.031435,5.325765,6.517095,5.807966,5.936541,Solifenacin
1,1,1,599,Approved,870,6.343018,5.68396,6.311307,6.499673,5.533521,5.773524,AM-643


In [None]:
import torch
print(torch.cuda.is_available())  # Should return True if GPU is available
print(torch.cuda.get_device_name(0))  #

True
Tesla T4


# 1. Embeddings for the FUNCTION column

In [None]:
print(type(X["FUNCTION"].unique()))
print(type(X["FUNCTION"]))

<class 'numpy.ndarray'>
<class 'pandas.core.series.Series'>


#  Embeddings for the columns

In [None]:
from transformers import AutoTokenizer, AutoModel
import numpy as np

def generate_embeddings(col):
  unique_functions = train[col].unique()

  tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
  model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased')
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  model = model.to(device)

  embeddings_dict = {}
  for function in unique_functions:
      inputs = tokenizer(function, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)

      with torch.no_grad():
          outputs = model(**inputs)

      embedding = outputs.last_hidden_state.mean(dim=1).cpu().numpy()  # Move to CPU for storage

      embeddings_dict[function] = embedding[0]  # Save the embedding vector
  return embeddings_dict

def fix_test_missing(embeddings_dict):
  print(len(embeddings_dict))
  single_numbers = {key: np.mean(value) * 1000 for key, value in embeddings_dict.items()}
  not_in_train = np.mean(list(single_numbers.values()))
  print(len(single_numbers))
  print(not_in_train)
  return single_numbers, not_in_train

In [None]:
for col in ["BIOCLASS", "FUNCTION", "Disease", "Disease_of_highest_status", "GENENAME", "DRUGNAME"]:
  bioclass_embeddings = generate_embeddings(col)
  print(bioclass_embeddings.keys())
  bioclass_dict, not_in_train = fix_test_missing(bioclass_embeddings)

  def encode_test(text):
    if text in bioclass_dict.keys():
      return bioclass_dict[text]
    else:
      return not_in_train

  train[f"{col}_encoded"] = train[col].map(bioclass_dict)
  test[f"{col}_encoded"] = test[col].apply(encode_test)

589




dict_keys(['Genetic cardiac arrhythmia', 'Rheumatoid arthritis', 'Renal cell carcinoma', 'Hyper-lipoproteinaemia', 'Oesophageal/gastroduodenal disorder', 'Bladder cancer', 'Acute diabete complication', 'Encephalopathy', 'Pain', 'Attention deficit hyperactivity disorder', 'Allergic/hypersensitivity disorder', 'Lung cancer', 'Essential hypertension', 'Cutaneous lupus erythematosus', 'Abnormal micturition', 'Cardiovascular disease', 'Irritable bowel syndrome', 'Prostate cancer', 'Postoperative inflammation', 'Glaucoma', 'Obesity', 'Dissociative neurological symptom disorder', 'Malignant haematopoietic neoplasm', 'Schizophrenia', 'Zoster', 'Post-traumatic stress disorder', 'Anxiety disorder', 'Breast cancer', 'Tic disorder', 'Nausea/vomiting', 'Depression', 'Cerebral ischaemia', 'Low bone mass disorder', 'Urogenital cancer', 'Mild neurocognitive disorder', 'Dystrophic epidermolysis bullosa', 'Ataxic disorder', 'Phlegmy cough', 'Urgency', 'Fatigue', 'Unspecific substance use disorder', 'Typ



dict_keys(['Overactive bladder', 'Dermatological disease', 'Solid tumour/cancer', 'Alzheimer disease', 'Parkinson disease', 'Lipid metabolism disorder', 'Mood disorder', 'Anxiety disorder', 'Influenza A virus infection', 'Renal cell carcinoma', 'Ovarian cancer', 'Urinary incontinence', 'Major depressive disorder', 'Orthostatic hypotension', 'Asthma', 'Obesity', 'Stomach ulcer', 'Lewy body dementia', 'Thymic cancer', 'Diabetic complication', 'Hypertension', 'Attention deficit hyperactivity disorder', 'Psychotic disorder', 'Depression', 'Migraine', 'Systemic mastocytosis', 'Melanoma', 'Nasopharyngeal carcinoma', 'Follicular lymphoma', 'Hepatic fibrosis', 'Endometriosis', 'Rheumatoid arthritis', 'Cystic fibrosis', 'Peptic ulcer', 'Chronic obstructive pulmonary disease', 'Schizophrenia', 'Type-2 diabetes', 'Anaplastic mixed oligoastrocytoma', 'Dementia', 'Lung cancer', 'Pancreatitis', 'Benign prostatic hyperplasia', 'Non-hodgkin lymphoma', 'Giant cell arteritis', 'Prostate hyperplasia', 'N



dict_keys(['CACNA1C', 'PTGS1', 'KDR', 'ADORA1', 'ACHE', 'MTOR', 'INSR', 'XDH', 'SLC6A2', 'HTR1A', 'GSK3B', 'EPHB4', 'CA2', 'JAK1', 'ADRA1A', 'FLT3', 'ABL1', 'HTR2B', 'PIK3CG', 'PDE4B', 'HSD11B1', 'CCKAR', 'HTR2C', 'CDK2', 'AKR1B1', 'AGTR2', 'HTR2A', 'SLC6A4', 'CDK7', 'DRD1', 'DRD2', 'HDAC1', 'AKT3', 'ERBB2', 'DRD5', 'BCHE', 'APP', 'ELANE', 'CHRM5', 'PDE4A', 'PDGFRA', 'PPARA', 'AKT1', 'TEK', 'KIT', 'KCNH2', 'CHRNA7', 'SRC', 'F2', 'ADRA2A', 'ADRA2C', 'EGFR', 'DPP4', 'CYP19A1', 'ESR2', 'TOP1', 'TNF', 'HTR7', 'JAK3', 'OPRD1', 'CYP3A4', 'CNR2', 'HSP90AA1', 'DRD3', 'CHEK1', 'HSPA5', 'SIGMAR1', 'MAPK14', 'ADRB2', 'MAOA', 'BCL2L1', 'JAK2', 'CA6', 'TRPV3', 'OPRM1', 'PTGS1; PTGS2', 'NFE2L2', 'PDE5A', 'CMA1', 'SCN5A', 'CDK1', 'IGF1R', 'ATR', 'FGFR1', 'KCND3', 'TACR1', 'F10', 'MMP1', 'CDK4', 'AVPR1A', 'HTR6', 'HTR1D', 'ADRA1B', 'PLK1', 'FLT1', 'CACNA1B', 'HSP90AB1', 'EDNRB', 'CTSS', 'CTSK', 'CHRM1', 'ESR1', 'TRPV1', 'CYP2C9', 'OPRK1', 'SLC9A3', 'OXTR', 'MC4R', 'HDAC2', 'CASR', 'ADORA2A', 'CD38', '

In [None]:
train = train.drop(["DRUGNAME", "FUNCTION", "GENENAME", "BIOCLASS", "Disease", "Disease_of_highest_status"], axis=1)
test = test.drop(["DRUGNAME", "FUNCTION", "GENENAME", "BIOCLASS", "Disease", "Disease_of_highest_status"], axis=1)

In [None]:
# then trained on this dataset where there were numerical values 
# only, and added Drug_Status and Drug_high_status and got half decent results
# Got a 0.96 and 0.97 with random forest and catboost respectively