In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score

In [2]:
import pandas as pd
train = pd.read_csv("data/train_og.csv")
test = pd.read_csv("data/test_og.csv")
train.columns

Index(['ID', 'TargetID', 'DRUGID', 'DRUGTYPE', 'Drug_high_status', 'DRUGNAME',
       'PUBCHCID', 'Disease_of_highest_status', 'Drug_Status', 'UNIPROID',
       'TARGNAME', 'GENENAME', 'SYNONYMS', 'FUNCTION', 'BIOCLASS', 'SEQUENCE',
       'Disease', 'Accession Number', 'Target_Status'],
      dtype='object')

# Keeping important columns only

In [3]:
cols = ["Drug_high_status", "Disease_of_highest_status", "Drug_Status",
              "GENENAME", "FUNCTION", "BIOCLASS", "SEQUENCE", "Disease"]
X = train.loc[:, cols]
y = train.loc[:, "Target_Status"]
test = test.loc[:, cols]

# Ordinal Encoding on Drug_high_status and Drug_status

In [4]:
import pandas as pd

# Define the mapping for the statuses including "Registered"
def commoniser(text):
    if text in ['Withdrawn from market', 'Discontinued in Phase 4', 'Phase 4', 'Phase 4 Trial']:
        return 'Phase 4'
    elif text in ['Approved (orphan drug)', 'approved', 'NDA filed']:
        return 'Approved'
    elif text in ['Phase 3', 'Discontinued in Phase 3',]:
        return 'Phase 3'
    elif text in ['Phase 2', 'Phase 2 Trial', 'Phase 2a', 'Phase 2b', 'Phase 2/3',]:
     return 'Phase 2'
    elif text in ['Discontinued in Phase 2', 'Discontinued in Phase 2a', 'Discontinued in Phase 2b']:
     return 'Phase 2 Disc'
    elif text in ['Phase 1', 'Phase 1 Trial', 'Phase 1/2',]:
        return 'Phase 1'
    elif text in ['Discontinued in Phase 1', 'Discontinued in Phase 1/2',]:
        return 'Phase 1 Disc'
    elif text in ['Investigative', 'Preclinical', 'Clinical trial', 'Terminated', "Application submitted"]:
        return 'Pre-phase 1'
    elif text in ['Discontinued in Preregistration', 'Patented', 'Registered']:
        return 'pre-pre-fail'
    else:
       return text

def customEncoder(text):
    if text == 'Phase 4':
        return 8 
    elif text == 'Approved':
        return 7 
    elif text == 'Phase 3':
        return 6 
    elif text == "Phase 2":
        return 5 
    elif text == 'Phase 2 Disc':
        return 4 
    elif text == 'Phase 1':
        return 3 
    elif text == "Phase 1 Disc":
        return 2 
    elif text == "Pre-phase 1":
        return 1 
    elif text == 'pre-pre-fail':
        return 0 

In [5]:
X["Drug_high_status"] = X["Drug_high_status"].apply(commoniser)
X["Drug_Status"] = X["Drug_Status"].apply(commoniser)
test["Drug_high_status"] = test["Drug_high_status"].apply(commoniser)
test["Drug_Status"] = test["Drug_Status"].apply(commoniser)

In [6]:
X["Drug_high_status"] = X["Drug_high_status"].apply(customEncoder)
X["Drug_Status"] = X["Drug_Status"].apply(customEncoder)
test["Drug_high_status"] = test["Drug_high_status"].apply(customEncoder)
test["Drug_Status"] = test["Drug_Status"].apply(customEncoder)

# Preprocessing BIOCLASS and FUNCTION for Embeddings 
## (we eventually did make use of NLP, but not the greatest results)

In [7]:
import spacy
nlp = spacy.load('en_core_web_sm')

def preprocess_text(text):
    doc = nlp(text.lower())
    tokens = [token.lemma_ for token in doc if not token.is_stop and token.is_alpha]
    return ' '.join(tokens)



In [None]:
X["BIOCLASS"] = X["BIOCLASS"].apply(preprocess_text)
X["FUNCTION"] = X["FUNCTION"].apply(preprocess_text)
test["BIOCLASS"] = test["BIOCLASS"].apply(preprocess_text)
test["FUNCTION"] = test["FUNCTION"].apply(preprocess_text)

# Converting sequence

In [8]:
acids = {'C', 'G', 'S', 'H', 'V', 'W', 'L', 'T', 'P', 'F', 'Y', 'R', 'N', 'I', 'D', 'E', 'Q', 'A', 'M', 'K'}
def sequence_converter(text):
    val = 0
    for letter in acids:
        val += text.count(letter) * ord(letter)
        return val

In [9]:
X["Sequence_encoded"] = X["SEQUENCE"].apply(sequence_converter)
test["Sequence_encoded"] = test["SEQUENCE"].apply(sequence_converter)

In [10]:
X["SEQUENCE"] = X["SEQUENCE"].apply(len)
test["SEQUENCE"] = test["SEQUENCE"].apply(len)

### Encoding categorical variables

In [3]:
label_encoders = {}
for column in X.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    X[column] = le.fit_transform(X[column])
    label_encoders[column] = le

X = X.drop("Target_Status", axis = 1)
y = X["Target_Status"]

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

### F1 score function

In [5]:
def calculate_f1_score(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    train_preds = model.predict(X_train)
    test_preds = model.predict(X_test)
    f1_score_train = f1_score(y_train, train_preds, average='weighted')
    f1_score_test = f1_score(y_test, test_preds, average='weighted')
    return f1_score_train, f1_score_test

## Random Forest Classifier

In [11]:
rf_model = RandomForestClassifier(n_estimators=150, random_state=42, n_jobs=-1, verbose=1,
                        max_depth=20, criterion='entropy', class_weight='balanced')
rf_f1_score_train, rf_f1_score_test = calculate_f1_score(rf_model, X_train, y_train, X_test, y_test)


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:    2.1s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.2s
[Parallel(n_jobs=20)]: Done 150 out of 150 | elapsed:    1.1s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.0s
[Parallel(n_jobs=20)]: Done 150 out of 150 | elapsed:    0.2s finished


In [12]:
print(f'Train Random Forest F1 Score: {rf_f1_score_train}')
print(f'Test Random Forest F1 Score: {rf_f1_score_test}')

Train Random Forest F1 Score: 0.9807539494216908
Test Random Forest F1 Score: 0.9325091650759363


## XGBoost Classifier

In [6]:
from xgboost import XGBClassifier

In [9]:

xgb_model = XGBClassifier(n_estimators=150, random_state=42, n_jobs=-1, max_depth=10, learning_rate=0.1,
                          )
xgb_f1_score_train, xgb_f1_score_test = calculate_f1_score(xgb_model, X_train, y_train, X_test, y_test)


In [10]:
print(f'Train XGBoost F1 Score: {xgb_f1_score_train}')
print(f'Test XGBoost F1 Score: {xgb_f1_score_test}')


Train XGBoost F1 Score: 0.9790766729381526
Test XGBoost F1 Score: 0.9612383521014383


In [7]:
xgb_model = XGBClassifier(n_estimators=150, random_state=42, n_jobs=-1, max_depth=20, learning_rate=0.1,
                          )
xgb_f1_score_train, xgb_f1_score_test = calculate_f1_score(xgb_model, X_train, y_train, X_test, y_test)


In [8]:
print(f'Train XGBoost F1 Score: {xgb_f1_score_train}')
print(f'Test XGBoost F1 Score: {xgb_f1_score_test}')

Train XGBoost F1 Score: 0.9825658836022997
Test XGBoost F1 Score: 0.9583165721463596


## Inferencing

In [10]:
import pandas as pd
import numpy as np

In [11]:
# Encoding categorical variables
for column in test.select_dtypes(include=['object']).columns:
    if column in label_encoders:
        le = label_encoders[column]
       
        unseen_labels = set(test[column]) - set(le.classes_)
        if unseen_labels:
            print(unseen_labels)
            
            le.classes_ = np.append(le.classes_, list(unseen_labels))
        test[column] = le.transform(test[column])

{'Dermatitis herpetiformis'}
{'USP1', 'NPY4R', 'PRKCI', 'PDK2', 'MGMT', 'RXFP1', 'Bact ileS', 'CENPE', 'SETD7', 'LSS'}
{'repair methylate nucleobase dna stoichiometrically transfer methyl group cysteine residue enzyme suicide reaction enzyme irreversibly inactivate involve cellular defense biological effect methylguanine meg methylthymine meet dna', 'necessary bcr abl oncogene mediate resistance apoptotic drug leukemia cell protect leukemia cell drug induce apoptosis cultured neuron prevent amyloid beta protein induce apoptosis interrupt cell death process early step glioblastoma cell function downstream phosphatidylinositol kinase promotion cell survival phosphorylate inhibit pro apoptotic factor bad form protein complex non small cell lung cancer nsclc cell regulate oncogenic activity phosphorylation turn promote transform growth invasion response nerve growth factor ngf act downstream src phosphorylate activate allow subsequent activation nf kappa b neuronal cell survival function o

In [12]:
test_predictions = xgb_model.predict(test)

In [13]:
predictions = label_encoders['Target_Status'].inverse_transform(test_predictions)

  y = column_or_1d(y, warn=True)


In [14]:
df_for_ID = pd.read_csv('data/test_og.csv')

In [15]:
output_df = pd.DataFrame({
    'ID': df_for_ID['ID'],
    'Target_Status': predictions
})

In [16]:
output_df.head(10)

Unnamed: 0,ID,Target_Status
0,1076,Approved
1,190816,Phase 3
2,180551,Phase 3
3,51630,Approved
4,50566,Phase 1/2
5,148985,Phase 2
6,71409,Phase 2
7,18091,Investigative
8,39903,Phase 1/2
9,178779,Patented


## .932 score

In [None]:
output_df.to_csv('predictions.csv', index=False)

## .954 score

In [None]:
output_df.to_csv('xgb_predictions.csv', index=False)

## .958 score

In [None]:
output_df.to_csv('xgb_pluspredictions.csv', index=False)