In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

In [2]:
# Read the source CSV (looked up relative to the working directory) into a DataFrame.
adr_data = pd.read_csv(filepath_or_buffer="drugbank_smile_data_with_adrs.csv")

In [15]:
# Boolean mask, aligned with adr_data's index: True where the "adrs" value is missing.
null_indices = adr_data["adrs"].isna()

In [17]:
# Report how many rows have a non-null "adrs" value, relative to the total row count.
total_count = len(adr_data["adrs"])
num_data = total_count - sum(null_indices)
print(f"Number of ADR data points: {num_data} out of {total_count}")

Number of ADR data points: 1181 out of 10630


In [20]:
# Split the rows of adr_data into two column -> list-of-values dictionaries:
#   filtered_data_dict  - rows whose "adrs" value is present (all four columns)
#   unlabeled_data_dict - rows whose "adrs" value is missing ("adrs" column omitted)
column_names = ["drug_id", "name", "smile", "adrs"]

# Recompute the mask from adr_data right here instead of relying on a mask left
# behind by another cell, so this cell cannot pick up stale kernel state.
has_adrs = adr_data["adrs"].notna()

# Boolean-mask selection replaces the per-row iterrows() loop: it is vectorized,
# keeps each column's dtype intact, and does not depend on the index labels being
# unique. to_dict(orient="list") yields {column: [values...]} with the keys in
# column_names order; an empty selection still produces every key with an empty list.
filtered_data_dict = adr_data.loc[has_adrs, column_names].to_dict(orient="list")
unlabeled_data_dict = adr_data.loc[
    ~has_adrs, [col for col in column_names if col != "adrs"]
].to_dict(orient="list")

In [21]:
# Turn each column -> values dictionary into a DataFrame and save it as CSV.
# index=False keeps the DataFrame's row index out of the written file.
filtered_data_df = pd.DataFrame(filtered_data_dict)
filtered_data_df.to_csv(path_or_buf="adr_data_filtered.csv", index=False)

unlabeled_data_df = pd.DataFrame(unlabeled_data_dict)
unlabeled_data_df.to_csv(path_or_buf="adr_data_unlabeled.csv", index=False)

In [23]:
# Generate train-test split (80-20).
# A fixed seed makes the shuffle deterministic, so a Restart & Run All reproduces
# the same train/test partition (and therefore the same CSVs written afterwards)
# instead of a different random split on every execution.
SPLIT_SEED = 42
train_data_df, test_data_df = train_test_split(
    filtered_data_df,
    train_size=0.8,
    test_size=0.2,
    random_state=SPLIT_SEED,
)

In [24]:
# Write each half of the split to its own CSV file, omitting the row index.
for split_df, csv_path in (
    (train_data_df, "adr_data_filtered_train.csv"),
    (test_data_df, "adr_data_filtered_test.csv"),
):
    split_df.to_csv(csv_path, index=False)