## Connecting Notebook to Google Drive

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive/ so that
# files stored on Drive can be read with ordinary file paths.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [16]:
import pandas as pd
import numpy as np

# Read the GDSC dataset from the mounted Drive folder
gdsc_csv_path = '/content/drive/MyDrive/Datasets/GDSC_DATASET.csv'
Data = pd.read_csv(gdsc_csv_path)

# Show the column names to get an overview of the table's structure
Data.columns

Index(['COSMIC_ID', 'CELL_LINE_NAME', 'TCGA_DESC', 'DRUG_ID', 'DRUG_NAME',
       'LN_IC50', 'AUC', 'Z_SCORE', 'GDSC Tissue descriptor 1',
       'GDSC Tissue descriptor 2', 'Cancer Type (matching TCGA label)',
       'Microsatellite instability Status (MSI)', 'Screen Medium',
       'Growth Properties', 'CNA', 'Gene Expression', 'Methylation', 'TARGET',
       'TARGET_PATHWAY'],
      dtype='object')

In [17]:
# (number of rows, number of columns) of the loaded frame
Data.shape

(242035, 19)

In [21]:
# Preview the first five rows, transposed so that each column is shown as a row
Data.head().T

Unnamed: 0,0,1,2,3,4
COSMIC_ID,683667,684057,684059,684062,684072
CELL_LINE_NAME,PFSK-1,ES5,ES7,EW-11,SK-ES-1
TCGA_DESC,MB,UNCLASSIFIED,UNCLASSIFIED,UNCLASSIFIED,UNCLASSIFIED
DRUG_ID,1003,1003,1003,1003,1003
DRUG_NAME,Camptothecin,Camptothecin,Camptothecin,Camptothecin,Camptothecin
LN_IC50,-1.463887,-3.360586,-5.04494,-3.741991,-5.142961
AUC,0.93022,0.791072,0.59266,0.734047,0.582439
Z_SCORE,0.433123,-0.599569,-1.516647,-0.807232,-1.570016
GDSC Tissue descriptor 1,nervous_system,bone,bone,bone,bone
GDSC Tissue descriptor 2,medulloblastoma,ewings_sarcoma,ewings_sarcoma,ewings_sarcoma,ewings_sarcoma


In [22]:
# Per-column dtype and non-null count
Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 242035 entries, 0 to 242034
Data columns (total 19 columns):
 #   Column                                   Non-Null Count   Dtype  
---  ------                                   --------------   -----  
 0   COSMIC_ID                                242035 non-null  int64  
 1   CELL_LINE_NAME                           242035 non-null  object 
 2   TCGA_DESC                                240968 non-null  object 
 3   DRUG_ID                                  242035 non-null  int64  
 4   DRUG_NAME                                242035 non-null  object 
 5   LN_IC50                                  242035 non-null  float64
 6   AUC                                      242035 non-null  float64
 7   Z_SCORE                                  242035 non-null  float64
 8   GDSC Tissue descriptor 1                 232669 non-null  object 
 9   GDSC Tissue descriptor 2                 232669 non-null  object 
 10  Cancer Type (matching TCGA label

In [26]:
# Keep only the numeric columns and preview them transposed
numberic_features = Data.select_dtypes(include='number')
numberic_features.head().T

Unnamed: 0,0,1,2,3,4
COSMIC_ID,683667.0,684057.0,684059.0,684062.0,684072.0
DRUG_ID,1003.0,1003.0,1003.0,1003.0,1003.0
LN_IC50,-1.463887,-3.360586,-5.04494,-3.741991,-5.142961
AUC,0.93022,0.791072,0.59266,0.734047,0.582439
Z_SCORE,0.433123,-0.599569,-1.516647,-0.807232,-1.570016


In [27]:
# Keep only the object-dtype (string-like) columns and preview them transposed
categorical_features = Data.select_dtypes(include=['object'])
categorical_features.head().T

Unnamed: 0,0,1,2,3,4
CELL_LINE_NAME,PFSK-1,ES5,ES7,EW-11,SK-ES-1
TCGA_DESC,MB,UNCLASSIFIED,UNCLASSIFIED,UNCLASSIFIED,UNCLASSIFIED
DRUG_NAME,Camptothecin,Camptothecin,Camptothecin,Camptothecin,Camptothecin
GDSC Tissue descriptor 1,nervous_system,bone,bone,bone,bone
GDSC Tissue descriptor 2,medulloblastoma,ewings_sarcoma,ewings_sarcoma,ewings_sarcoma,ewings_sarcoma
Cancer Type (matching TCGA label),MB,,,,
Microsatellite instability Status (MSI),MSS/MSI-L,MSS/MSI-L,MSS/MSI-L,MSS/MSI-L,MSS/MSI-L
Screen Medium,R,R,R,R,R
Growth Properties,Adherent,Adherent,Adherent,Adherent,Semi-Adherent
CNA,Y,Y,Y,Y,Y


In [28]:
# Flag every row that exactly repeats an earlier row, then count the flags.
# Series.sum() is vectorised; the builtin sum() would iterate row by row in Python.
duplicated_rows = Data.duplicated()
int(duplicated_rows.sum())

0

In [35]:
# Number of missing values in each column
Data.isnull().sum()

Unnamed: 0,0
COSMIC_ID,0
CELL_LINE_NAME,0
TCGA_DESC,1067
DRUG_ID,0
DRUG_NAME,0
LN_IC50,0
AUC,0
Z_SCORE,0
GDSC Tissue descriptor 1,9366
GDSC Tissue descriptor 2,9366


## Missing Value Handling in the GDSC Dataset

In [38]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder

def _most_frequent(values, default=np.nan):
    """Return the most frequent non-null entry of ``values`` (``default`` if there is none)."""
    modes = values.mode()
    return default if modes.empty else modes.iloc[0]


def _fill_from_group_mode(frame, col, by):
    """Return ``frame[col]`` with nulls filled by the mode of ``col`` within each ``by`` group.

    Existing values are never overwritten. Rows whose grouping key is null, or
    whose group has no known value at all, are left null so that the caller can
    try another fallback.
    """
    keys = [by] if isinstance(by, str) else list(by)
    # Group only the rows that actually have a key; rows with a null key simply
    # receive no fill value instead of being blanked out.
    keyed = frame.dropna(subset=keys)
    if keyed.empty:
        return frame[col]
    group_mode = keyed.groupby(keys)[col].transform(_most_frequent)
    # fillna with a Series aligns on the index and only touches null entries.
    return frame[col].fillna(group_mode)


def handle_missing_values_by_drug(df, min_train_rows=10):
    """Impute missing values separately for each drug (``DRUG_NAME``).

    Within the rows of one drug the columns are filled in this order:

    1. Tissue / cancer-type columns: mode of the column within the groups of
       each other tissue column in turn, then the drug-wide mode, then
       ``'Unknown'``.
    2. ``TARGET`` and ``TARGET_PATHWAY``: the first known value for the drug,
       or ``'Unknown for this drug'`` when the drug has none.
    3. MSI status, screen medium and growth properties: mode within
       ``'GDSC Tissue descriptor 1'``, then ``'Unknown'``.
    4. Genomic flags (``CNA``, ``Gene Expression``, ``Methylation``): mode
       within the two tissue descriptors, then the drug-wide mode, then
       ``'Unknown'``.
    5. ``LN_IC50``, ``AUC``, ``Z_SCORE``: predicted by a random forest trained
       on the one-hot encoded genomic flags and tissue descriptors when more
       than ``min_train_rows`` values are known; otherwise the median within
       the tissue descriptors, then the drug-wide median. A value stays null
       only if the drug has no known value for that column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns named above. Rows are matched back by
        index label, so the index is expected to be unique.
    min_train_rows : int, default 10
        A regression model is fitted for a numeric column only when strictly
        more than this many values are known for the drug.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``; the imputed columns are replaced on it in
        place. Rows with a null ``DRUG_NAME`` are left untouched.
    """
    tissue_cols = ['GDSC Tissue descriptor 1', 'GDSC Tissue descriptor 2', 'Cancer Type (matching TCGA label)', 'TCGA_DESC']
    tissue_keys = ['GDSC Tissue descriptor 1', 'GDSC Tissue descriptor 2']
    target_cols = ['TARGET', 'TARGET_PATHWAY']
    other_categorical_cols = ['Microsatellite instability Status (MSI)', 'Screen Medium', 'Growth Properties']
    genomic_features = ['CNA', 'Gene Expression', 'Methylation']
    numeric_cols = ['LN_IC50', 'AUC', 'Z_SCORE']
    imputed_cols = tissue_cols + target_cols + other_categorical_cols + genomic_features + numeric_cols

    imputed_parts = []
    # groupby skips rows with a null DRUG_NAME and splits the frame in one pass.
    for _, drug_rows in df.groupby('DRUG_NAME', sort=False):
        drug_data = drug_rows.copy()

        # 1. Tissue descriptors and cancer type
        for col in tissue_cols:
            if not drug_data[col].isnull().any():
                continue
            for other_col in tissue_cols:
                if other_col == col:
                    continue
                drug_data[col] = _fill_from_group_mode(drug_data, col, other_col)
                if not drug_data[col].isnull().any():
                    break
            # Whatever no related column could explain gets the drug-wide mode.
            drug_data[col] = drug_data[col].fillna(_most_frequent(drug_data[col], default='Unknown'))

        # 2. TARGET and TARGET_PATHWAY
        for col in target_cols:
            known_values = drug_data[col].dropna()
            if known_values.empty:
                drug_data[col] = 'Unknown for this drug'
            else:
                drug_data[col] = drug_data[col].fillna(known_values.iloc[0])

        # 3. Other categorical variables
        for col in other_categorical_cols:
            if drug_data[col].isnull().any():
                filled = _fill_from_group_mode(drug_data, col, 'GDSC Tissue descriptor 1')
                drug_data[col] = filled.fillna('Unknown')

        # 4. Genomic features
        for feature in genomic_features:
            if drug_data[feature].isnull().any():
                filled = _fill_from_group_mode(drug_data, feature, tissue_keys)
                # One-hot encoding the column itself carries no information about
                # its missing entries, so the fallback is the drug-wide mode.
                drug_data[feature] = filled.fillna(_most_frequent(filled, default='Unknown'))

        # 5. Numeric variables
        incomplete_numeric = [col for col in numeric_cols if drug_data[col].isnull().any()]
        if incomplete_numeric:
            # Built only when needed: one-hot encoding is wasted work for complete drugs.
            features_for_imputation = pd.get_dummies(drug_data[genomic_features + tissue_keys])
            for col in incomplete_numeric:
                known = drug_data[col].notna()
                if known.sum() > min_train_rows:
                    # Imported here so the function has no hard dependency on
                    # scikit-learn when every numeric column is complete.
                    from sklearn.ensemble import RandomForestRegressor

                    model = RandomForestRegressor(n_estimators=100, random_state=42)
                    model.fit(features_for_imputation[known], drug_data.loc[known, col])
                    drug_data.loc[~known, col] = model.predict(features_for_imputation[~known])
                else:
                    # Too few observations to fit a model: use medians instead.
                    group_median = drug_data.groupby(tissue_keys)[col].transform('median')
                    drug_data[col] = drug_data[col].fillna(group_median).fillna(drug_data[col].median())

        imputed_parts.append(drug_data[imputed_cols])

    if imputed_parts:
        imputed = pd.concat(imputed_parts)
        skipped = df['DRUG_NAME'].isnull()
        keep_skipped = skipped.any()
        for col in imputed_cols:
            new_values = imputed[col].reindex(df.index)
            if keep_skipped:
                # Rows without a drug name were never processed; keep them as they were.
                new_values = new_values.where(~skipped, df[col])
            # Replacing whole columns avoids element-wise writes into a column
            # whose dtype may not match the filled values.
            df[col] = new_values

    return df

# Run the per-drug imputation and rebind Data to the frame it returns
Data = handle_missing_values_by_drug(Data)


In [39]:
# Re-count missing values per column after the imputation step
Data.isnull().sum()

Unnamed: 0,0
COSMIC_ID,0
CELL_LINE_NAME,0
TCGA_DESC,0
DRUG_ID,0
DRUG_NAME,0
LN_IC50,0
AUC,0
Z_SCORE,0
GDSC Tissue descriptor 1,0
GDSC Tissue descriptor 2,0


In [41]:
# Shape after imputation, for comparison with the shape reported right after loading
Data.shape

(242035, 19)