In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [None]:
# Load the synthetic dataset
df = pd.read_csv('/content/synthetic_qt_prolongation_dataset.csv')

# Check the dataset structure
print(df.head())


    Drug1   Drug2                                   Drug1_Properties  \
0  Drug_F  Drug_A  {'molecular_weight': 246.7983561008608, 'logP'...   
1  Drug_F  Drug_H  {'molecular_weight': 246.7983561008608, 'logP'...   
2  Drug_G  Drug_C  {'molecular_weight': 217.42508365045984, 'logP...   
3  Drug_F  Drug_C  {'molecular_weight': 246.7983561008608, 'logP'...   
4  Drug_C  Drug_A  {'molecular_weight': 419.59818254342156, 'logP...   

                                    Drug2_Properties Interaction_Type  \
0  {'molecular_weight': 312.36203565420874, 'logP...          Neutral   
1  {'molecular_weight': 459.85284373248055, 'logP...          Neutral   
2  {'molecular_weight': 419.59818254342156, 'logP...          Neutral   
3  {'molecular_weight': 419.59818254342156, 'logP...       Beneficial   
4  {'molecular_weight': 312.36203565420874, 'logP...          Neutral   

   QT_Prolongation_Risk Side_Effects  Dosage_Drug1_mg  Dosage_Drug2_mg  
0                     0      Fatigue       195.087154  

In [None]:
# Encoding categorical variables
label_encoder = LabelEncoder()

# Encoding Drug1 and Drug2
df['Drug1'] = label_encoder.fit_transform(df['Drug1'])
df['Drug2'] = label_encoder.fit_transform(df['Drug2'])

# Encoding Interaction_Type and Side_Effects
df['Interaction_Type'] = label_encoder.fit_transform(df['Interaction_Type'])
df['Side_Effects'] = label_encoder.fit_transform(df['Side_Effects'])

# Extract features (X) and label (y)
X = df.drop(columns=['QT_Prolongation_Risk', 'Drug1_Properties', 'Drug2_Properties'])
y = df['QT_Prolongation_Risk']


In [None]:
# Scaling the numeric features
scaler = StandardScaler()
X[['Dosage_Drug1_mg', 'Dosage_Drug2_mg']] = scaler.fit_transform(X[['Dosage_Drug1_mg', 'Dosage_Drug2_mg']])


In [None]:
# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")


Training set size: (80, 6)
Test set size: (20, 6)


In [None]:
# Training the Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)


In [None]:
# Making predictions on the test set
y_pred = model.predict(X_test)


In [None]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Detailed classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy: 70.00%
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.88      0.82        16
           1       0.00      0.00      0.00         4

    accuracy                           0.70        20
   macro avg       0.39      0.44      0.41        20
weighted avg       0.62      0.70      0.66        20

Confusion Matrix:
[[14  2]
 [ 4  0]]


In [None]:
# Get feature importance from the Random Forest model
feature_importance = model.feature_importances_
feature_names = X.columns

# Display feature importance
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

print("Feature Importance:")
print(importance_df)


Feature Importance:
            Feature  Importance
5   Dosage_Drug2_mg    0.261531
4   Dosage_Drug1_mg    0.238132
0             Drug1    0.152269
1             Drug2    0.143923
3      Side_Effects    0.120856
2  Interaction_Type    0.083289


In [None]:

import pandas as pd

# Load the original dataset first
original_data_path = '/content/expanded_qt_prolongation_dataset.csv'
data = pd.read_csv(original_data_path)

# Generate additional rows
new_data_rows = 50

# Define the new expanded dataset
expanded_data = pd.DataFrame({
    'Drug1': [f'Drug_{chr(65 + i)}' for i in np.random.randint(0, 26, new_data_rows)],
    'Drug2': [f'Drug_{chr(65 + j)}' for j in np.random.randint(0, 26, new_data_rows)],
    'Drug1_Properties': [f'{{"mol_weight": {round(np.random.uniform(180, 500), 2)}, "logP": {round(np.random.uniform(1, 5), 2)}}}' for _ in range(new_data_rows)],
    'Drug2_Properties': [f'{{"mol_weight": {round(np.random.uniform(180, 500), 2)}, "logP": {round(np.random.uniform(1, 5), 2)}}}' for _ in range(new_data_rows)],
    'Interaction_Type': np.random.choice(['Neutral', 'Antagonistic', 'Beneficial'], size=new_data_rows),
    'QT_Prolongation_Risk': np.random.choice([0, 1], size=new_data_rows),
    'Side_Effects': np.random.choice(['None', 'Nausea', 'Dizziness', 'Arrhythmia'], size=new_data_rows),
    'Dosage_Drug1_mg': np.random.randint(50, 500, new_data_rows),
    'Dosage_Drug2_mg': np.random.randint(50, 500, new_data_rows),
})

# Combining with the original dataset
full_expanded_data = pd.concat([data, expanded_data], ignore_index=True)

# Saving the expanded dataset to a new CSV file
expanded_file_path = '/content/expanded_qt_prolongation_dataset.csv'
full_expanded_data.to_csv(expanded_file_path, index=False)

expanded_file_path


NameError: name 'np' is not defined

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the expanded dataset
data = pd.read_csv('expanded_qt_prolongation_dataset.csv')

# Convert Drug properties from JSON strings to usable features
import ast
data['Drug1_Properties'] = data['Drug1_Properties'].apply(ast.literal_eval)
data['Drug2_Properties'] = data['Drug2_Properties'].apply(ast.literal_eval)

# Extract relevant properties into separate columns
data['Drug1_Mol_Weight'] = data['Drug1_Properties'].apply(lambda x: x['mol_weight'])
data['Drug1_logP'] = data['Drug1_Properties'].apply(lambda x: x['logP'])
data['Drug2_Mol_Weight'] = data['Drug2_Properties'].apply(lambda x: x['mol_weight'])
data['Drug2_logP'] = data['Drug2_Properties'].apply(lambda x: x['logP'])

# Features we will use
features = ['Drug1_Mol_Weight', 'Drug1_logP', 'Drug2_Mol_Weight', 'Drug2_logP', 'Dosage_Drug1_mg', 'Dosage_Drug2_mg']

# Target label
target = 'QT_Prolongation_Risk'

# Prepare the feature matrix X and target vector y
X = data[features]
y = data[target]

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize the data (optional but recommended for some models)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


KeyError: 'mol_weight'

In [None]:
feature_importance = model.feature_importances_
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
importance_df = importance_df.sort_values(by='Importance', ascending=False)
print("Feature Importance:")
print(importance_df)


In [None]:
from sklearn.model_selection import cross_val_score
cross_val_score(model, X, y, cv=5)


In [None]:
input_data = {
    'Drug1_Mol_Weight': 195.0,
    'Drug1_logP': 3.0,
    'Drug2_Mol_Weight': 147.0,
    'Drug2_logP': 2.7,
    'Dosage_Drug1_mg': 250,
    'Dosage_Drug2_mg': 180
}
input_df = pd.DataFrame([input_data])
input_scaled = scaler.transform(input_df)
risk_prediction = model.predict(input_scaled)
print("QT Prolongation Risk Prediction:", "High Risk" if risk_prediction[0] == 1 else "Low Risk")
