### Imports

#######################################
# SMOTE for DTI Project
# Applied to: the family selected by ltype_index (the recorded run used Nuclear Receptor)
# Generic for all 4 families (Enzyme, GPCR, Ion Channel, Nuclear Receptor)
# Author: Vitor M. Silva
# Date: 2025-04-30
#######################################
# Documentation
#######################################
'''
# SMOTE (Synthetic Minority Over-sampling Technique) for DTI (Drug-Target Interaction)

## Purpose
SMOTE is an advanced over-sampling technique that creates **synthetic minority class examples** by interpolating between existing positive samples.

In DTI prediction:
- **Positives (class 1)**: Known drug-target interactions (rare).
- **Negatives (class 0)**: Assumed non-interactions (abundant).

Rather than duplicating existing positives, SMOTE generates **new positive vectors** by combining feature values from neighbors in latent space.

## Algorithm Steps
- Load the dataset (latent features + class).
- Apply SMOTE to generate synthetic class 1 examples until class balance is achieved.
- Concatenate synthetic data with the original dataset.
- Save the augmented dataset for model training.
'''

In [17]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import numpy as np
import os
import json
import requests
from tqdm import tqdm
import time
import keras
from sklearn.utils import resample


from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from collections import Counter





# Use the `ltype_index` variable below to select the family (0 = enzyme, 1 = GPCR, 2 = ion_channel, 3 = nuclear_receptor)

In [18]:
# Root folder for inputs and outputs: the directory the notebook is run from
base_dir = os.getcwd()

# Target families, addressed by position through ltype_index
ligands_type = ['enzyme', 'GPCR', 'ion_channel', 'nuclear_receptor']

# Index -> method tag; the selected tag becomes part of the output file name
method_type = dict(enumerate([
    'RandomOver',
    'RandomUnder',
    'TomekLinks',
    'CNNTomekLinks',
    'SMOTE',
    'SMOTETomekLinks',
    'BasicSMOTE',
    'ADASYN',
    'SMOTENC',
    'KMeansSMOTE',
    'LatentSampling',
    'NoiseInjection',
    'LaplacianWNN',
]))

# Family selector: 0 = enzyme, 1 = GPCR, 2 = ion_channel, 3 = nuclear_receptor
ltype_index = 3
# Method selector: 4 = 'SMOTE'
method_index = 4

ltype = ligands_type[ltype_index]
model_tag = method_type[method_index]

# The input CSV is expected at <base_dir>/data/split/<family>/<original_file_name>
original_file_name = 'final_new_par_50.csv'
file_path = os.path.join(base_dir, 'data', 'split', ltype, original_file_name)

# Read the table: the first line of the file is skipped and, because
# header=None, the columns are labelled with integer positions (0, 1, 2, ...)
print(f"Loading data for: {ltype}")
data_frame = pd.read_csv(file_path, header=None, skiprows=1)

Loading data for: nuclear_receptor


In [19]:
# Every column except the last is a feature; the last column is the class label.
# (Per the original notebook: 100 feature columns = 50 drug + 50 target features.)
features = data_frame.iloc[:, :-1].to_numpy()
labels = data_frame.iloc[:, -1].to_numpy()

# Boolean masks for each class (1 = interaction, 0 = assumed non-interaction)
is_positive = labels == 1
is_negative = labels == 0
positive_features = features[is_positive]
negative_features = features[is_negative]

# Report the class balance before resampling
print(f"Original positives: {len(positive_features)}")
print(f"Original negatives: {len(negative_features)}")

Original positives: 90
Original negatives: 1314


In [None]:
# Standardize the features before resampling (presumably so that SMOTE's
# neighbour search is not dominated by large-scale columns -- confirm intent)
scaler = StandardScaler().fit(features)
features_scaled = scaler.transform(features)

# Oversample with SMOTE using a fixed seed so the synthetic samples are reproducible
sm = SMOTE(random_state=42)
X_resampled, y_resampled = sm.fit_resample(features_scaled, labels)

# Undo the standardization so the resampled features are back in the original units
X_resampled_original_scale = scaler.inverse_transform(X_resampled)

In [21]:
# Append the label vector as the final column of the feature matrix,
# then wrap the combined array in a DataFrame (integer column names)
enhanced_data = np.column_stack((X_resampled_original_scale, y_resampled))
enhanced_df = pd.DataFrame(enhanced_data)

In [22]:
# Save the enhanced dataset next to the input file.
# The output name is derived from original_file_name (instead of repeating the
# literal) so it cannot drift if the input file is changed; for the current
# configuration it is still 'enhanced_<model_tag>_final_new_par_50.csv'.
enhanced_file_name = f'enhanced_{model_tag}_{original_file_name}'
enhanced_file_path = os.path.join(base_dir, 'data', 'split', ltype, enhanced_file_name)

# index=False: no row-index column is written. A header row of integer column
# names is still written, matching the loader's skiprows=1 convention.
enhanced_df.to_csv(enhanced_file_path, index=False)



In [23]:
# Report where the augmented dataset was written and its (rows, columns) shape
print(
    f"Enhanced dataset saved at: {enhanced_file_path}",
    f"New dataset size: {enhanced_df.shape}",
    sep="\n",
)

Enhanced dataset saved at: C:\Users\riskf\OneDrive\DTI - Data augmentation\data\split\nuclear_receptor\enhanced_SMOTE_final_new_par_50.csv
New dataset size: (2628, 101)


In [24]:
# Check the class balance of the enhanced dataset.
# The label is taken from the last column rather than a hard-coded column 100,
# so the check still works if the number of feature columns changes.
enhanced_labels = enhanced_df.iloc[:, -1]
positive_features_enhanced = enhanced_df[enhanced_labels == 1]
negative_features_enhanced = enhanced_df[enhanced_labels == 0]

# These counts describe the enhanced data, so they are labelled as such
# (the previous messages said "Original", which was misleading)
print(f"Enhanced positives: {len(positive_features_enhanced)}")
print(f"Enhanced negatives: {len(negative_features_enhanced)}")


Original positives: 1314
Original negatives: 1314
