In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    input_paths = [os.path.join(dirname, name) for name in filenames]
    for input_path in input_paths:
        print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
pip install rdkit 

In [None]:
# Overall
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# For Random Forest
from scipy.sparse import csr_matrix
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier

# For Neural Network
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# For KNN
from sklearn.neighbors import KNeighborsClassifier
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import classification_report

In [None]:
import warnings

# Silence every DeprecationWarning issued through Python's warnings module
# for the remainder of the session.
warnings.simplefilter("ignore", DeprecationWarning)


In [None]:
# Load test.csv from the leash-BELKA input directory into a DataFrame.
test_df = pd.read_csv(filepath_or_buffer='/kaggle/input/leash-BELKA/test.csv')

In [None]:
# Convert a SMILES string into a fixed-length Morgan fingerprint array
def smiles_to_fingerprint(smiles, n_bits=2048, radius=2, dtype=int):
    """Return the Morgan fingerprint of `smiles` as a 1-D NumPy array of 0/1 values.

    Parameters
    ----------
    smiles : str
        SMILES string to convert. Anything that is not a string (for example a
        NaN or None cell) is treated like an unparseable molecule instead of
        being handed to RDKit.
    n_bits : int, default 2048
        Length of the fingerprint (number of bits).
    radius : int, default 2
        Morgan radius passed to RDKit; 2 is the value that was hard-coded before.
    dtype : NumPy dtype, default int
        Element type of the returned array. The default matches what
        `np.array(bit_vector)` produces; pass `np.uint8` to make each
        fingerprint eight times smaller when memory is tight.

    Returns
    -------
    numpy.ndarray
        Array of shape `(n_bits,)`. It is all zeros when the input cannot be
        turned into a molecule.
    """
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        # Same dtype as the normal path: a float fallback would upcast the whole
        # feature matrix once the per-row arrays are stacked together.
        return np.zeros(n_bits, dtype=dtype)
    bit_vect = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
    return np.array(bit_vect).astype(dtype, copy=False)

In [None]:
# Read the top of train.csv piece by piece, fingerprinting each piece as it arrives
filename = '/kaggle/input/leash-BELKA/train.csv'
nrows = 500000       # total number of rows taken from the start of the file
chunk_size = 100000  # rows per piece; 30k, 100k, 300k and 500k were tested, 1M does not work

# (a pd.read_parquet('train.parquet') call was left commented out here)

chunks = []
for chunk in pd.read_csv(filename, nrows=nrows, chunksize=chunk_size):
    # One fingerprint array per SMILES string, stored in the 'fingerprints' column
    fingerprint_column = chunk['molecule_smiles'].apply(smiles_to_fingerprint)
    chunk['fingerprints'] = fingerprint_column
    chunks.append(chunk)

# Stitch the pieces back together under a fresh 0..n-1 index
train_df = pd.concat(chunks, ignore_index=True)

In [None]:
# Display the column labels of the test DataFrame
test_df.columns

In [None]:
# Display the column labels of the training DataFrame (includes the added 'fingerprints' column)
train_df.columns

# Random Forest (Baseline Model)

In [None]:
# Separate features (X) and target variable (y)
X = np.array(train_df['fingerprints'].tolist())
y = train_df['binds'].values

# Drop fingerprint bits that hold the same value in every row (they differ
# from row 0 nowhere, so they carry no information)
non_constant_columns = np.any(X != X[0, :], axis=0)
X = X[:, non_constant_columns]

# Convert X to sparse matrix
X_sparse = csr_matrix(X)

# Split data. stratify=y keeps the class proportions of `binds` identical in
# the training and validation parts, which matters when one class is rare.
# (train_test_split raises ValueError if a class has fewer than two rows.)
X_train, X_val, y_train, y_val = train_test_split(
    X_sparse, y, test_size=0.2, random_state=42, stratify=y
)

# Instantiate score function
k_best = SelectKBest(score_func=f_classif, k=10)  # Adjust K

# Fit the selector on the training part only, then apply it to both parts
X_train_kbest = k_best.fit_transform(X_train, y_train)
X_val_kbest = k_best.transform(X_val)

# Train Model on reduced feature set
rf_class = RandomForestClassifier(n_estimators=100, random_state=42)
rf_class.fit(X_train_kbest, y_train)

# Predict on validation data
y_pred = rf_class.predict(X_val_kbest)
accuracy = accuracy_score(y_val, y_pred)
print("Accuracy:", accuracy)
# Accuracy alone can look high by always predicting the majority class, so
# also show per-class precision/recall. zero_division=0 avoids warnings when
# a class is never predicted.
print("Classification Report:\n", classification_report(y_val, y_pred, zero_division=0))

# Neural Network(s)

In [None]:
# Separate features (X) and target variable (y)
X = np.array(train_df['fingerprints'].tolist())
y = train_df['binds'].values

# Drop fingerprint bits that hold the same value in every row
non_constant_columns = np.any(X != X[0, :], axis=0)
X = X[:, non_constant_columns]

# Convert X to sparse matrix
X_sparse = csr_matrix(X)

# Split data. stratify=y keeps the class proportions of `binds` identical in
# the training and validation parts, which matters when one class is rare.
# (train_test_split raises ValueError if a class has fewer than two rows.)
X_train, X_val, y_train, y_val = train_test_split(
    X_sparse, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Fully connected binary classifier: three ReLU hidden layers with dropout
# after each, and a single sigmoid output unit.
nn_model = Sequential([
    Dense(512, input_shape=(X_train.shape[1],), activation='relu'),
    Dropout(0.3),
    Dense(256, activation='relu'),
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

# Compile
nn_model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Define early stopping - helps overfitting.
# patience must be smaller than the number of epochs: with patience=10 and
# epochs=10 the callback could never trigger a stop, because at most nine
# epochs can pass without improvement after the first one.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train
train_nn = nn_model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, batch_size=512, callbacks=[early_stopping])

# Evaluate the model on the validation data
loss, accuracy = nn_model.evaluate(X_val, y_val)
print("Accuracy:", accuracy)

# KNN

In [None]:
# Separate features (X) and target variable (y)
X = np.array(train_df['fingerprints'].tolist())
y = train_df['binds'].values

# Split FIRST, on row indices, so the validation rows keep the original class
# ratio. Undersampling before the split would leave a balanced validation set
# whose scores say little about performance on the real distribution.
# Working with indices also avoids copying the large dense matrix.
row_ids = np.arange(len(y))
train_ids, val_ids = train_test_split(row_ids, test_size=0.2, random_state=42, stratify=y)

# Balance w/ undersampling - applied to the training rows only.
# The sampler receives the row indices as a one-column feature matrix and
# returns the subset of indices it keeps, together with their labels.
rus = RandomUnderSampler(random_state=42)
train_ids_resampled, y_resampled = rus.fit_resample(train_ids.reshape(-1, 1), y[train_ids])
X_resampled = X[train_ids_resampled.ravel()]

# Balanced training set, untouched validation set
X_train, y_train = X_resampled, y_resampled
X_val, y_val = X[val_ids], y[val_ids]

# Train model
knn = KNeighborsClassifier(n_neighbors=5)  # You can adjust the number of neighbors
knn.fit(X_train, y_train)

# Predict (the validation set is no longer undersampled, so this step scores
# every held-out row and takes longer than before)
y_pred = knn.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print("Validation Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_val, y_pred))