In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load   
 
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input tree and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/leash-BELKA/sample_submission.csv
/kaggle/input/leash-BELKA/train.parquet
/kaggle/input/leash-BELKA/test.parquet
/kaggle/input/leash-BELKA/train.csv
/kaggle/input/leash-BELKA/test.csv


In [2]:
!pip install dask[complete] rdkit scikit-learn

[0m[31mERROR: Could not find a version that satisfies the requirement dask[complete] (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for dask[complete][0m[31m
[0m

In [None]:
!pip install dask

In [None]:
# Probe for RDKit by attempting the two imports the notebook relies on.
try:
    from rdkit import Chem
    from rdkit.Chem import AllChem
except ImportError:
    print("RDKit is not available.")
else:
    # Reached only when both imports succeeded.
    print("RDKit is available.")


**Import Libraries and Define File Path**

In [None]:
import dask.dataframe as dd
from dask.delayed import delayed
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import numpy as np
import os
import uuid

# Define the file path and the columns to use.
file_path = '/kaggle/input/leash-BELKA/train.parquet'
# Only these four columns are read by the later cells; restricting the read keeps
# any other columns stored in the file from being loaded into memory.
used_columns = ['id', 'molecule_smiles', 'protein_name', 'binds']
dask_df = dd.read_parquet(file_path, columns=used_columns)


**Load Data and Limit Samples**

In [None]:
def limit_data(df, n_samples_per_class=550000):
    """Down-sample a Dask DataFrame to about ``n_samples_per_class`` rows per label.

    Args:
        df: Dask DataFrame with a ``binds`` column holding 0/1 labels.
        n_samples_per_class: Target number of rows to keep for each label. It is
            lowered to the size of the smaller class when that class has fewer rows.

    Returns:
        A lazy Dask DataFrame: the sampled binders followed by the sampled
        non-binders. Sampling is fraction-based, so the per-class row counts are
        targets rather than exact guarantees.

    Raises:
        ValueError: If either class has no rows (a sampling fraction cannot be
            derived from an empty class).
    """
    # One pass over the label column yields both class totals, instead of
    # filtering and counting the whole dataset once per class.
    class_counts = df['binds'].value_counts().compute()
    total_binders = int(class_counts.get(1, 0))
    total_non_binders = int(class_counts.get(0, 0))

    print(f"Total binders: {total_binders}, Total non-binders: {total_non_binders}")

    if total_binders == 0 or total_non_binders == 0:
        raise ValueError("Both classes need at least one row to build a balanced sample.")

    # Ensure n_samples_per_class does not exceed the actual number of samples available
    n_samples_per_class = min(n_samples_per_class, total_binders, total_non_binders)

    # The fixed random_state keeps the sample reproducible across runs.
    binders = df[df['binds'] == 1].sample(frac=n_samples_per_class / total_binders, random_state=42)
    non_binders = df[df['binds'] == 0].sample(frac=n_samples_per_class / total_non_binders, random_state=42)

    return dd.concat([binders, non_binders])

limited_df = limit_data(dask_df)

# Debug: check the size of the limited data.
# A Dask DataFrame's `.shape` carries a lazy row count, so printing it shows no number;
# len() runs the filter/sample graph once and returns the actual row count.
print("Size of limited_df:", (len(limited_df), len(limited_df.columns)))

**Process Each Chunk**

In [None]:
def process_chunk(chunk, protein_categories=('BRD4', 'HSA', 'sEH')):
    """Turn one partition of raw rows into string-encoded feature vectors.

    Each feature vector is a 1024-bit Morgan fingerprint (radius 2) of
    ``molecule_smiles`` followed by a one-hot encoding of ``protein_name``.
    The one-hot columns follow the fixed order of ``protein_categories``, so every
    partition yields vectors of the same length with the same column meaning.
    Fitting an encoder per partition would instead make the columns depend on
    which proteins happen to occur in that partition.

    Args:
        chunk: pandas DataFrame with ``id``, ``molecule_smiles``,
            ``protein_name`` and ``binds`` columns. It is not modified.
        protein_categories: Ordered protein names that define the one-hot columns.
            NOTE(review): the default assumes the targets are BRD4, HSA and sEH —
            confirm against the ``protein_name`` values in the data. Names outside
            this list encode as all zeros.

    Returns:
        pandas DataFrame with ``id``, ``features`` (comma-joined values) and
        ``binds`` columns, one row per input row.
    """
    n_bits = 1024

    def _ecfp_bits(smiles):
        # RDKit returns None for SMILES it cannot parse; use an all-zero fingerprint then.
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return [0] * n_bits
        return list(AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=n_bits))

    # Keep intermediates local instead of adding columns to the caller's partition.
    ecfps = [_ecfp_bits(smiles) for smiles in chunk['molecule_smiles']]

    # Fixed-order one-hot encoding of protein_name.
    column_of = {name: position for position, name in enumerate(protein_categories)}
    protein_onehot = np.zeros((len(chunk), len(column_of)), dtype=float)
    for row, name in enumerate(chunk['protein_name']):
        column = column_of.get(name)
        if column is not None:
            protein_onehot[row, column] = 1.0

    # Concatenate fingerprint bits and one-hot values, then serialize each vector
    # as one comma-separated string.
    combined_features_str = [
        ','.join(map(str, ecfp + onehot))
        for ecfp, onehot in zip(ecfps, protein_onehot.tolist())
    ]

    return pd.DataFrame({'id': chunk['id'], 'features': combined_features_str, 'binds': chunk['binds']})

# Output schema handed to Dask as `meta` for the mapped partitions.
processed_meta = {'id': 'int64', 'features': 'object', 'binds': 'int64'}

# Schedule process_chunk over every partition of the sampled data.
processed_df = limited_df.map_partitions(process_chunk, meta=processed_meta)

# A random UUID in the name keeps this run's output separate from earlier runs.
unique_file_name = f'/kaggle/working/processed_dataset_{uuid.uuid4()}.parquet'

# Write the processed partitions to disk as snappy-compressed Parquet, without the index.
processed_df.to_parquet(
    unique_file_name,
    engine='pyarrow',
    compression='snappy',
    write_index=False,
)

print(f"Data processing complete. The processed file is saved as '{unique_file_name}'.")


**Load Processed Data**

In [None]:
# NOTE(review): placeholder cell — it only prints a fixed string and loads no data,
# despite sitting under the "Load Processed Data" heading. Confirm whether it can go.
print("something")

**Load Processed Data and Parse Feature Strings**

In [None]:
# Read back the dataset written by the processing step.
processed_df = pd.read_parquet(unique_file_name)


def _decode_features(encoded):
    """Split a comma-separated feature string into a list of floats."""
    return [float(token) for token in encoded.split(',')]


# Replace each encoded string with its decoded list of floats.
processed_df['features'] = processed_df['features'].map(_decode_features)

# Debug: confirm processed_df is not empty.
print("Size of processed_df:", processed_df.shape)
print(processed_df.head())


In [None]:
# Prepare data for model training.
# Length of the longest feature vector; shorter vectors are padded up to it afterwards.
max_length = max(len(vector) for vector in processed_df['features'])

def pad_features(features, max_length):
    """Return a new list: ``features`` right-padded with 0.0 up to ``max_length`` entries.

    A list that already has ``max_length`` or more entries is returned as an
    unchanged copy (nothing is truncated).
    """
    shortfall = max_length - len(features)
    # range() of a non-positive shortfall is empty, so no padding is added then.
    zeros = [0.0 for _ in range(shortfall)]
    return features + zeros

# Pad every vector to the common length so they stack into a rectangular array.
processed_df['features'] = processed_df['features'].apply(lambda x: pad_features(x, max_length))

# Build the matrix as float32: scikit-learn's forests convert their input to float32
# internally, so this halves the array's memory compared with the float64 default.
X = np.array(processed_df['features'].tolist(), dtype=np.float32)
y = processed_df['binds'].tolist()

# Debug: Check the shapes of X and y
print("Shape of X:", X.shape)
print("Shape of y:", len(y))


**Prepare Data for Model Training**

**Split Data into Training and Testing Sets**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Debug: report the size of each piece of the split.
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", len(y_train))
print("Shape of y_test:", len(y_test))

**Train the Random Forest Model**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Create and train the random forest model.
# n_jobs=-1 builds the trees on all available CPU cores; it only controls
# parallelism, while random_state still fixes the randomness of the forest.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)

# Debug: signal that fitting has finished.
print("Model training completed.")


**Make Predictions and Evaluate the Model**

In [None]:
from sklearn.metrics import average_precision_score, classification_report, confusion_matrix

# Score the held-out rows: column 1 of predict_proba is the positive-class probability.
y_pred_proba = rf_model.predict_proba(X_test)[:, 1]
# Hard class labels, used by the report and the confusion matrix below.
y_pred = rf_model.predict(X_test)

# Average precision of the positive-class scores against the true labels.
map_score = average_precision_score(y_test, y_pred_proba)
print(f"Mean Average Precision (mAP): {map_score:.2f}")

# Per-class precision / recall / F1 summary.
report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)

# Counts of true vs. predicted labels.
matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(matrix)


**Save the Trained Model (Optional)**

In [None]:
import joblib

# Save the trained model to a file.
# The path is relative, so it resolves against the notebook's current working directory.
model_path = 'random_forest_model.pkl'
# Serialize the fitted forest to model_path.
joblib.dump(rf_model, model_path)
print(f"Model saved to {model_path}")


**Split Data into Training and Testing Sets**

In [None]:
from sklearn.model_selection import train_test_split

# NOTE(review): this cell repeats an earlier split cell with the same arguments and
# seed, so it reproduces the same partition.
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Debug: report the size of each piece of the split.
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", len(y_train))
print("Shape of y_test:", len(y_test))


**Train the Random Forest Model**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# NOTE(review): this cell repeats an earlier training cell with the same settings,
# so it refits the forest from scratch.
# n_jobs=-1 builds the trees on all available CPU cores; it only controls
# parallelism, while random_state still fixes the randomness of the forest.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)

# Debug: signal that fitting has finished.
print("Model training completed.")


**Make Predictions and Evaluate the Model**

In [None]:
from sklearn.metrics import average_precision_score, classification_report, confusion_matrix

# NOTE(review): this cell repeats an earlier evaluation cell.
# Column 1 of predict_proba is the positive-class probability.
y_pred_proba = rf_model.predict_proba(X_test)[:, 1]
# Hard class labels, used by the report and the confusion matrix below.
y_pred = rf_model.predict(X_test)

# Average precision of the positive-class scores against the true labels.
map_score = average_precision_score(y_test, y_pred_proba)
print(f"Mean Average Precision (mAP): {map_score:.2f}")

# Per-class precision / recall / F1 summary.
report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)

# Counts of true vs. predicted labels.
matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(matrix)


**Save the Trained Model (Optional)**

In [None]:
import joblib

# Save the trained model to a file.
# NOTE(review): an earlier cell already writes the same path; this overwrites that file.
# The path is relative, so it resolves against the notebook's current working directory.
model_path = 'random_forest_model.pkl'
# Serialize the fitted forest to model_path.
joblib.dump(rf_model, model_path)
print(f"Model saved to {model_path}")


**Starting with Test Data**

**Load the Test Data**

In [None]:
import dask.dataframe as dd

# Define the file path for test data
test_path = '/kaggle/input/leash-BELKA/test.parquet'

# Load the Parquet file lazily with Dask, reading only the columns the
# test-processing step uses.
dask_test_df = dd.read_parquet(test_path, columns=['id', 'molecule_smiles', 'protein_name'])

# `.shape` on a Dask DataFrame carries a lazy row count, so printing it shows no
# number; len() computes the actual row count.
print(f"Shape of test data: {(len(dask_test_df), len(dask_test_df.columns))}")


**Process Each Chunk**

In [None]:
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import numpy as np

# Define a function to process each chunk
def process_test_chunk(chunk, protein_categories=('BRD4', 'HSA', 'sEH')):
    """Turn one partition of test rows into string-encoded feature vectors.

    Each feature vector is a 1024-bit Morgan fingerprint (radius 2) of
    ``molecule_smiles`` followed by a one-hot encoding of ``protein_name``.
    The one-hot columns follow the fixed order of ``protein_categories``, so every
    partition yields vectors of the same length with the same column meaning.
    An encoder fitted on each test partition would instead produce columns that
    depend on the proteins present in that partition.

    Args:
        chunk: pandas DataFrame with ``id``, ``molecule_smiles`` and
            ``protein_name`` columns. It is not modified.
        protein_categories: Ordered protein names that define the one-hot columns.
            NOTE(review): the default assumes the targets are BRD4, HSA and sEH —
            confirm against the ``protein_name`` values in the data, and keep it
            identical to the order used when building the training features.
            Names outside this list encode as all zeros.

    Returns:
        pandas DataFrame with ``id`` and ``features`` (comma-joined values)
        columns, one row per input row.
    """
    n_bits = 1024

    def _ecfp_bits(smiles):
        # RDKit returns None for SMILES it cannot parse; use an all-zero fingerprint then.
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return [0] * n_bits
        return list(AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=n_bits))

    # Keep intermediates local instead of adding columns to the caller's partition.
    ecfps = [_ecfp_bits(smiles) for smiles in chunk['molecule_smiles']]

    # Fixed-order one-hot encoding of protein_name.
    column_of = {name: position for position, name in enumerate(protein_categories)}
    protein_onehot = np.zeros((len(chunk), len(column_of)), dtype=float)
    for row, name in enumerate(chunk['protein_name']):
        column = column_of.get(name)
        if column is not None:
            protein_onehot[row, column] = 1.0

    # Concatenate fingerprint bits and one-hot values, then serialize each vector
    # as one comma-separated string.
    combined_features_str = [
        ','.join(map(str, ecfp + onehot))
        for ecfp, onehot in zip(ecfps, protein_onehot.tolist())
    ]

    return pd.DataFrame({'id': chunk['id'], 'features': combined_features_str})

# Output schema handed to Dask as `meta` for the mapped test partitions.
test_meta = {'id': 'int64', 'features': 'object'}

# Schedule process_test_chunk over every partition of the test data.
processed_test_df = dask_test_df.map_partitions(process_test_chunk, meta=test_meta)

# Write the processed partitions to disk as snappy-compressed Parquet, without the index.
processed_test_df.to_parquet(
    'processed_test_dataset.parquet',
    engine='pyarrow',
    compression='snappy',
    write_index=False,
)

print("Test data processing complete. The processed test file is saved as 'processed_test_dataset.parquet'.")


**Load and Verify Processed Data**

In [None]:
# Load the processed test data back into pandas.
# The relative path is the one the processing cell wrote to.
processed_test_df = pd.read_parquet('processed_test_dataset.parquet')

# Display the shape of the processed test data
print(f"Shape of processed test data: {processed_test_df.shape}")

# Display the first few rows of the processed test data
# (features are still comma-separated strings at this point).
print(processed_test_df.head())


**Prepare Features for Model Prediction**

In [None]:
def _decode_test_features(encoded):
    """Split a comma-separated feature string into a list of floats."""
    return [float(token) for token in encoded.split(',')]


# Replace each encoded string with its decoded list of floats.
processed_test_df['features'] = processed_test_df['features'].map(_decode_test_features)

# Debug: confirm processed_test_df is not empty.
print("Size of processed_test_df:", processed_test_df.shape)
print(processed_test_df.head())

# Length of the longest feature vector; shorter vectors are padded up to it afterwards.
max_length = max(len(vector) for vector in processed_test_df['features'])

def pad_features(features, max_length):
    """Right-pad ``features`` with 0.0 so the result has ``max_length`` entries.

    Always returns a new list. Input that is already ``max_length`` long or
    longer is copied unchanged, never truncated.
    """
    missing = max_length - len(features)
    # A non-positive `missing` gives an empty range, i.e. no padding.
    padding = [0.0 for _ in range(missing)]
    return features + padding

# Pad every vector to the common length so they stack into a rectangular array.
processed_test_df['features'] = processed_test_df['features'].apply(lambda x: pad_features(x, max_length))

# Convert the processed DataFrame to a NumPy array for prediction.
# float32 is what scikit-learn's forests convert their input to internally, so this
# halves the array's memory compared with the float64 default.
X_test = np.array(processed_test_df['features'].tolist(), dtype=np.float32)
test_ids = processed_test_df['id'].values

# Display the shape of the features
print(f"Shape of X_test: {X_test.shape}")
print(f"Shape of test_ids: {test_ids.shape}")


**Load the Trained Model and Make Predictions**

In [None]:
import joblib

# Reload the forest from the pickle file in the Kaggle working directory.
# NOTE(review): joblib.load unpickles arbitrary objects — only load files this notebook wrote.
rf_model = joblib.load('/kaggle/working/random_forest_model.pkl')

# predict_proba returns one column per class; keep column 1 (the positive class).
class_probabilities = rf_model.predict_proba(X_test)
y_test_pred_proba = class_probabilities[:, 1]

# Pair each test id with its predicted probability.
# NOTE(review): confirm the column name and file name against sample_submission.csv.
submission_df = pd.DataFrame({'id': test_ids, 'prediction': y_test_pred_proba})

# Write the predictions as CSV without the DataFrame index.
submission_df.to_csv('/kaggle/working/test_predictions.csv', index=False)

print("Test predictions saved to 'test_predictions.csv'.")
