In [2]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import OrthogonalMatchingPursuit

# Load data
train_data = pd.read_csv('./submissions/train_linear.csv')

def smiles_to_graph(smiles_list):
    graph_representations = [AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(smiles), 2, nBits=1024) for smiles in smiles_list]
    return np.array(graph_representations)

train_graph_representations = smiles_to_graph(train_data["SMILES"])
feature_columns = train_data.columns.difference(["id", "SMILES", "MLM", "HLM"])
scaler = StandardScaler().fit(train_data[feature_columns])
normalized_features = scaler.transform(train_data[feature_columns])
combined_train_features = np.hstack([train_graph_representations, normalized_features])

train_features, val_features, train_labels, val_labels = train_test_split(combined_train_features, train_data[['MLM', 'HLM']].values, test_size=0.1, random_state=42)

# Train the OMP model
omp = OrthogonalMatchingPursuit()
omp.fit(train_features, train_labels)

# Predict on the validation set
val_predictions = omp.predict(val_features)

# Calculate RMSE
rmse_mlm = np.sqrt(mean_squared_error(val_labels[:, 0], val_predictions[:, 0]))
rmse_hlm = np.sqrt(mean_squared_error(val_labels[:, 1], val_predictions[:, 1]))

print(f"Validation RMSE for MLM: {rmse_mlm:.4f}")
print(f"Validation RMSE for HLM: {rmse_hlm:.4f}")

Validation RMSE for MLM: 33.5354
Validation RMSE for HLM: 33.6757


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [3]:
from sklearn.linear_model import OrthogonalMatchingPursuit
from sklearn.metrics import mean_squared_error

# Define a function to train and evaluate OMP for a given number of non-zero coefficients
def evaluate_omp_nnz(train_features, train_labels, val_features, val_labels, nnz):
    omp = OrthogonalMatchingPursuit(n_nonzero_coefs=nnz)
    omp.fit(train_features, train_labels)
    val_predictions = omp.predict(val_features)
    rmse_mlm = np.sqrt(mean_squared_error(val_labels[:, 0], val_predictions[:, 0]))
    rmse_hlm = np.sqrt(mean_squared_error(val_labels[:, 1], val_predictions[:, 1]))
    return rmse_mlm, rmse_hlm

# Loop over a range of non-zero coefficients
for nnz in range(1, train_features.shape[1] + 1, 5):  # Adjust the step size based on your data size
    rmse_mlm, rmse_hlm = evaluate_omp_nnz(train_features, train_labels, val_features, val_labels, nnz)
    print(f"Number of non-zero coefficients: {nnz}, RMSE for MLM: {rmse_mlm:.4f}, RMSE for HLM: {rmse_hlm:.4f}")


Number of non-zero coefficients: 1, RMSE for MLM: 33.8520, RMSE for HLM: 34.4637
Number of non-zero coefficients: 6, RMSE for MLM: 32.6328, RMSE for HLM: 33.7453
Number of non-zero coefficients: 11, RMSE for MLM: 32.4031, RMSE for HLM: 33.7305
Number of non-zero coefficients: 16, RMSE for MLM: 32.0951, RMSE for HLM: 33.4474
Number of non-zero coefficients: 21, RMSE for MLM: 32.1994, RMSE for HLM: 33.0680
Number of non-zero coefficients: 26, RMSE for MLM: 32.3406, RMSE for HLM: 33.1229
Number of non-zero coefficients: 31, RMSE for MLM: 32.5825, RMSE for HLM: 33.4574
Number of non-zero coefficients: 36, RMSE for MLM: 32.7257, RMSE for HLM: 33.5281
Number of non-zero coefficients: 41, RMSE for MLM: 32.6770, RMSE for HLM: 33.2527
Number of non-zero coefficients: 46, RMSE for MLM: 32.7644, RMSE for HLM: 33.1460
Number of non-zero coefficients: 51, RMSE for MLM: 32.5912, RMSE for HLM: 33.1758
Number of non-zero coefficients: 56, RMSE for MLM: 32.6597, RMSE for HLM: 33.2407
Number of non-zero

In [None]:
# If you're satisfied with the results, you can then predict on the test set similarly
test_data = pd.read_csv('./submissions/test_linear.csv')
test_graph_representations = smiles_to_graph(test_data["SMILES"])
normalized_test_features = scaler.transform(test_data[feature_columns])
combined_test_features = np.hstack([test_graph_representations, normalized_test_features])

test_predictions = omp.predict(combined_test_features)

submission = pd.DataFrame({
    "id": test_data["id"],
    "MLM": test_predictions[:, 0],
    "HLM": test_predictions[:, 1]
})
submission.to_csv("./submissions/submission_OrthogonalMatchingPursuit.csv", index=False)
print("Submission file created!")