In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import xgboost as xgb

# Read the training table from disk (path is relative to the notebook's working directory)
train_csv_path = './submissions/train_linear.csv'
train_data = pd.read_csv(train_csv_path)

# Featurize SMILES strings as Morgan fingerprint bit vectors (not molecular graphs, despite the function name)
def smiles_to_graph(smiles_list, radius=2, n_bits=1024):
    """Encode SMILES strings as Morgan fingerprint bit vectors.

    Despite the name, no graph structure is returned: each molecule is
    parsed with RDKit and turned into a fixed-length Morgan fingerprint.

    Args:
        smiles_list: Iterable of SMILES strings (e.g. a pandas Series).
        radius: Morgan radius forwarded to RDKit. Defaults to 2.
        n_bits: Length of each fingerprint bit vector. Defaults to 1024.

    Returns:
        np.ndarray built from the per-molecule bit vectors, in input order.

    Raises:
        ValueError: If RDKit cannot parse one of the SMILES strings.
    """
    fingerprints = []
    for position, smiles in enumerate(smiles_list):
        mol = Chem.MolFromSmiles(smiles)
        # MolFromSmiles signals a parse failure by returning None; fail here with
        # the offending entry instead of an opaque error inside the fingerprint call.
        if mol is None:
            raise ValueError(
                f"Could not parse SMILES at position {position}: {smiles!r}"
            )
        fingerprints.append(
            AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
        )
    return np.array(fingerprints)

train_graph_representations = smiles_to_graph(train_data["SMILES"])
feature_columns = train_data.columns.difference(["id", "SMILES", "MLM", "HLM"])

# One seed shared by the split and both regressors so reruns are repeatable
RANDOM_STATE = 42

# Split row positions BEFORE fitting any preprocessing, so the validation rows
# never influence the scaler statistics (avoids train/validation leakage).
# With the same seed and row count this is expected to pick the same rows as
# splitting the feature/label arrays directly.
train_idx, val_idx = train_test_split(
    np.arange(len(train_data)), test_size=0.1, random_state=RANDOM_STATE
)

# Normalize other features from the dataset: fit on training rows only,
# then apply the fitted scaler to every row.
scaler = StandardScaler().fit(train_data[feature_columns].iloc[train_idx])
normalized_features = scaler.transform(train_data[feature_columns])

# Combine molecular representations and normalized features
combined_train_features = np.hstack([train_graph_representations, normalized_features])
target_values = train_data[['MLM', 'HLM']].values

# Materialize the training and validation sets from the row positions
train_features, val_features = combined_train_features[train_idx], combined_train_features[val_idx]
train_labels, val_labels = target_values[train_idx], target_values[val_idx]

# Separate features and targets for training data
X_train = train_features
y_train_mlm = train_labels[:, 0]
y_train_hlm = train_labels[:, 1]

# Separate features and targets for validation data
X_val = val_features
y_val_mlm = val_labels[:, 0]
y_val_hlm = val_labels[:, 1]


def train_and_validate(target_name, X_fit, y_fit, X_holdout, y_holdout):
    """Fit a default-hyperparameter XGBoost regressor and report validation RMSE.

    Args:
        target_name: Label used in the printed message (e.g. "MLM").
        X_fit, y_fit: Training features and targets.
        X_holdout, y_holdout: Validation features and targets.

    Returns:
        Tuple of (fitted model, validation predictions, validation RMSE).
    """
    model = xgb.XGBRegressor(objective='reg:squarederror', random_state=RANDOM_STATE)
    model.fit(X_fit, y_fit)
    holdout_predictions = model.predict(X_holdout)
    rmse = np.sqrt(mean_squared_error(y_holdout, holdout_predictions))
    print(f"Validation RMSE for {target_name}: {rmse:.4f}")
    return model, holdout_predictions, rmse


# Train and evaluate one independent model per target
model_mlm, val_predictions_mlm, rmse_mlm = train_and_validate(
    "MLM", X_train, y_train_mlm, X_val, y_val_mlm
)
model_hlm, val_predictions_hlm, rmse_hlm = train_and_validate(
    "HLM", X_train, y_train_hlm, X_val, y_val_hlm
)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Validation RMSE for MLM: 33.2297
Validation RMSE for HLM: 35.7234


In [None]:
# Read the test table and rebuild the training-time feature layout:
# fingerprint bits first, then the scaled tabular columns.
test_data = pd.read_csv('./submissions/test_linear.csv')
test_graph_representations = smiles_to_graph(test_data["SMILES"])
normalized_test_features = scaler.transform(test_data[feature_columns])
combined_test_features = np.hstack(
    [test_graph_representations, normalized_test_features]
)

# Score the test rows with each per-target model fitted in the training cell
test_predictions_mlm = model_mlm.predict(combined_test_features)
test_predictions_hlm = model_hlm.predict(combined_test_features)

# Assemble the id / MLM / HLM table and write it without the index column
submission_columns = {
    "id": test_data["id"],
    "MLM": test_predictions_mlm,
    "HLM": test_predictions_hlm,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv("submission_xgboost.csv", index=False)
print("Submission file created!")