In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [22]:
# Import data
df = pd.read_csv('/Users/lennardreihs/EPFL/Master/MA2/AI4chem/ai4chem/docs/data/cycpeptdb_clean.csv', header = 0) 
filtered_df = df[df['Permeability'] != -10]

# Select the desired columns
columns = ['TPSA', #topological polar surface area
           'MolWt', #molecular weight
           'NumHAcceptors', #number of hydrogen bond acceptors
           'NumHDonors', #number of hydrogen bond donors
           'NumHAcceptors'] + [col for col in filtered_df.columns if col.startswith('fr_')] 

# Create the feature matrix and target vector
X = filtered_df[columns].values
X = np.hstack((X, np.ones((X.shape[0], 1))))  # Add a column of ones for the bias term
y= filtered_df['Permeability'].values



# Split the data into train, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Define models
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'Random Forest Regressor': RandomForestRegressor(random_state=42),
    'Gradient Boosting Regressor': GradientBoostingRegressor(random_state=42)
}

# Initialize a dictionary to store results
results = {}

# Train and evaluate each model
for name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict on validation set
    y_val_pred = model.predict(X_val)
    
    # Calculate metrics
    mse = mean_squared_error(y_val, y_val_pred)
    mae = mean_absolute_error(y_val, y_val_pred)
    r2 = r2_score(y_val, y_val_pred)
    
    # Store the results
    results[name] = {"MSE": mse, "MAE": mae, "R2": r2}

# Print the results
for name, metrics in results.items():
    print(f"{name}:")
    print(f"  MSE: {metrics['MSE']}")
    print(f"  MAE: {metrics['MAE']}")
    print(f"  R2: {metrics['R2']}")
    print()
    
# Optionally, you can also evaluate the best model on the test set
# Here, we assume Gradient Boosting Regressor is the best model based on validation performance
best_model = models["Random Forest Regressor"]
y_test_pred = best_model.predict(X_test)
test_mse = mean_squared_error(y_test, y_test_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)

print("Test set evaluation (Gradient Boosting Regressor):")
print(f"  MSE: {test_mse}")
print(f"  MAE: {test_mae}")
print(f"  R2: {test_r2}")


Linear Regression:
  MSE: 0.4386944823603446
  MAE: 0.5015397236820028
  R2: 0.23783757007260453

Ridge Regression:
  MSE: 0.4391635922024334
  MAE: 0.5029112788421397
  R2: 0.23702256575519076

Lasso Regression:
  MSE: 0.5578388522980439
  MAE: 0.5747915821761139
  R2: 0.03084303023862489

Random Forest Regressor:
  MSE: 0.24331053675875677
  MAE: 0.3576866603522185
  R2: 0.5772863407689948

Gradient Boosting Regressor:
  MSE: 0.3030205942562033
  MAE: 0.4155815820428958
  R2: 0.4735495390921107

Test set evaluation (Gradient Boosting Regressor):
  MSE: 0.24088912669981027
  MAE: 0.3511303436513706
  R2: 0.5889692577436387


In [23]:
len(df)-len(filtered_df)

242