In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [None]:
# Import data
df = pd.read_csv('/Users/lennardreihs/EPFL/Master/MA2/AI4chem/ai4chem/docs/data/cycpeptdb_clean.csv', header = 0) 
filtered_df = df[df['Permeability'] != -10]

# Select the desired columns
columns = ['TPSA', #topological polar surface area
           'MolWt', #molecular weight
           'NumHAcceptors', #number of hydrogen bond acceptors
           'NumHDonors', #number of hydrogen bond donors
           'NumHAcceptors'] + [col for col in filtered_df.columns if col.startswith('fr_')] 

# Create the feature matrix and target vector
X = filtered_df[columns].values
X = np.hstack((X, np.ones((X.shape[0], 1))))  # Add a column of ones for the bias term
y= filtered_df['Permeability'].values



# Split the data into train, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Define models
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'Random Forest Regressor': RandomForestRegressor(random_state=42),
    'Gradient Boosting Regressor': GradientBoostingRegressor(random_state=42)
}

# Initialize a dictionary to store results
results = {}

# Train and evaluate each model
for name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict on validation set
    y_val_pred = model.predict(X_val)
    
    # Calculate metrics
    mse = mean_squared_error(y_val, y_val_pred)
    mae = mean_absolute_error(y_val, y_val_pred)
    r2 = r2_score(y_val, y_val_pred)
    
    # Store the results
    results[name] = {"MSE": mse, "MAE": mae, "R2": r2}

# Print the results
for name, metrics in results.items():
    print(f"{name}:")
    print(f"  MSE: {metrics['MSE']}")
    print(f"  MAE: {metrics['MAE']}")
    print(f"  R2: {metrics['R2']}")
    print()
    
# Optionally, you can also evaluate the best model on the test set
# Here, we assume Gradient Boosting Regressor is the best model based on validation performance
best_model = models["Random Forest Regressor"]
y_test_pred = best_model.predict(X_test)
test_mse = mean_squared_error(y_test, y_test_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)

print("Test set evaluation (Gradient Boosting Regressor):")
print(f"  MSE: {test_mse}")
print(f"  MAE: {test_mae}")
print(f"  R2: {test_r2}")


In [None]:
len(df)-len(filtered_df)

In [None]:
# Import data
df = pd.read_csv('/home/colleen/projects/ai4chem/docs/data/cycpeptdb_clean_fps.csv', header = 0) 
filtered_df = df[df['Permeability'] != -10]
filtered_df = filtered_df.drop('SMILES', axis=1)

# Select the desired columns
columns = ['TPSA', #topological polar surface area
           'MolWt', #molecular weight
           'NumHAcceptors', #number of hydrogen bond acceptors
           'NumHDonors', #number of hydrogen bond donors
           'NumHAcceptors'] + [col for col in filtered_df.columns if col.startswith('fr_')] 

# Create the feature matrix and target vector
X = filtered_df[columns].values
X = np.hstack((X, np.ones((X.shape[0], 1))))  # Add a column of ones for the bias term
y= filtered_df['Permeability'].values



# Split the data into train, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Define models
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'Random Forest Regressor': RandomForestRegressor(random_state=42),
    'Gradient Boosting Regressor': GradientBoostingRegressor(random_state=42)
}

# Initialize a dictionary to store results
results = {}

# Train and evaluate each model
for name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict on validation set
    y_val_pred = model.predict(X_val)
    
    # Calculate metrics
    mse = mean_squared_error(y_val, y_val_pred)
    mae = mean_absolute_error(y_val, y_val_pred)
    r2 = r2_score(y_val, y_val_pred)
    
    # Store the results
    results[name] = {"MSE": mse, "MAE": mae, "R2": r2}

# Print the results
for name, metrics in results.items():
    print(f"{name}:")
    print(f"  MSE: {metrics['MSE']}")
    print(f"  MAE: {metrics['MAE']}")
    print(f"  R2: {metrics['R2']}")
    print()
    
# Optionally, you can also evaluate the best model on the test set
# Here, we assume Gradient Boosting Regressor is the best model based on validation performance
best_model = models["Random Forest Regressor"]
y_test_pred = best_model.predict(X_test)
test_mse = mean_squared_error(y_test, y_test_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)

print("Test set evaluation (Gradient Boosting Regressor):")
print(f"  MSE: {test_mse}")
print(f"  MAE: {test_mae}")
print(f"  R2: {test_r2}")


In [6]:
# Import data
df = pd.read_csv('/home/colleen/projects/ai4chem/docs/data/cycpeptdb_clean_fps.csv', header = 0) 
filtered_df = df[df['Permeability'] != -10]
filtered_df = filtered_df.drop('SMILES', axis=1)

# Select the desired columns
columns = ['TPSA', #topological polar surface area
           'MolWt', #molecular weight
           'NumHAcceptors', #number of hydrogen bond acceptors
           'NumHDonors', #number of hydrogen bond donors
           'NumHAcceptors'] + [col for col in filtered_df.columns if col.startswith('fr_')] 

# Create the feature matrix and target vector
X = filtered_df[columns].values
X = np.hstack((X, np.ones((X.shape[0], 1))))  # Add a column of ones for the bias term
y= filtered_df['Permeability'].values



# Split the data into train, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Define models
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'Random Forest Regressor': RandomForestRegressor(random_state=42),
    'Gradient Boosting Regressor': GradientBoostingRegressor(random_state=42)
}

# Initialize a dictionary to store results
results = {}

# Train and evaluate each model
for name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict on validation set
    y_val_pred = model.predict(X_val)
    
    # Calculate metrics
    mse = mean_squared_error(y_val, y_val_pred)
    mae = mean_absolute_error(y_val, y_val_pred)
    r2 = r2_score(y_val, y_val_pred)
    
    # Store the results
    results[name] = {"MSE": mse, "MAE": mae, "R2": r2}

# Print the results
for name, metrics in results.items():
    print(f"{name}:")
    print(f"  MSE: {metrics['MSE']}")
    print(f"  MAE: {metrics['MAE']}")
    print(f"  R2: {metrics['R2']}")
    print()
    
# Optionally, you can also evaluate the best model on the test set
# Here, we assume Gradient Boosting Regressor is the best model based on validation performance
best_model = models["Random Forest Regressor"]
y_test_pred = best_model.predict(X_test)
test_mse = mean_squared_error(y_test, y_test_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)

print("Test set evaluation (Gradient Boosting Regressor):")
print(f"  MSE: {test_mse}")
print(f"  MAE: {test_mae}")
print(f"  R2: {test_r2}")

Linear Regression:
  MSE: 0.43869448236034647
  MAE: 0.5015397236820011
  R2: 0.2378375700726012

Ridge Regression:
  MSE: 0.4391635922024211
  MAE: 0.5029112788421326
  R2: 0.23702256575521208

Lasso Regression:
  MSE: 0.5578388522980439
  MAE: 0.5747915821761139
  R2: 0.03084303023862489

Random Forest Regressor:
  MSE: 0.24323047027635178
  MAE: 0.3574357551955454
  R2: 0.5774254436463713

Gradient Boosting Regressor:
  MSE: 0.3030205942562033
  MAE: 0.4155815820428958
  R2: 0.4735495390921107

Test set evaluation (Gradient Boosting Regressor):
  MSE: 0.24055753516565626
  MAE: 0.35113056806348003
  R2: 0.5895350546157371


In [9]:
dup = df['fingerprint'].duplicated()
nb_dup = sum(dup)
print(nb_dup)

4521


In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Import data
df = pd.read_csv('/home/colleen/projects/ai4chem/docs/data/cycpeptdb_clean_fps.csv', header=0)
filtered_df = df[df['Permeability'] != -10]
filtered_df = filtered_df.drop('SMILES', axis=1)

# Select the desired columns
columns = ['TPSA', 'MolWt', 'NumHAcceptors', 'NumHDonors'] + [col for col in filtered_df.columns if col.startswith('fr_')]

# Create the feature matrix and target vector
X = filtered_df[columns].values
X = np.hstack((X, np.ones((X.shape[0], 1))))  # Add a column of ones for the bias term
y = filtered_df['Permeability'].values

# Split the data into train, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Define models
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'Random Forest Regressor': RandomForestRegressor(random_state=42),
    'Gradient Boosting Regressor': GradientBoostingRegressor(random_state=42)
}

# Initialize a dictionary to store results
results = {}
feature_weights = {col: [] for col in columns + ['Bias']}  # Initialize a dictionary for feature weights

# Train and evaluate each model
for name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict on validation set
    y_val_pred = model.predict(X_val)
    
    # Calculate metrics
    mse = mean_squared_error(y_val, y_val_pred)
    mae = mean_absolute_error(y_val, y_val_pred)
    r2 = r2_score(y_val, y_val_pred)
    
    # Store the results
    results[name] = {"MSE": mse, "MAE": mae, "R2": r2}
    
    # Extract feature weights or importances
    if hasattr(model, 'coef_'):
        weights = model.coef_
    elif hasattr(model, 'feature_importances_'):
        weights = model.feature_importances_
    else:
        weights = np.zeros(len(columns) + 1)  # Default to zero if the model doesn't support feature importances

    # Append the feature weights to the dictionary
    for i, col in enumerate(columns + ['Bias']):
        feature_weights[col].append(weights[i])

# Create a DataFrame for feature weights
feature_weights_df = pd.DataFrame(feature_weights, index=models.keys()).T

# Sort the DataFrame by the absolute sum of weights for each feature across all models
feature_weights_df['SumAbsWeights'] = feature_weights_df.abs().sum(axis=1)
feature_weights_df = feature_weights_df.sort_values(by='SumAbsWeights', ascending=False)
feature_weights_df = feature_weights_df.drop(columns='SumAbsWeights')

# Print the feature weights DataFrame
print(feature_weights_df)

# Print the results
for name, metrics in results.items():
    print(f"{name}:")
    print(f"  MSE: {metrics['MSE']}")
    print(f"  MAE: {metrics['MAE']}")
    print(f"  R2: {metrics['R2']}")
    print()

# Optionally, you can also evaluate the best model on the test set
# Here, we assume Gradient Boosting Regressor is the best model based on validation performance
best_model = models["Random Forest Regressor"]
y_test_pred = best_model.predict(X_test)
test_mse = mean_squared_error(y_test, y_test_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)

print("Test set evaluation (Random Forest Regressor):")
print(f"  MSE: {test_mse}")
print(f"  MAE: {test_mae}")
print(f"  R2: {test_r2}")


              Linear Regression  Ridge Regression  Lasso Regression  \
fr_guanido            -4.329978         -0.282532              -0.0   
fr_priamide           -2.012878         -1.729224              -0.0   
fr_thiophene           2.506233          0.658698               0.0   
fr_HOCCN              -1.703406         -1.334704              -0.0   
fr_amide              -2.431845         -0.012600               0.0   
...                         ...               ...               ...   
fr_isocyan             0.000000          0.000000               0.0   
fr_hdrzone             0.000000          0.000000               0.0   
fr_hdrzine             0.000000          0.000000               0.0   
fr_furan               0.000000          0.000000               0.0   
Bias                   0.000000          0.000000               0.0   

              Random Forest Regressor  Gradient Boosting Regressor  
fr_guanido                   0.000030                     0.000060  
fr_priami

In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Import data
df = pd.read_csv('/home/colleen/projects/ai4chem/docs/data/cycpeptdb_clean_fps.csv', header=0)
filtered_df = df[df['Permeability'] != -10]
filtered_df = filtered_df.drop('SMILES', axis=1)

# Select the desired columns
columns = ['TPSA', 'MolWt', 'NumHAcceptors', 'NumHDonors', 'NumHAcceptors'] + [col for col in filtered_df.columns if col.startswith('fr_')]

# Create the feature matrix and target vector
X = filtered_df[columns].values
X = np.hstack((X, np.ones((X.shape[0], 1))))  # Add a column of ones for the bias term
y = filtered_df['Permeability'].values

# Split the data into train, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Define models
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'Random Forest Regressor': RandomForestRegressor(random_state=42),
    'Gradient Boosting Regressor': GradientBoostingRegressor(random_state=42)
}

# Initialize a dictionary to store results
results = {}
feature_importances = {}

# Train and evaluate each model
for name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict on validation set
    y_val_pred = model.predict(X_val)
    
    # Calculate metrics
    mse = mean_squared_error(y_val, y_val_pred)
    mae = mean_absolute_error(y_val, y_val_pred)
    r2 = r2_score(y_val, y_val_pred)
    
    # Store the results
    results[name] = {"MSE": mse, "MAE": mae, "R2": r2}
    
    # Store feature importances or coefficients
    if hasattr(model, 'coef_'):
        feature_importances[name] = model.coef_
    elif hasattr(model, 'feature_importances_'):
        feature_importances[name] = model.feature_importances_

# Print the results
for name, metrics in results.items():
    print(f"{name}:")
    print(f"  MSE: {metrics['MSE']}")
    print(f"  MAE: {metrics['MAE']}")
    print(f"  R2: {metrics['R2']}")
    print()

# Print feature importances
for name, importances in feature_importances.items():
    print(f"{name} feature importances:")
    for feature, importance in zip(columns, importances):
        print(f"  {feature}: {importance}")
    print()

# Optionally, you can also evaluate the best model on the test set
# Here, we assume Gradient Boosting Regressor is the best model based on validation performance
best_model = models["Random Forest Regressor"]
y_test_pred = best_model.predict(X_test)
test_mse = mean_squared_error(y_test, y_test_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)

print("Test set evaluation (Random Forest Regressor):")
print(f"  MSE: {test_mse}")
print(f"  MAE: {test_mae}")
print(f"  R2: {test_r2}")



Linear Regression:
  MSE: 0.43869448236034647
  MAE: 0.5015397236820011
  R2: 0.2378375700726012

Ridge Regression:
  MSE: 0.4391635922024211
  MAE: 0.5029112788421326
  R2: 0.23702256575521208

Lasso Regression:
  MSE: 0.5578388522980439
  MAE: 0.5747915821761139
  R2: 0.03084303023862489

Random Forest Regressor:
  MSE: 0.24323047027635178
  MAE: 0.3574357551955454
  R2: 0.5774254436463713

Gradient Boosting Regressor:
  MSE: 0.3030205942562033
  MAE: 0.4155815820428958
  R2: 0.4735495390921107

Linear Regression feature importances:
  TPSA: 0.07581458441104796
  MolWt: 0.00047569759001329925
  NumHAcceptors: -0.8403284551516148
  NumHDonors: -0.019689233180438193
  NumHAcceptors: -0.8403284551514915
  fr_Al_COO: -0.7370548163827387
  fr_Al_OH: -0.18747274465387453
  fr_Al_OH_noTert: 0.6641055228708839
  fr_ArN: 2.886579864025407e-14
  fr_Ar_COO: 7.549516567451064e-15
  fr_Ar_N: -0.3246988954564895
  fr_Ar_NH: -0.7098686561242649
  fr_Ar_OH: -0.18030092619836058
  fr_COO: -0.73705481