# Boiling Point of Compounds
Based on:
 1. S.-Y. Kim, I. Jeon and S.-J. Kang, "Integrating Data Science and Machine Learning to Chemistry Education: Predicting Classification and Boiling Point of Compounds", J. Chem. Educ., 101, 1771–1776 (2024). [doi:10.1021/acs.jchemed.3c01040](https://doi.org/10.1021/acs.jchemed.3c01040)


In [1]:
from pylab import *
import pandas as pd
import sklearn

## Functions

In [None]:
def plot_fun(bps, bp_predictions, t=None):
    """Draw a parity plot of predicted versus actual boiling points.

    Parameters
    ----------
    bps : actual boiling points, used for the x-axis.
    bp_predictions : predicted boiling points, in the same order as ``bps``.
    t : optional title for the figure.
    """
    # Predictions as markers, plus the y = x line a perfect model would follow.
    plt.plot(bps, bp_predictions, 'o')
    plt.plot(bps, bps)
    # Equal scaling on both axes so the reference line sits at 45 degrees.
    plt.axis('square')
    plt.xlabel('Boiling Point')
    plt.ylabel('Predicted Boiling Point')
    plt.title(t)
    plt.show()

## Import data

In [4]:
# Download the boiling-point dataset (CSV hosted on GitHub) into a DataFrame.
data_url = 'https://raw.githubusercontent.com/woldr001/AIChE_Workshop_MSU/refs/heads/main/boiling_point_data.csv'
compounds = pd.read_csv(data_url)
# Preview the first rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, presumably a row index
# that was saved into the CSV — confirm whether it should be dropped.
compounds.head()

Unnamed: 0,cmpdname,BoilingPoint,mw,mf,polararea,heavycnt,hbondacc,isosmiles,C number,N number,O number,Side chain number,Double bond number,Triple bond number,Classify1
0,Ethylene Glycol,470.65,62.07,C2H6O2,40.5,4,2,C(CO)O,2,0,2,1,0,0,Alcohol
1,"2,3-Butanediol",455.15,90.12,C4H10O2,40.5,6,2,CC(C(C)O)O,4,0,2,2,0,0,Alcohol
2,1-Butanol,390.75,74.12,C4H10O,20.2,5,1,CCCCO,4,0,1,0,0,0,Alcohol
3,Methane,111.65,16.043,CH4,0.0,1,0,C,1,0,0,0,0,0,Hydrocarbon
4,Octane,398.77,114.23,C8H18,0.0,8,0,CCCCCCCC,8,0,0,0,0,0,Hydrocarbon


### Import clean data

In [None]:
# Optionally replace the downloaded table with the local Excel file
# 'ed3c01040_si_002.xlsx'. Nothing in this notebook fetches that file, so load
# it only when it is actually present; otherwise keep the DataFrame already read
# from `data_url` so that "Restart & Run All" does not stop on FileNotFoundError.
from pathlib import Path

clean_data_path = Path('ed3c01040_si_002.xlsx')
if clean_data_path.exists():
    compounds = pd.read_excel(clean_data_path)
compounds.head()

### b) Delete unnecessary compounds

### C. DATA Classification

In [None]:
# Number of compounds in each 'Classify1' category.
compounds["Classify1"].value_counts()

## Training Set and Test Set

In [None]:
def split_data_with_id_hash(data, test_ratio, id_column):
    """Split ``data`` into train and test parts by hashing an identifier column.

    A row goes to the test part when the CRC32 checksum of its identifier
    (converted to ``np.int64``) is below ``test_ratio * 2**32``. The assignment
    of a row therefore depends only on its identifier.

    Parameters
    ----------
    data : DataFrame to split.
    test_ratio : threshold fraction applied to the 32-bit hash range.
    id_column : name of the column holding the identifiers.

    Returns
    -------
    (train, test) : the rows outside and inside the test part, respectively.
    """
    from zlib import crc32

    # Hash values below this cut-off are assigned to the test part.
    cutoff = test_ratio * 2**32

    def _belongs_to_test(identifier):
        return crc32(np.int64(identifier)) < cutoff

    test_mask = data[id_column].apply(_belongs_to_test)
    train_part = data.loc[~test_mask]
    test_part = data.loc[test_mask]
    return train_part, test_part

In [None]:
# Add a row-number column ("index") to hash on. The indexed frame gets its own
# name so that re-running this cell does not keep stacking extra index columns
# onto `compounds` (a repeated `compounds = compounds.reset_index()` adds
# 'level_0' on the second run and raises on the third).
compounds_indexed = compounds.reset_index()
train_set, test_set = split_data_with_id_hash(compounds_indexed, 0.2, "index")

In [None]:
# Row counts of the training and test partitions (split made with test_ratio=0.2).
len(train_set), len(test_set)

## Data Exploration

### Histograms

In [None]:
# Global matplotlib text sizes for the figures that follow.
plt.rcParams.update({
    'font.size': 14,
    'axes.labelsize': 14,
    'axes.titlesize': 14,
    'legend.fontsize': 14,
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
})

# One histogram per numeric column of the training set.
train_set.hist(figsize=(12, 8), bins=50)
plt.show()

### Correlations

In [None]:
# Pairwise correlations between the boiling point and the numeric descriptors,
# computed on the training set only.
cols = [
    'BoilingPoint', 'mw', 'polararea', 'heavycnt',
    'hbondacc', 'C number', 'N number', 'O number',
    'Side chain number', 'Double bond number', 'Triple bond number',
]
corr_matrix = train_set.loc[:, cols].corr()
corr_matrix

### Scatter Plot

In [None]:
from pandas.plotting import scatter_matrix

# Columns shown in the scatter matrix (a subset of the correlation columns:
# 'hbondacc' and 'Triple bond number' are left out here).
cols = [
    'BoilingPoint', 'mw', 'polararea', 'heavycnt',
    'C number', 'N number', 'O number',
    'Side chain number', 'Double bond number',
]
scatter_matrix(train_set[cols], figsize=(12, 8))
plt.show()

## Prepare for ML

### define labels

In [None]:
# Features: every training column except the target.
# Note that `compounds` now refers to the training features, not the full table.
compounds = train_set.drop(columns='BoilingPoint')
# Labels: an independent copy of the target column.
bps = train_set['BoilingPoint'].copy()

### Encode category

In [None]:
# Select the categorical column; the double brackets keep it a one-column
# DataFrame, which is the 2-D shape the encoder below is given.
compounds_category = compounds[["Classify1"]]
compounds_category.head()

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode 'Classify1'; sparse_output=False asks for a dense array
# rather than a sparse matrix.
cat_encoder = OneHotEncoder(sparse_output=False)
compounds_cat_1hot = cat_encoder.fit_transform(compounds_category)
# Show the first ten encoded rows.
compounds_cat_1hot[:10]

In [None]:
# Category labels learned by the encoder during fit_transform above.
cat_encoder.categories_

### Feature Scaling

We'll skip manual feature scaling for now; the numeric pipeline created below applies `StandardScaler` for us.

In [None]:
# Display estimators and pipelines as diagrams when they are the cell output.
sklearn.set_config(display='diagram')

### Create Pipeline

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Numeric columns: fill missing values with the column median, then standardize.
num_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler(),
)

num_pipeline

In [None]:
from sklearn.compose import ColumnTransformer

# Columns routed to the numeric and the categorical pipelines, respectively.
num_attribs = [
    'mw', 'polararea', 'heavycnt',
    'hbondacc', 'C number', 'N number', 'O number',
    'Side chain number', 'Double bond number', 'Triple bond number',
]
cat_attribs = ["Classify1"]

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"),
)

preprocessing = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", cat_pipeline, cat_attribs),
])

preprocessing

In [None]:
# Fit the preprocessing transformer on the training features and transform them.
compounds_prepared = preprocessing.fit_transform(compounds)
# NOTE(review): this displays the whole transformed array; a slice would keep
# the output short.
compounds_prepared

In [None]:
# Names of the output columns produced by the fitted preprocessing step.
preprocessing.get_feature_names_out()

## Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Chain the preprocessing step with a linear regression and fit the whole
# pipeline on the training features and labels.
lin_reg = make_pipeline(preprocessing, LinearRegression())
lin_reg.fit(compounds, bps)

In [None]:
# Predict on the same data the model was fitted on (in-sample predictions)
# and show the first five values.
bp_predictions = lin_reg.predict(compounds)
bp_predictions[:5]

In [None]:
# Actual boiling points of the first five training rows, for comparison.
bps.iloc[:5]

In [None]:
from sklearn.metrics import root_mean_squared_error
# In-sample RMSE: computed on the training data the model was fitted on.
lin_rmse = root_mean_squared_error(bps, bp_predictions)
lin_rmse

In [None]:
# Parity plot of the in-sample linear-regression predictions.
plot_fun(bps, bp_predictions, 'Linear Regression')

### Cross-Validation

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation. The scorer reports the negated RMSE, so the sign
# is flipped back to obtain positive error values.
lin_rmses = -cross_val_score(
    lin_reg, compounds, bps,
    cv=10, scoring="neg_root_mean_squared_error",
)

In [None]:
# RMSE of each of the 10 cross-validation folds (linear regression).
lin_rmses

In [None]:
# Summary statistics of the per-fold RMSEs (linear regression).
pd.Series(lin_rmses).describe()

## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Preprocessing followed by a decision tree; random_state fixes the seed so
# the fitted tree is repeatable.
tree_reg = make_pipeline(preprocessing, DecisionTreeRegressor(random_state=42))
tree_reg.fit(compounds, bps)

In [None]:
# In-sample predictions and RMSE for the decision tree (same data as used for
# fitting, so this is not an estimate of generalization error).
bp_predictions = tree_reg.predict(compounds)
tree_rmse = root_mean_squared_error(bps, bp_predictions)
tree_rmse

In [None]:
# Parity plot of the in-sample decision-tree predictions.
plot_fun(bps, bp_predictions, 'Decision Tree')

### Cross-Validation

In [None]:
# 10-fold cross-validated RMSE for the decision tree (the scorer is negated,
# hence the leading minus sign).
tree_rmses = -cross_val_score(
    tree_reg, compounds, bps,
    cv=10, scoring="neg_root_mean_squared_error",
)

In [None]:
# RMSE of each of the 10 cross-validation folds (decision tree).
tree_rmses

In [None]:
# Summary statistics of the per-fold RMSEs (decision tree).
pd.Series(tree_rmses).describe()

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Preprocessing followed by a seeded random forest, fitted on the training data.
forest_reg = make_pipeline(
    preprocessing,
    RandomForestRegressor(random_state=42),
)
forest_reg.fit(compounds, bps)

# In-sample predictions, used for the parity plot further down.
bp_predictions = forest_reg.predict(compounds)

# 10-fold cross-validated RMSE (the scorer is negated, hence the minus sign).
forest_rmses = -cross_val_score(
    forest_reg, compounds, bps,
    cv=10, scoring="neg_root_mean_squared_error",
)

In [None]:
# Summary statistics of the per-fold RMSEs (random forest).
pd.Series(forest_rmses).describe()

In [None]:
# Parity plot of the in-sample random-forest predictions.
plot_fun(bps, bp_predictions, 'Random Forest')

### Feature Importance

In [None]:
# Importances are stored on the fitted forest, the pipeline step named
# 'randomforestregressor'.
feature_importances = (
    forest_reg.named_steps['randomforestregressor'].feature_importances_
)
# Matching column names come from the preprocessing step.
feature_names = preprocessing.get_feature_names_out()

In [None]:
# Horizontal bar chart of the random forest's feature importances, one bar per
# preprocessed feature. Labelled so the figure can be read on its own, and
# finished with show() so the cell does not print the bar-container repr.
barh(feature_names, feature_importances)
xlabel('Feature importance')
title('Random Forest Feature Importance')
show()

## Compare to Test Data

In [None]:
# Held-out labels and random-forest predictions for the test set.
# test_set is passed whole: the ColumnTransformer is configured only with the
# columns in num_attribs / cat_attribs, and no `remainder` option is set.
test_bps = test_set['BoilingPoint'].copy()
test_predictions = forest_reg.predict(test_set)

In [None]:
# Parity plot of the random-forest predictions on the held-out test set.
plot_fun(test_bps, test_predictions, 'Random Forest - Test Set')

In [None]:
# RMSE of the random forest on the held-out test set.
final_rmse = root_mean_squared_error(test_bps, test_predictions)
print(final_rmse)

## Neural Network
Referred to as a Multi-layer Perceptron (MLP) in Scikit-learn

In [None]:
from sklearn.neural_network import MLPRegressor

In [None]:
# Multi-layer perceptron trained with the L-BFGS solver. random_state is fixed
# (same seed as the decision tree and random forest) because the network's
# weights are initialised randomly; without it every run gives different
# predictions and cross-validation scores.
mlp_regressor = MLPRegressor(solver='lbfgs', max_iter=1000, random_state=42)

MLP_reg = make_pipeline(preprocessing, mlp_regressor)
MLP_reg.fit(compounds, bps)

# In-sample predictions, used for the parity plot below.
bp_predictions = MLP_reg.predict(compounds)

# 10-fold cross-validated RMSE (the scorer is negated, hence the minus sign).
MLP_rmses = -cross_val_score(MLP_reg, compounds, bps,
                            scoring="neg_root_mean_squared_error", cv=10)

In [None]:
# Summary statistics of the per-fold RMSEs (neural network).
pd.Series(MLP_rmses).describe()

In [None]:
# Parity plot of the in-sample neural-network predictions. Uses the shared
# plot_fun helper and titles the figure after the model actually plotted
# (the previous inline copy was labelled 'Random Forest').
plot_fun(bps, bp_predictions, 'Neural Network')