In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

data = pd.read_excel('database.xlsx')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1209 entries, 0 to 1208
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Temp                      1209 non-null   int64  
 1   Pressure(Bar)             1209 non-null   float64
 2   total_adsorption(mmol/g)  1209 non-null   float64
 3   PLD                       1076 non-null   float64
 4   LCD                       1076 non-null   float64
 5   Density(g/cm^3)           1076 non-null   float64
 6   VSA(m^2/cm^3)             1076 non-null   float64
 7   GSA(m^2/g)                1076 non-null   float64
 8   Vp(cm^3/g)                1076 non-null   float64
 9   Void_fraction             1076 non-null   float64
 10  E                         1076 non-null   float64
 11  Type                      1209 non-null   int64  
 12  zeolite_type              1209 non-null   object 
dtypes: float64(10), int64(2), object(1)
memory usage: 122.9+ KB


In [6]:
data = data[['Temp','Pressure(Bar)', 'Type', 'total_adsorption(mmol/g)']]
data.head()

Unnamed: 0,Temp,Pressure(Bar),Type,total_adsorption(mmol/g)
0,293,0.05829,2,0.1634
1,293,0.08167,2,0.2447
2,293,0.10265,2,0.3032
3,293,0.11245,2,0.3392
4,293,0.20375,2,0.6075


In [57]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from scipy.stats import boxcox
# 准备数据
type_ohe = OneHotEncoder().fit_transform(data[['Type']]).toarray()
temp,temp_lambda = boxcox(data['Temp'] + 1e-20)
pressure, pressure_lambda = boxcox(data['Pressure(Bar)'] + 1e-20)
adsorption, adsorption_lambda = boxcox(data['total_adsorption(mmol/g)'] + 1e-20)

# 标准化
scaler = StandardScaler()
data_normed = scaler.fit_transform(np.c_[temp, pressure, adsorption])

X = np.c_[data_normed[:,:2], type_ohe]
y = data_normed[:,2]

pd.DataFrame(np.c_[X, y.reshape(-1,1)], columns=['temp', 'pressure'] + [f'type{i+1}' for i in range(9)] + ['absorption']).head(10)

Unnamed: 0,temp,pressure,type1,type2,type3,type4,type5,type6,type7,type8,type9,absorption
0,-0.451292,-0.253084,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.037681
1,-0.451292,-0.136523,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.813687
2,-0.451292,-0.054559,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.684157
3,-0.451292,-0.021189,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.613216
4,-0.451292,0.206257,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.206906
5,-0.451292,0.369745,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.073073
6,-0.451292,0.492539,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.312663
7,-0.451292,0.588382,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.485382
8,-0.451292,0.671963,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.639817
9,-0.451292,0.744173,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.768884


In [58]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(967, 11) (242, 11) (967,) (242,)


In [59]:
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor

lr = LinearRegression().fit(X_train, y_train)
pred1 = lr.predict(X_test)

lasso = MLPRegressor(hidden_layer_sizes=(8, 8, 4), max_iter=1000).fit(X_train, y_train)
pred2 = lasso.predict(X_test)

rf = RandomForestRegressor().fit(X_train, y_train)
pred3 = rf.predict(X_test)

In [60]:
from sklearn.metrics import mean_squared_error

print('Linear Regression MSE:', mean_squared_error(y_test, pred1))
print('MLP MSE:', mean_squared_error(y_test, pred2))
print('Random Forest MSE:', mean_squared_error(y_test, pred3))

Linear Regression MSE: 0.3204842726019444
MLP MSE: 0.2652469250329643
Random Forest MSE: 0.32230697462633484
