In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

data = pd.read_excel('newdata/new_database.xlsx')

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58996 entries, 0 to 58995
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Pressure (bar)       58996 non-null  float64
 1   Adsorption (mmol/g)  58996 non-null  float64
 2   zeolite_type         58996 non-null  object 
 3   adsorbate            58996 non-null  object 
 4   temperature          58996 non-null  int64  
dtypes: float64(2), int64(1), object(2)
memory usage: 2.3+ MB


In [10]:
data = data[['temperature','Pressure (bar)', 'adsorbate', 'Adsorption (mmol/g)']]
data.head()

Unnamed: 0,temperature,Pressure (bar),adsorbate,Adsorption (mmol/g)
0,273,0.01702,co2,0.102298
1,273,0.018005,co2,0.10898
2,273,0.01899,co2,0.115643
3,273,0.019974,co2,0.122289
4,273,0.020959,co2,0.128916


In [14]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from scipy.stats import boxcox
# 准备数据
type_ohe = OneHotEncoder().fit_transform(data[['adsorbate']]).toarray()
temp,temp_lambda = boxcox(data['temperature'] + 1e-20)
pressure, pressure_lambda = boxcox(data['Pressure (bar)'] + 1e-20)
adsorption, adsorption_lambda = boxcox(data['Adsorption (mmol/g)'] + 1e-20)

# 标准化
scaler = StandardScaler()
data_normed = scaler.fit_transform(np.c_[temp, pressure, adsorption])

X = np.c_[data_normed[:,:2], type_ohe]
y = data_normed[:,2]

pd.DataFrame(np.c_[X, y.reshape(-1,1)], columns=['temp', 'pressure'] + [f'type{i+1}' for i in range(13)] + ['absorption']).head(10)

Unnamed: 0,temp,pressure,type1,type2,type3,type4,type5,type6,type7,type8,type9,type10,type11,type12,type13,absorption
0,-0.613894,-1.384812,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.724225
1,-0.613894,-1.3657,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.684211
2,-0.613894,-1.347509,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.646487
3,-0.613894,-1.330151,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.610801
4,-0.613894,-1.313548,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.576937
5,-0.613894,-1.297636,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.544716
6,-0.613894,-1.282355,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.513981
7,-0.613894,-1.267655,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.484599
8,-0.613894,-1.253492,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.456453
9,-0.613894,-1.239826,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.42944


In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(47196, 15) (11800, 15) (47196,) (11800,)


In [38]:
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor

lr = LinearRegression().fit(X_train, y_train)
pred1 = lr.predict(X_test)

lasso = MLPRegressor(hidden_layer_sizes=(32, 16, 8, 4), max_iter=1000).fit(X_train, y_train)
pred2 = lasso.predict(X_test)

rf = RandomForestRegressor().fit(X_train, y_train)
pred3 = rf.predict(X_test)

In [39]:
from sklearn.metrics import mean_squared_error

print('Linear Regression MSE:', mean_squared_error(y_test, pred1))
print('MLP MSE:', mean_squared_error(y_test, pred2))
print('Random Forest MSE:', mean_squared_error(y_test, pred3))

Linear Regression MSE: 0.2532388070563932
MLP MSE: 0.16778887067217663
Random Forest MSE: 0.32041794884476144
