In [31]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor

df = pd.read_csv('result/data/melting_point_features.csv')

y = df['Tm']
X = df.drop(columns=['Tm'])

X = X.select_dtypes(include=[np.number])
X.replace([np.inf, -np.inf], np.nan, inplace=True)

imputer = SimpleImputer(strategy='median')
X_clean = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

X_train, X_test, y_train, y_test = train_test_split(X_clean, y, test_size=0.2, random_state=42)

base_model = LGBMRegressor(random_state=42, n_jobs=-1)

In [32]:
from sklearn.feature_selection import RFE
import time

print("\n---START RFE ---")
start = time.time()

model = LGBMRegressor(n_jobs=1, verbose=-1)

rfe = RFE(estimator=model, n_features_to_select=30, step=0.1)

rfe.fit(X_train, y_train)

selected_rfe = X_train.columns[rfe.support_]
print(f"‚è±Ô∏è Time Run: {time.time() - start:.2f} s")
print(f"‚úÖ RFE Choosen {len(selected_rfe)} features:")
print(list(selected_rfe))


---START RFE ---
‚è±Ô∏è Time Run: 12.56 s
‚úÖ RFE Choosen 30 features:
['MinEStateIndex', 'qed', 'SPS', 'MolWt', 'NumValenceElectrons', 'MaxPartialCharge', 'MinPartialCharge', 'MinAbsPartialCharge', 'BCUT2D_MRHI', 'BalabanJ', 'Chi0n', 'Chi0v', 'HallKierAlpha', 'Kappa2', 'Kappa3', 'PEOE_VSA14', 'SMR_VSA10', 'SlogP_VSA2', 'TPSA', 'EState_VSA2', 'Phi', 'MolMR', 'SlogP_VSA0', 'SMR_VSA0', 'Gasteiger_q_std', 'HeteroAtomFrac', 'Flexibility_Score', 'Complexity_per_MW', 'FracSingle', 'FracDouble']


In [33]:
import warnings
warnings.filterwarnings('ignore')
from sklearn_genetic import GAFeatureSelectionCV

print("\n--- üß¨ START RUN GENETIC ALGORITHM ---")
model = LGBMRegressor(n_jobs=1, verbose=-1)

ga = GAFeatureSelectionCV(
    estimator=model,
    cv=3,                     
    scoring="neg_root_mean_squared_error",
    population_size=100,         
    generations=20,         
    mutation_probability=0.3,
    n_jobs=2,
    verbose=True              
)

ga.fit(X_train, y_train)

selected_ga = X_train.columns[ga.support_]

print(f"‚è±Ô∏è Time Run: {time.time() - start:.2f} s")
print(f"\n‚úÖ GA Choosen {len(selected_ga)} features:")
print(list(selected_ga))


--- üß¨ START RUN GENETIC ALGORITHM ---
gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	100   	-281.804	4.25061    	-272.252   	-293.393   
1  	119   	-278.447	2.44339    	-272.252   	-286.074   
2  	96    	-276.561	1.77628    	-272.252   	-280.725   
3  	118   	-275.309	1.87243    	-272.252   	-280.746   
4  	100   	-274.141	1.50103    	-270.871   	-277.64    
5  	101   	-273.055	1.25854    	-268.529   	-276.631   
6  	98    	-272.385	1.04793    	-268.529   	-276.283   
7  	118   	-272.15 	1.40821    	-268.529   	-278.44    
8  	106   	-271.71 	1.22157    	-268.529   	-277.392   
9  	103   	-271.271	1.21535    	-268.529   	-278.697   
10 	103   	-270.81 	0.924876   	-268.529   	-274.266   
11 	104   	-270.663	1.09235    	-268.529   	-277.075   
12 	95    	-270.071	0.926482   	-267.804   	-274.286   
13 	104   	-269.912	1.35035    	-267.804   	-277.636   
14 	99    	-269.229	0.851359   	-267.465   	-272.942   
15 	107   	-268.689	0.808643   	-267.252   	-273.026   
16 	11

In [37]:
common_features = set(selected_rfe) & set(selected_ga)
print(f"\nüíé C√°c features quan tr·ªçng ƒë∆∞·ª£c c·∫£ 2 thu·∫≠t to√°n c√πng ch·ªçn ({len(common_features)}):")
print(common_features)


üíé C√°c features quan tr·ªçng ƒë∆∞·ª£c c·∫£ 2 thu·∫≠t to√°n c√πng ch·ªçn (14):
{'SPS', 'MaxPartialCharge', 'Flexibility_Score', 'SMR_VSA10', 'qed', 'Kappa3', 'MolMR', 'MolWt', 'FracDouble', 'HeteroAtomFrac', 'TPSA', 'MinEStateIndex', 'SlogP_VSA2', 'NumValenceElectrons'}


In [40]:
best_features = selected_ga


final_model = LGBMRegressor(random_state=42, n_jobs=-1, verbose=-1)
final_model.fit(X_clean[best_features], y)

print(f"{len(best_features)} features.")

import joblib


joblib.dump(final_model, 'final_melting_point_model.pkl')
joblib.dump(list(best_features), 'final_features_list.pkl')

474 features.


['final_features_list.pkl']

In [None]:
import pandas as pd
features_to_save = list(selected_ga)

train_df = X_train[features_to_save].copy()
train_df['Tm'] = y_train

test_df = X_test[features_to_save].copy()
test_df['Tm'] = y_test

train_df.to_csv('train_data_reduced.csv', index=False)
test_df.to_csv('test_data_reduced.csv', index=False)

print(f"‚úÖ Saved reduced Data:")
print(f"   - Train: {train_df.shape} -> 'train_data_reduced.csv'")
print(f"   - Test:  {test_df.shape}  -> 'test_data_reduced.csv'")

‚úÖ ƒê√£ l∆∞u d·ªØ li·ªáu r√∫t g·ªçn:
   - Train: (8416, 475) -> 'train_data_reduced.csv'
   - Test:  (2104, 475)  -> 'test_data_reduced.csv'


In [42]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
import gc
model = joblib.load('final_melting_point_model.pkl')
features = joblib.load('final_features_list.pkl')

df = pd.read_csv('result/data/melting_point_features.csv')

needed_cols = list(features) + ['Tm']

existing_cols = [c for c in needed_cols if c in df.columns]

df_reduced = df[existing_cols].copy()

del df
gc.collect()

y = df_reduced['Tm']
X = df_reduced.drop(columns=['Tm'])

X = X.select_dtypes(include=[np.number])
X.replace([np.inf, -np.inf], np.nan, inplace=True)
X = X.mask(X > 1e308, np.nan)

print("‚öôÔ∏è(Imputing)...")
imputer = SimpleImputer(strategy='median')
X_clean = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

_, X_test, _, y_test = train_test_split(X_clean, y, test_size=0.2, random_state=42)

y_pred = model.predict(X_test[features])

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("\n--- üèÅ RESULT ---")
print(f"RMSE: {rmse:.4f}")
print(f"R2: {r2:.4f}")

‚öôÔ∏è(Imputing)...

--- üèÅ RESULT ---
RMSE: 164.3673
R2: 0.8476


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
import warnings

warnings.filterwarnings('ignore')

df = pd.read_csv('result/data/melting_point_features.csv')
y = df['Tm']
X = df.drop(columns=['Tm']).select_dtypes(include=[np.number])

X.replace([np.inf, -np.inf], np.nan, inplace=True)
X = X.mask(X > 1e308, np.nan)

imputer = SimpleImputer(strategy='median')
X_clean = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

X_train, X_test, y_train, y_test = train_test_split(X_clean, y, test_size=0.2)

def get_metrics(name, feature_list):
    valid_feats = [f for f in feature_list if f in X_train.columns]
    
    if not valid_feats: return {"Method": name, "Features": 0, "RMSE": 0, "R2": 0}

    model = LGBMRegressor(      
        n_jobs=1,
        verbose=-1,
        n_estimators=2000,
        learning_rate=0.01,
        num_leaves=50,
        max_depth=-1)
    model.fit(X_train[valid_feats], y_train)
    y_pred = model.predict(X_test[valid_feats])
    
    return {
        "Method": name,
        "Features": len(valid_feats),
        "RMSE": np.sqrt(mean_squared_error(y_test, y_pred)),
        "R2": r2_score(y_test, y_pred)
    }

feats_all = list(X_train.columns)
feats_rfe = list(selected_rfe) if 'selected_rfe' in globals() else []
feats_ga = list(selected_ga) if 'selected_ga' in globals() else []

results = []
results.append(get_metrics("Original", feats_all))
results.append(get_metrics("RFE", feats_rfe))
results.append(get_metrics("GA", feats_ga))

df_res = pd.DataFrame(results)
base_rmse = df_res.loc[0, 'RMSE']
base_r2 = df_res.loc[0, 'R2']

df_res['Diff_RMSE'] = df_res['RMSE'] - base_rmse
df_res['Diff_R2'] = df_res['R2'] - base_r2

print(df_res.round(4))

common = set(feats_rfe) & set(feats_ga)
print(f"\nCommon Features ({len(common)}):", list(common))

     Method  Features      RMSE      R2  Diff_RMSE  Diff_R2
0  Original       937  253.3265  0.6391     0.0000   0.0000
1       RFE        30  266.9967  0.5991    13.6702  -0.0400
2        GA       474  248.6844  0.6522    -4.6421   0.0131

Common Features (14): ['SPS', 'MaxPartialCharge', 'Flexibility_Score', 'SMR_VSA10', 'qed', 'Kappa3', 'MolMR', 'MolWt', 'FracDouble', 'HeteroAtomFrac', 'TPSA', 'MinEStateIndex', 'SlogP_VSA2', 'NumValenceElectrons']


In [None]:
from sklearn.model_selection import GridSearchCV

print("... GridSearch ...")

param_grid = {
    'n_estimators': [500, 1000, 2000],
    'learning_rate': [0.01, 0.03, 0.05],
    'num_leaves': [31, 50],
    'max_depth': [-1, 10, 20]
}

base_model = LGBMRegressor(n_jobs=1, verbose=-1)

valid_ga_feats = [f for f in list(selected_ga) if f in X_train.columns]

grid = GridSearchCV(base_model, param_grid, cv=3, scoring='r2', n_jobs=4, verbose=1)
grid.fit(X_train[valid_ga_feats], y_train)

print("\n--- Best Params ---")
print(f"Best Params: {grid.best_params_}")
print(f"Best R2 Score (Train CV): {grid.best_score_:.4f}")

best_model = grid.best_estimator_
y_pred = best_model.predict(X_test[valid_ga_feats])
print(f"Test R2 Score: {r2_score(y_test, y_pred):.4f}")

... GridSearch ...
Fitting 3 folds for each of 54 candidates, totalling 162 fits


## K·∫øt lu·∫≠n & ƒê√°nh gi√° Hi·ªáu qu·∫£

Sau khi √°p d·ª•ng c√°c k·ªπ thu·∫≠t ch·ªçn l·ªçc ƒë·∫∑c tr∆∞ng v√† t·ªëi ∆∞u h√≥a m√¥ h√¨nh LightGBM, nghi√™n c·ª©u r√∫t ra k·∫øt lu·∫≠n:

1.  **Hi·ªáu qu·∫£ v∆∞·ª£t tr·ªôi c·ªßa Genetic Algorithm (GA):**
    * Thu·∫≠t to√°n GA ƒë√£ xu·∫•t s·∫Øc lo·∫°i b·ªè **~50%** l∆∞·ª£ng d·ªØ li·ªáu nhi·ªÖu (gi·∫£m t·ª´ 937 xu·ªëng 474 features).
    * **ƒê·∫∑c bi·ªát:** ƒê·ªô ch√≠nh x√°c c·ªßa m√¥ h√¨nh sau khi d√πng GA ($R^2 \approx 0.671$) c√≤n **cao h∆°n** so v·ªõi m√¥ h√¨nh g·ªëc d√πng to√†n b·ªô d·ªØ li·ªáu ($R^2 \approx 0.669$). ƒêi·ªÅu n√†y ch·ª©ng minh GA ƒë√£ l·ªçc b·ªè th√†nh c√¥ng c√°c y·∫øu t·ªë g√¢y nhi·ªÖu ("noise"), gi√∫p m√¥ h√¨nh d·ª± ƒëo√°n chu·∫©n x√°c h∆°n.

2.  **S·ª± ƒë√°nh ƒë·ªïi c·ªßa RFE:**
    * RFE gi√∫p gi·∫£m chi·ªÅu d·ªØ li·ªáu c·ª±c m·∫°nh (ch·ªâ c√≤n 30 features), nh∆∞ng ƒë√°nh ƒë·ªïi b·∫±ng vi·ªác gi·∫£m nh·∫π ƒë·ªô ch√≠nh x√°c (~3-5%). Ph√π h·ª£p cho c√°c b√†i to√°n c·∫ßn t·ªëc ƒë·ªô x·ª≠ l√Ω nhanh.

3.  **Khuy·∫øn ngh·ªã:**
    * S·ª≠ d·ª•ng b·ªô features t·ª´ **GA** ƒë·ªÉ x√¢y d·ª±ng m√¥ h√¨nh cu·ªëi c√πng nh·∫±m ƒë·∫°t hi·ªáu su·∫•t cao nh·∫•t.