In [1]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential # type: ignore
from tensorflow.keras.layers import LSTM, Dropout, Dense # type: ignore
from tensorflow.keras.optimizers import Adam # type: ignore
from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [2]:
# Load the preprocessed corpus and the readability feature table.
processed_text = pd.read_csv("processed_text2.csv")
readiblity_features = pd.read_csv("readiblity_features_1.csv")

# Only the readability features are used in this run. The other feature
# families, and the column-wise concatenation that would combine them, are
# kept below in disabled form:
# semantic_features = pd.read_csv("semantic_features.csv")
# syntax_features = pd.read_csv("syntax_features.csv")
# structure_features = pd.read_csv("structure_features.csv")
# sentiment_features = pd.read_csv("sentiment_features.csv")
# features1 = pd.concat((semantic_features,readiblity_features,syntax_features,structure_features,sentiment_features),axis=1)
features1 = readiblity_features

# Preview the feature table.
features1

Unnamed: 0,SMOG,Flesch_Reading_Ease,Flesch_Kincaid_Grade,Gunning_FOG,Entropy
0,13.4,45.46,11.2,12.72,6.584422
1,6.1,85.69,4.0,5.06,6.062130
2,6.8,86.60,3.7,5.51,6.368750
3,11.0,65.01,7.8,9.52,6.661220
4,9.7,73.78,6.5,8.54,6.883176
...,...,...,...,...,...
14849,8.8,81.83,5.5,8.18,6.652581
14850,8.0,77.87,7.0,8.70,6.045370
14851,8.5,80.21,6.1,8.09,6.053863
14852,9.1,73.88,6.5,7.83,6.543425


In [3]:
# Attach the target column from processed_text to the readability features.
# Build a new frame with .assign() rather than writing into features1, so the
# loaded feature table is left untouched and this cell can be re-run safely.
features1_detect = features1.assign(label=processed_text['label'])

# The label column is aligned on the row index; rows without a matching index
# in processed_text would silently become NaN, so fail loudly if that happens.
assert features1_detect['label'].notna().all(), "some rows have no label after index alignment"

# Show the shape explicitly (a bare `.shape` expression in the middle of a
# cell is evaluated and discarded), then display the frame itself.
print(features1_detect.shape)
features1_detect

Unnamed: 0,SMOG,Flesch_Reading_Ease,Flesch_Kincaid_Grade,Gunning_FOG,Entropy,label
0,13.4,45.46,11.2,12.72,6.584422,0
1,6.1,85.69,4.0,5.06,6.062130,0
2,6.8,86.60,3.7,5.51,6.368750,0
3,11.0,65.01,7.8,9.52,6.661220,0
4,9.7,73.78,6.5,8.54,6.883176,0
...,...,...,...,...,...,...
14849,8.8,81.83,5.5,8.18,6.652581,1
14850,8.0,77.87,7.0,8.70,6.045370,1
14851,8.5,80.21,6.1,8.09,6.053863,1
14852,9.1,73.88,6.5,7.83,6.543425,1


In [4]:
# Split the data into features (x) and target (y).
# 'Unnamed: 0' is the row index that was saved into the CSV, not a readability
# feature. The displayed rows suggest the frame is sorted by label (0s first,
# 1s last), in which case this column would leak the target into the model,
# so it is dropped together with the label. errors='ignore' keeps the cell
# working when the CSV was written without that index column.
x = features1_detect.drop(columns=['label', 'Unnamed: 0'], errors='ignore')
y = features1_detect['label']

In [5]:
# Min-max scaling of the raw feature matrix `x` (MinMaxScaler defaults).
# NOTE(review): the scaler is fitted on the full dataset before the
# cross-validated search below, so every fold shares the same min/max
# statistics -- confirm this is acceptable for the reported scores.
scaler = MinMaxScaler().fit(x)
X_normalized = scaler.transform(x)

In [6]:
def create_model(units, dropout_rate, learning_rate):
    """Build and compile a two-layer stacked LSTM binary classifier.

    The input shape is taken from the module-level ``X_normalized``: one
    step per feature column, with a single value per step.

    Parameters
    ----------
    units : int
        Number of units in each of the two LSTM layers.
    dropout_rate : float
        Dropout rate applied between the two LSTM layers.
    learning_rate : float
        Learning rate passed to the Adam optimizer.

    Returns
    -------
    Sequential
        Model compiled with binary cross-entropy loss and an accuracy metric.
    """
    n_steps = X_normalized.shape[1]

    network = Sequential()
    # The first LSTM returns the full sequence so the second LSTM can consume it.
    network.add(LSTM(units, input_shape=(n_steps, 1), return_sequences=True))
    network.add(Dropout(dropout_rate))
    network.add(LSTM(units))
    # Single sigmoid unit for the binary target.
    network.add(Dense(1, activation='sigmoid'))

    network.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return network

In [7]:
def specificity_score(y_true, y_pred):
    """Return the specificity (true-negative rate) for binary 0/1 labels.

    Specificity is TN / (TN + FP), with label 0 treated as the negative class.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.

    Returns
    -------
    float
        The specificity, or 0.0 when ``y_true`` contains no negative samples.
    """
    # Pin the label order so the matrix is always 2x2. Without `labels`, a fold
    # in which only one class appears yields a 1x1 matrix and the four-way
    # unpacking below raises.
    tn, fp, _, _ = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    negatives = tn + fp
    if negatives == 0:
        # No negative samples: the ratio is undefined; report 0.0 instead of
        # letting a NaN propagate into the cross-validation mean.
        return 0.0
    return tn / negatives

# Scorer wrapper so the metric can be used by GridSearchCV (higher is better).
specificity_scorer = make_scorer(specificity_score, greater_is_better=True)

In [8]:
# Metrics evaluated for every grid-search candidate. The scikit-learn metrics
# are wrapped with make_scorer; the custom specificity scorer is added last.
scoring = {
    metric_name: make_scorer(metric_fn)
    for metric_name, metric_fn in (
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )
}
scoring['specificity'] = specificity_scorer

: 

In [9]:
# Wrap the Keras model builder so scikit-learn can cross-validate it.
# `model=` is the current SciKeras argument (`build_fn=` is its deprecated
# alias). A fixed random_state makes the stochastic training repeatable
# between runs.
model = KerasClassifier(model=create_model, random_state=42, verbose=1)

# Grid-search space. Keys prefixed with `model__` are routed to create_model;
# `epochs` and `batch_size` are consumed by the wrapper's fit.
param_grid = {
    'model__units': [64, 128],
    'model__dropout_rate': [0.2],
    'model__learning_rate': [0.0003, 0.0005, 0.001],
    'epochs': [50, 75],
    'batch_size': [32, 64]
}

# Multi-metric search; the final estimator is refit with the parameters that
# give the best mean F1.
# NOTE(review): n_jobs=-1 trains candidates in parallel worker processes --
# confirm the machine has enough memory/GPU capacity for concurrent models.
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, refit='f1', cv=10, n_jobs=-1, verbose=3)

# Run the search; assumes X_normalized and y are already defined and preprocessed.
grid_result = grid.fit(X_normalized, y)



Fitting 10 folds for each of 24 candidates, totalling 240 fits


In [None]:
# Display the estimator that the search refit using the best-F1 parameters.
grid_result.best_estimator_

In [None]:


# Print every cross-validated metric of the selected candidate. All values are
# read at best_index_ (the candidate chosen by refit), so they describe one
# parameter set rather than the per-metric optimum.
best_idx = grid_result.best_index_
cv_scores = grid_result.cv_results_
for template, score_key in (
    ("最佳 Accuracy: %f 使用 %s", 'mean_test_accuracy'),
    ("最佳 Precision: %f 使用 %s", 'mean_test_precision'),
    ("最佳 Recall: %f 使用 %s", 'mean_test_recall'),
    ("最佳 Specificity: %f 使用 %s", 'mean_test_specificity'),
    ("最佳 F1: %f 使用 %s", 'mean_test_f1'),
):
    print(template % (cv_scores[score_key][best_idx], grid_result.best_params_))


In [22]:
# Collect the cross-validated metrics of the selected candidate and write them
# to a one-row CSV.
best_idx = grid_result.best_index_
cv_scores = grid_result.cv_results_
result = {
    'Accuracy': cv_scores['mean_test_accuracy'][best_idx],
    'Precision': cv_scores['mean_test_precision'][best_idx],
    'Recall': cv_scores['mean_test_recall'][best_idx],
    'Specificity': cv_scores['mean_test_specificity'][best_idx],
    'F1': cv_scores['mean_test_f1'][best_idx],
    # Read the parameters from the search itself. A hand-copied dict goes stale
    # as soon as the grid or the data changes and would then no longer match
    # the scores above. The single-element list keeps the CSV cell format.
    'Best_Params': [dict(grid_result.best_params_)],
}
df_result = pd.DataFrame([result])
df_result.to_csv('readibility_LSTM_1.csv', index=False)