In [1]:
import pandas as pd
import arff as liacarff

# 加载 ARFF 文件
with open('PROMISE_exp.arff', 'r') as f:
    data_dict = liacarff.load(f)

# 提取数据
data = data_dict['data']

# 转换为 DataFrame
df = pd.DataFrame(data, columns=[attr[0] for attr in data_dict['attributes']])
df['ProjectID'] = df['ProjectID'].astype(int)  # 将 ProjectID 列转换为整数类型

# 显示 DataFrame
print(df)

     ProjectID                                    RequirementText _class_
0            1  The system shall refresh the display every 60 ...      PE
1            1  The application shall match the color of the s...      LF
2            1  If projected  the data must be readable.  On a...      US
3            1  The product shall be available during normal b...       A
4            1  If projected  the data must be understandable....      US
..         ...                                                ...     ...
965         48  Registered User must be able to maintain his/h...       F
966         48  The entire website must be user-friendly and e...      US
967         48  The system shall support up to 10000 simultane...      PE
968         48  The website must provide highest degree of sec...      SE
969         49  The software application should be easily tran...      PO

[970 rows x 3 columns]


In [2]:
# DATA PREPROCESSING
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re
import string

# 下载NLTK的停用词和词性标注器数据
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# lower
df['RequirementText'] = df['RequirementText'].str.lower()

# Remove punctuation, leading and trailing spaces
df['RequirementText'] = df['RequirementText'].apply(lambda x: re.sub(f"[{re.escape(string.punctuation)}]", "", x.strip()))

# 分词
df['RequirementText'] = df['RequirementText'].apply(word_tokenize)

# stopwords
stop_words = set(stopwords.words('english'))
df['RequirementText'] = df['RequirementText'].apply(lambda x: [word for word in x if word not in stop_words])

# stemming
# stemmer = PorterStemmer()
# df['RequirementText'] = df['RequirementText'].apply(lambda x: [stemmer.stem(word) for word in x])

# lemmatization
lemmatizer = WordNetLemmatizer()
df['RequirementText'] = df['RequirementText'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

# 合并词语为字符串
df['RequirementText'] = df['RequirementText'].apply(lambda x: ' '.join(x))

# Replace multiple spaces with a single space
df['RequirementText'] = df['RequirementText'].apply(lambda x: re.sub(' +', ' ', x))

# 移除整个列中每个字符串首尾的全部空格
df['RequirementText'] = df['RequirementText'].str.strip()

# 查看处理后的数据集
print(df.head())

[nltk_data] Downloading package stopwords to /home/li/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/li/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/li/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


   ProjectID                                    RequirementText _class_
0          1       system shall refresh display every 60 second      PE
1          1  application shall match color schema set forth...      LF
2          1  projected data must readable 10x10 projection ...      US
3          1  product shall available normal business hour l...       A
4          1  projected data must understandable 10x10 proje...      US


In [3]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [4]:
from sklearn.model_selection import train_test_split
y = df['_class_']
X = df['RequirementText']

# 假设 X 是特征，y 是目标变量（类别标签）
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, stratify=y)

In [6]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
# 使用TfidfVectorizer进行转换
ngram_range = (1,2)
min_df = 0.01 # ignore terms that appear in less than 1% of the documents
max_df = 0.8 # ignore terms that appear in more than 80% of the documents

tfidf_vectorizer = TfidfVectorizer(encoding='utf-8',
                        ngram_range=ngram_range,
                        stop_words=None,
                        lowercase=False,
                        max_df=max_df,
                        min_df=min_df,
                        norm='l2',
                        sublinear_tf=True)

features_train = tfidf_vectorizer.fit_transform(X_train).toarray()
labels_train = y_train
features_test = tfidf_vectorizer.transform(X_test).toarray()
labels_test = y_test

In [13]:
import tensorflow as tf
import numpy as np
from sklearn.metrics import classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import LabelEncoder

feature_names = tfidf_vectorizer.get_feature_names_out()
input_dimension = len(feature_names)

label_encoder = LabelEncoder()
labels_train_encoded = label_encoder.fit_transform(labels_train)
labels_test_encoded = label_encoder.transform(labels_test)

labels_train_one_hot = to_categorical(labels_train_encoded)
labels_test_one_hot = to_categorical(labels_test_encoded)

model = Sequential()
model.add(Dense(24, input_dim=input_dimension, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(12, activation='softmax'))

# 创建 EarlyStopping 回调
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# 编译模型
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# 训练模型，并使用 EarlyStopping 回调
history = model.fit(features_train, labels_train_one_hot, epochs=100, batch_size=16, 
                    validation_data=(features_test, labels_test_one_hot),
                    callbacks=[early_stopping])

accuracy = model.evaluate(features_test, labels_test_one_hot)[1]
print("Test Accuracy:", accuracy)

y_pred_prob = model.predict(features_test)
y_pred = np.argmax(y_pred_prob, axis=1)

report = classification_report(np.argmax(labels_test_one_hot, axis=1), y_pred)
print(report)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Test Accuracy: 0.6494845151901245
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.76      0.89      0.82        44
           2       0.00      0.00      0.00         2
           3       0.00      0.00      0.00         1
           4       0.67      0.40      0.50         5
           5       0.00      0.00      0.00         2
           6       0.38      0.38      0.38         8
           7       0.56      0.71      0.63         7
           8       0.00      0.00      0.00         1
           9       0.00      0.00      0.00         2
          10       0.64      0.69      0.67        13
          11       0.42      0.56      0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping
from kerastuner.tuners import RandomSearch
from kerastuner.engine.hyperparameters import HyperParameters

# 定义模型构建函数
def build_model(hp):
    model = Sequential()
    model.add(Dense(units=hp.Int('units_1', min_value=8, max_value=256, step=8),
                    activation='relu', input_dim=input_dimension))
    model.add(Dense(units=hp.Int('units_2', min_value=8, max_value=256, step=8),
                    activation='relu'))
    model.add(Dense(12, activation='softmax'))
    
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# 创建 EarlyStopping 回调
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# 创建调优器
tuner = RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=5,  # 试验的次数
    executions_per_trial=3,  # 每个试验运行的次数
    directory='my_tuning_dir',  # 存储结果的目录
    project_name='my_neural_network_tuning'
)

# 运行调优
tuner.search(features_train, labels_train_one_hot, epochs=100, batch_size=16, 
             validation_data=(features_test, labels_test_one_hot),
             callbacks=[early_stopping])

# 获取最佳模型
best_model = tuner.get_best_models(num_models=1)[0]

# 评估最佳模型
accuracy = best_model.evaluate(features_test, labels_test_one_hot)[1]
print("Test Accuracy (Best Model):", accuracy)

# 预测最佳模型的输出
y_pred_prob = best_model.predict(features_test)
y_pred = np.argmax(y_pred_prob, axis=1)

# 输出分类报告
report = classification_report(np.argmax(labels_test_one_hot, axis=1), y_pred)
print(report)


Trial 5 Complete [00h 00m 07s]
val_accuracy: 0.7388315995534261

Best val_accuracy So Far: 0.7525773048400879
Total elapsed time: 00h 00m 37s
Test Accuracy (Best Model): 0.7628865838050842
              precision    recall  f1-score   support

           0       1.00      0.67      0.80         3
           1       0.81      0.98      0.89        44
           2       0.00      0.00      0.00         2
           3       0.00      0.00      0.00         1
           4       0.50      0.60      0.55         5
           5       0.50      0.50      0.50         2
           6       0.75      0.38      0.50         8
           7       0.83      0.71      0.77         7
           8       0.00      0.00      0.00         1
           9       1.00      0.50      0.67         2
          10       0.71      0.77      0.74        13
          11       0.67      0.67      0.67         9

    accuracy                           0.76        97
   macro avg       0.56      0.48      0.51        97

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
