In [1]:
import pandas as pd
import arff as liacarff

# 加载 ARFF 文件
with open('PROMISE_exp.arff', 'r') as f:
    data_dict = liacarff.load(f)

# 提取数据
data = data_dict['data']

# 转换为 DataFrame
df = pd.DataFrame(data, columns=[attr[0] for attr in data_dict['attributes']])
df['ProjectID'] = df['ProjectID'].astype(int)  # 将 ProjectID 列转换为整数类型

# 显示 DataFrame
print(df)

     ProjectID                                    RequirementText _class_
0            1  The system shall refresh the display every 60 ...      PE
1            1  The application shall match the color of the s...      LF
2            1  If projected  the data must be readable.  On a...      US
3            1  The product shall be available during normal b...       A
4            1  If projected  the data must be understandable....      US
..         ...                                                ...     ...
965         48  Registered User must be able to maintain his/h...       F
966         48  The entire website must be user-friendly and e...      US
967         48  The system shall support up to 10000 simultane...      PE
968         48  The website must provide highest degree of sec...      SE
969         49  The software application should be easily tran...      PO

[970 rows x 3 columns]


In [2]:
# DATA PREPROCESSING
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re
import string

# 下载NLTK的停用词和词性标注器数据
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# lower
df['RequirementText'] = df['RequirementText'].str.lower()

# Remove punctuation, leading and trailing spaces
df['RequirementText'] = df['RequirementText'].apply(lambda x: re.sub(f"[{re.escape(string.punctuation)}]", "", x.strip()))

# 分词
df['RequirementText'] = df['RequirementText'].apply(word_tokenize)

# stopwords
# stop_words = set(stopwords.words('english'))
# df['RequirementText'] = df['RequirementText'].apply(lambda x: [word for word in x if word not in stop_words])

# stemming
# stemmer = PorterStemmer()
# df['RequirementText'] = df['RequirementText'].apply(lambda x: [stemmer.stem(word) for word in x])

# lemmatization
lemmatizer = WordNetLemmatizer()
df['RequirementText'] = df['RequirementText'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

# 合并词语为字符串
df['RequirementText'] = df['RequirementText'].apply(lambda x: ' '.join(x))

# Replace multiple spaces with a single space
df['RequirementText'] = df['RequirementText'].apply(lambda x: re.sub(' +', ' ', x))

# 移除整个列中每个字符串首尾的全部空格
df['RequirementText'] = df['RequirementText'].str.strip()

# 查看处理后的数据集
print(df.head())

[nltk_data] Downloading package stopwords to /home/li/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/li/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/li/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


   ProjectID                                    RequirementText _class_
0          1  the system shall refresh the display every 60 ...      PE
1          1  the application shall match the color of the s...      LF
2          1  if projected the data must be readable on a 10...      US
3          1  the product shall be available during normal b...       A
4          1  if projected the data must be understandable o...      US


In [3]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [4]:
class_distribution = df['_class_'].value_counts()
print(class_distribution)

_class_
F     444
SE    125
US     85
O      77
PE     67
LF     49
A      31
MN     24
SC     22
FT     18
L      15
PO     13
Name: count, dtype: int64


In [5]:
from sklearn.model_selection import train_test_split
y = df['_class_']
X = df['RequirementText']

# 假设 X 是特征，y 是目标变量（类别标签）
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, stratify=y)

print(y_test)
class_distribution = y_test.value_counts()
print(class_distribution)

297     O
669     F
586     A
211    US
65      F
       ..
818     F
862     F
512     F
513     F
441     O
Name: _class_, Length: 97, dtype: object
_class_
F     44
SE    13
US     9
O      8
PE     7
LF     5
A      3
MN     2
SC     2
FT     2
L      1
PO     1
Name: count, dtype: int64


In [6]:
from sklearn.ensemble import RandomForestClassifier
# 使用TfidfVectorizer进行转换
ngram_range = (1,2)
min_df = 0.01 # ignore terms that appear in less than 1% of the documents
max_df = 0.8 # ignore terms that appear in more than 80% of the documents

tfidf_vectorizer = TfidfVectorizer(encoding='utf-8',
                        ngram_range=ngram_range,
                        stop_words=None,
                        lowercase=False,
                        max_df=max_df,
                        min_df=min_df,
                        norm='l2',
                        sublinear_tf=True)

X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

model_tfidf = RandomForestClassifier()
model_tfidf.fit(X_train_tfidf, y_train)
y_pred_tfidf = model_tfidf.predict(X_test_tfidf)

# 评估模型性能
print("Accuracy (TF-IDF):", accuracy_score(y_test, y_pred_tfidf))
print("Classification Report (TF-IDF):\n", classification_report(y_test, y_pred_tfidf))

Accuracy (TF-IDF): 0.6597938144329897
Classification Report (TF-IDF):
               precision    recall  f1-score   support

           A       1.00      0.67      0.80         3
           F       0.63      0.95      0.76        44
          FT       0.00      0.00      0.00         2
           L       0.00      0.00      0.00         1
          LF       0.50      0.20      0.29         5
          MN       0.00      0.00      0.00         2
           O       0.80      0.50      0.62         8
          PE       0.67      0.57      0.62         7
          PO       1.00      1.00      1.00         1
          SC       0.00      0.00      0.00         2
          SE       0.86      0.46      0.60        13
          US       0.57      0.44      0.50         9

    accuracy                           0.66        97
   macro avg       0.50      0.40      0.43        97
weighted avg       0.63      0.66      0.62        97



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
# 导入 Random Forest 模型
from sklearn.ensemble import RandomForestClassifier

# 创建一个Pipeline，仅包括 Random Forest 模型
pipeline_rf = Pipeline([
    ('clf', RandomForestClassifier())
])

# 设置需要调优的参数，包括 Random Forest 的参数
parameters_rf = {
    'clf__n_estimators': [50, 100, 200],
    'clf__criterion': ['gini', 'entropy'],
    'clf__max_depth': [None, 10, 20, 30],
    'clf__min_samples_split': [2, 5, 10],
    'clf__min_samples_leaf': [1, 2, 4]
}

# 使用 GridSearchCV 进行模型选择和调参，设置评价标准为 f1 score
grid_search_rf = GridSearchCV(pipeline_rf, parameters_rf, scoring='f1_weighted', cv=5, refit=True)
grid_search_rf.fit(X_train_tfidf, y_train)

# 输出最佳参数和对应的 f1 score
print("Best parameters found (Random Forest): ", grid_search_rf.best_params_)
print("Best f1 score (Random Forest): ", grid_search_rf.best_score_)

best_model_rf = grid_search_rf.best_estimator_

y_pred_tuned_rf = best_model_rf.predict(X_test_tfidf)

# 输出准确率和分类报告
print("Accuracy (Tuned - Random Forest):", accuracy_score(y_test, y_pred_tuned_rf))
print("Classification Report (Tuned - Random Forest):\n", classification_report(y_test, y_pred_tuned_rf))

Best parameters found (Random Forest):  {'clf__criterion': 'gini', 'clf__max_depth': None, 'clf__min_samples_leaf': 1, 'clf__min_samples_split': 2, 'clf__n_estimators': 200}
Best f1 score (Random Forest):  0.6137654999231683
Accuracy (Tuned - Random Forest): 0.6701030927835051
Classification Report (Tuned - Random Forest):
               precision    recall  f1-score   support

           A       0.67      0.67      0.67         3
           F       0.66      0.98      0.79        44
          FT       0.00      0.00      0.00         2
           L       0.00      0.00      0.00         1
          LF       0.50      0.20      0.29         5
          MN       0.00      0.00      0.00         2
           O       0.80      0.50      0.62         8
          PE       0.57      0.57      0.57         7
          PO       1.00      1.00      1.00         1
          SC       0.00      0.00      0.00         2
          SE       0.75      0.46      0.57        13
          US       0.67  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [8]:
# F binary

df['binary_label'] = df['_class_'].apply(lambda x: 1 if x == 'F' else 0)
X_train, X_test, y_train, y_test = train_test_split(df['RequirementText'], df['binary_label'], test_size=0.2, random_state=42)

# 使用TfidfVectorizer进行转换
ngram_range = (1,2)
min_df = 0.01 # ignore terms that appear in less than 1% of the documents
max_df = 0.8 # ignore terms that appear in more than 80% of the documents

tfidf_vectorizer = TfidfVectorizer(encoding='utf-8',
                        ngram_range=ngram_range,
                        stop_words=None,
                        lowercase=False,
                        max_df=max_df,
                        min_df=min_df,
                        norm='l2',
                        sublinear_tf=True)

X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

model_tfidf = MultinomialNB()
model_tfidf.fit(X_train_tfidf, y_train)

y_pred_tfidf = model_tfidf.predict(X_test_tfidf)

print("Accuracy (TF-IDF):", accuracy_score(y_test, y_pred_tfidf))
print("Classification Report (TF-IDF):\n", classification_report(y_test, y_pred_tfidf))

Accuracy (TF-IDF): 0.8247422680412371
Classification Report (TF-IDF):
               precision    recall  f1-score   support

           0       0.81      0.87      0.84       102
           1       0.85      0.77      0.81        92

    accuracy                           0.82       194
   macro avg       0.83      0.82      0.82       194
weighted avg       0.83      0.82      0.82       194



In [9]:
filtered_df = df[df['_class_'] != 'F']
X_train, X_test, y_train, y_test = train_test_split(filtered_df['RequirementText'], filtered_df['_class_'], test_size=0.2, random_state=42, stratify=filtered_df['_class_'])

# 使用TfidfVectorizer进行转换
ngram_range = (1,2)
min_df = 0.01 # ignore terms that appear in less than 1% of the documents
max_df = 0.8 # ignore terms that appear in more than 80% of the documents

tfidf_vectorizer = TfidfVectorizer(encoding='utf-8',
                        ngram_range=ngram_range,
                        stop_words=None,
                        lowercase=False,
                        max_df=max_df,
                        min_df=min_df,
                        norm='l2',
                        sublinear_tf=True)

X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

model_tfidf = MultinomialNB()
model_tfidf.fit(X_train_tfidf, y_train)

y_pred_tfidf = model_tfidf.predict(X_test_tfidf)

print("Accuracy (TF-IDF):", accuracy_score(y_test, y_pred_tfidf))
print("Classification Report (TF-IDF):\n", classification_report(y_test, y_pred_tfidf, zero_division=1))

Accuracy (TF-IDF): 0.5377358490566038
Classification Report (TF-IDF):
               precision    recall  f1-score   support

           A       1.00      0.17      0.29         6
          FT       1.00      0.00      0.00         4
           L       1.00      0.00      0.00         3
          LF       0.33      0.10      0.15        10
          MN       1.00      0.00      0.00         5
           O       0.55      0.69      0.61        16
          PE       0.69      0.69      0.69        13
          PO       1.00      0.00      0.00         3
          SC       1.00      0.00      0.00         4
          SE       0.51      0.88      0.65        25
          US       0.50      0.76      0.60        17

    accuracy                           0.54       106
   macro avg       0.78      0.30      0.27       106
weighted avg       0.64      0.54      0.46       106



In [10]:
filtered_df = df[df['_class_'].isin(['MN', 'SC', 'PE'])]
X_train, X_test, y_train, y_test = train_test_split(filtered_df['RequirementText'], filtered_df['_class_'], test_size=0.2, random_state=42, stratify=filtered_df['_class_'])

# 使用TfidfVectorizer进行转换
ngram_range = (1,2)
min_df = 0.01 # ignore terms that appear in less than 1% of the documents
max_df = 0.8 # ignore terms that appear in more than 80% of the documents

tfidf_vectorizer = TfidfVectorizer(encoding='utf-8',
                        ngram_range=ngram_range,
                        stop_words=None,
                        lowercase=False,
                        max_df=max_df,
                        min_df=min_df,
                        norm='l2',
                        sublinear_tf=True)

X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

model_tfidf = MultinomialNB()
model_tfidf.fit(X_train_tfidf, y_train)

y_pred_tfidf = model_tfidf.predict(X_test_tfidf)

print("Accuracy (TF-IDF):", accuracy_score(y_test, y_pred_tfidf))
print("Classification Report (TF-IDF):\n", classification_report(y_test, y_pred_tfidf, zero_division=1))

Accuracy (TF-IDF): 0.6086956521739131
Classification Report (TF-IDF):
               precision    recall  f1-score   support

          MN       1.00      0.00      0.00         5
          PE       0.61      1.00      0.76        14
          SC       1.00      0.00      0.00         4

    accuracy                           0.61        23
   macro avg       0.87      0.33      0.25        23
weighted avg       0.76      0.61      0.46        23



In [11]:
filtered_df = df[df['_class_'].isin(['A', 'FT'])]
X_train, X_test, y_train, y_test = train_test_split(filtered_df['RequirementText'], filtered_df['_class_'], test_size=0.2, random_state=42, stratify=filtered_df['_class_'])

# 使用TfidfVectorizer进行转换
ngram_range = (1,2)
min_df = 0.01 # ignore terms that appear in less than 1% of the documents
max_df = 0.8 # ignore terms that appear in more than 80% of the documents

tfidf_vectorizer = TfidfVectorizer(encoding='utf-8',
                        ngram_range=ngram_range,
                        stop_words=None,
                        lowercase=False,
                        max_df=max_df,
                        min_df=min_df,
                        norm='l2',
                        sublinear_tf=True)

X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

model_tfidf = MultinomialNB()
model_tfidf.fit(X_train_tfidf, y_train)

y_pred_tfidf = model_tfidf.predict(X_test_tfidf)

print("Accuracy (TF-IDF):", accuracy_score(y_test, y_pred_tfidf))
print("Classification Report (TF-IDF):\n", classification_report(y_test, y_pred_tfidf, zero_division=1))

Accuracy (TF-IDF): 0.7
Classification Report (TF-IDF):
               precision    recall  f1-score   support

           A       0.67      1.00      0.80         6
          FT       1.00      0.25      0.40         4

    accuracy                           0.70        10
   macro avg       0.83      0.62      0.60        10
weighted avg       0.80      0.70      0.64        10



In [12]:
filtered_df = df[df['_class_'].isin(['SE', 'L'])]
X_train, X_test, y_train, y_test = train_test_split(filtered_df['RequirementText'], filtered_df['_class_'], test_size=0.2, random_state=42, stratify=filtered_df['_class_'])

# 使用TfidfVectorizer进行转换
ngram_range = (1,2)
min_df = 0.01 # ignore terms that appear in less than 1% of the documents
max_df = 0.8 # ignore terms that appear in more than 80% of the documents

tfidf_vectorizer = TfidfVectorizer(encoding='utf-8',
                        ngram_range=ngram_range,
                        stop_words=None,
                        lowercase=False,
                        max_df=max_df,
                        min_df=min_df,
                        norm='l2',
                        sublinear_tf=True)

X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

model_tfidf = MultinomialNB()
model_tfidf.fit(X_train_tfidf, y_train)

y_pred_tfidf = model_tfidf.predict(X_test_tfidf)

print("Accuracy (TF-IDF):", accuracy_score(y_test, y_pred_tfidf))
print("Classification Report (TF-IDF):\n", classification_report(y_test, y_pred_tfidf, zero_division=1))

Accuracy (TF-IDF): 0.8928571428571429
Classification Report (TF-IDF):
               precision    recall  f1-score   support

           L       1.00      0.00      0.00         3
          SE       0.89      1.00      0.94        25

    accuracy                           0.89        28
   macro avg       0.95      0.50      0.47        28
weighted avg       0.90      0.89      0.84        28



In [13]:
filtered_df = df[df['_class_'].isin(['O', 'US'])]
X_train, X_test, y_train, y_test = train_test_split(filtered_df['RequirementText'], filtered_df['_class_'], test_size=0.2, random_state=42, stratify=filtered_df['_class_'])

# 使用TfidfVectorizer进行转换
ngram_range = (1,2)
min_df = 0.01 # ignore terms that appear in less than 1% of the documents
max_df = 0.8 # ignore terms that appear in more than 80% of the documents

tfidf_vectorizer = TfidfVectorizer(encoding='utf-8',
                        ngram_range=ngram_range,
                        stop_words=None,
                        lowercase=False,
                        max_df=max_df,
                        min_df=min_df,
                        norm='l2',
                        sublinear_tf=True)

X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

model_tfidf = MultinomialNB()
model_tfidf.fit(X_train_tfidf, y_train)

y_pred_tfidf = model_tfidf.predict(X_test_tfidf)

print("Accuracy (TF-IDF):", accuracy_score(y_test, y_pred_tfidf))
print("Classification Report (TF-IDF):\n", classification_report(y_test, y_pred_tfidf, zero_division=1))

Accuracy (TF-IDF): 0.8484848484848485
Classification Report (TF-IDF):
               precision    recall  f1-score   support

           O       0.92      0.75      0.83        16
          US       0.80      0.94      0.86        17

    accuracy                           0.85        33
   macro avg       0.86      0.85      0.85        33
weighted avg       0.86      0.85      0.85        33

