In [1]:
import pandas as pd
import arff as liacarff

# 加载 ARFF 文件
with open('PROMISE_DOSSPRE_OLD_3.arff', 'r') as f:
    data_dict = liacarff.load(f)

# 提取数据
data = data_dict['data']

# 转换为 DataFrame
df = pd.DataFrame(data, columns=[attr[0] for attr in data_dict['attributes']])
df['ProjectID'] = df['ProjectID'].astype(int)  # 将 ProjectID 列转换为整数类型

# 显示 DataFrame
print(df)

      ProjectID                                    RequirementText _class_
0             1  The system shall refresh the display every 60 ...      PE
1             1  The application shall match the color of the s...      LF
2             1  If projected  the data must be readable.  On a...      US
3             1  The product shall be available during normal b...       A
4             1  If projected  the data must be understandable....      US
...         ...                                                ...     ...
2052          9  The database may trade off fidelity through ca...      FT
2053          9  The API shall have master topology replicating...      FT
2054          9  The system must parse, filter, transform and s...      FT
2055          9  The application shall employ real-user monitor...      FT
2056          9  The software should apply graceful degradation...      FT

[2057 rows x 3 columns]


In [2]:
# DATA PREPROCESSING
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re
import string

# 下载NLTK的停用词和词性标注器数据
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# # lower
df['RequirementText'] = df['RequirementText'].str.lower()

# # Remove punctuation, leading and trailing spaces
# df['RequirementText'] = df['RequirementText'].apply(lambda x: re.sub(f"[{re.escape(string.punctuation)}]", "", x.strip()))

# # 分词
df['RequirementText'] = df['RequirementText'].apply(word_tokenize)

# # stopwords
stop_words = set(stopwords.words('english'))
df['RequirementText'] = df['RequirementText'].apply(lambda x: [word for word in x if word not in stop_words])

# # stemming
# stemmer = PorterStemmer()
# df['RequirementText'] = df['RequirementText'].apply(lambda x: [stemmer.stem(word) for word in x])

# # lemmatization
# lemmatizer = WordNetLemmatizer()
# df['RequirementText'] = df['RequirementText'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

# # 合并词语为字符串
df['RequirementText'] = df['RequirementText'].apply(lambda x: ' '.join(x))

# Replace multiple spaces with a single space
df['RequirementText'] = df['RequirementText'].apply(lambda x: re.sub(' +', ' ', x))

# 移除整个列中每个字符串首尾的全部空格
df['RequirementText'] = df['RequirementText'].str.strip()

# 查看处理后的数据集
print(df.head())

[nltk_data] Downloading package stopwords to /home/li/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/li/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/li/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


   ProjectID                                    RequirementText _class_
0          1    system shall refresh display every 60 seconds .      PE
1          1  application shall match color schema set forth...      LF
2          1  projected data must readable . 10x10 projectio...      US
3          1  product shall available normal business hours ...       A
4          1  projected data must understandable . 10x10 pro...      US


In [3]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [4]:
from sklearn.model_selection import train_test_split
y = df['_class_']
X = df['RequirementText']

# 假设 X 是特征，y 是目标变量（类别标签）
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, stratify=y)

print(y_test)
class_distribution = y_test.value_counts()
print(class_distribution)

1106    US
1795    SC
1914     O
927      F
699     PE
        ..
1741    LF
892      F
2005     A
117      F
694     SE
Name: _class_, Length: 206, dtype: object
_class_
F     44
PE    25
SE    24
US    22
MN    16
LF    16
O     13
A     12
SC    10
L      9
PO     8
FT     7
Name: count, dtype: int64


In [5]:
from sklearn.ensemble import RandomForestClassifier
# 使用TfidfVectorizer进行转换
ngram_range = (1,2)
min_df = 0.01 # ignore terms that appear in less than 1% of the documents
max_df = 0.8 # ignore terms that appear in more than 80% of the documents

tfidf_vectorizer = TfidfVectorizer(encoding='utf-8',
                        ngram_range=ngram_range,
                        stop_words=None,
                        lowercase=False,
                        max_df=max_df,
                        min_df=min_df,
                        norm='l2',
                        sublinear_tf=True)

X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

model_tfidf = RandomForestClassifier()
model_tfidf.fit(X_train_tfidf, y_train)
y_pred_tfidf = model_tfidf.predict(X_test_tfidf)

# 评估模型性能
print("Accuracy (TF-IDF):", accuracy_score(y_test, y_pred_tfidf))
print("Classification Report (TF-IDF):\n", classification_report(y_test, y_pred_tfidf))

Accuracy (TF-IDF): 0.587378640776699
Classification Report (TF-IDF):
               precision    recall  f1-score   support

           A       0.40      0.50      0.44        12
           F       0.59      0.80      0.68        44
          FT       0.50      0.29      0.36         7
           L       0.60      0.33      0.43         9
          LF       0.68      0.81      0.74        16
          MN       0.62      0.50      0.55        16
           O       0.64      0.54      0.58        13
          PE       0.67      0.48      0.56        25
          PO       0.57      0.50      0.53         8
          SC       0.86      0.60      0.71        10
          SE       0.42      0.42      0.42        24
          US       0.62      0.68      0.65        22

    accuracy                           0.59       206
   macro avg       0.60      0.54      0.56       206
weighted avg       0.59      0.59      0.58       206



In [6]:
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
# 导入 Random Forest 模型
from sklearn.ensemble import RandomForestClassifier

# 创建一个Pipeline，仅包括 Random Forest 模型
pipeline_rf = Pipeline([
    ('clf', RandomForestClassifier())
])

# 设置需要调优的参数，包括 Random Forest 的参数
parameters_rf = {
    'clf__n_estimators': [50, 100, 200],
    'clf__criterion': ['gini', 'entropy'],
    'clf__max_depth': [None, 10, 20, 30],
    'clf__min_samples_split': [2, 5, 10],
    'clf__min_samples_leaf': [1, 2, 4]
}

# 使用 GridSearchCV 进行模型选择和调参，设置评价标准为 f1 score
grid_search_rf = GridSearchCV(pipeline_rf, parameters_rf, scoring='f1_weighted', cv=5, refit=True)
grid_search_rf.fit(X_train_tfidf, y_train)

# 输出最佳参数和对应的 f1 score
print("Best parameters found (Random Forest): ", grid_search_rf.best_params_)
print("Best f1 score (Random Forest): ", grid_search_rf.best_score_)

best_model_rf = grid_search_rf.best_estimator_

y_pred_tuned_rf = best_model_rf.predict(X_test_tfidf)

# 输出准确率和分类报告
print("Accuracy (Tuned - Random Forest):", accuracy_score(y_test, y_pred_tuned_rf))
print("Classification Report (Tuned - Random Forest):\n", classification_report(y_test, y_pred_tuned_rf))

Best parameters found (Random Forest):  {'clf__criterion': 'gini', 'clf__max_depth': None, 'clf__min_samples_leaf': 1, 'clf__min_samples_split': 5, 'clf__n_estimators': 100}
Best f1 score (Random Forest):  0.5711063315098333
Accuracy (Tuned - Random Forest): 0.587378640776699
Classification Report (Tuned - Random Forest):
               precision    recall  f1-score   support

           A       0.42      0.42      0.42        12
           F       0.62      0.77      0.69        44
          FT       0.50      0.29      0.36         7
           L       0.67      0.22      0.33         9
          LF       0.65      0.69      0.67        16
          MN       0.53      0.56      0.55        16
           O       0.67      0.62      0.64        13
          PE       0.74      0.56      0.64        25
          PO       0.60      0.38      0.46         8
          SC       1.00      0.60      0.75        10
          SE       0.39      0.50      0.44        24
          US       0.60   