In [1]:
import pandas as pd
import arff as liacarff

# 加载 ARFF 文件
with open('PROMISE_DOSSPRE_OLD_3.arff', 'r') as f:
    data_dict = liacarff.load(f)

# 提取数据
data = data_dict['data']

# 转换为 DataFrame
df = pd.DataFrame(data, columns=[attr[0] for attr in data_dict['attributes']])
df['ProjectID'] = df['ProjectID'].astype(int)  # 将 ProjectID 列转换为整数类型

# 显示 DataFrame
print(df)


      ProjectID                                    RequirementText _class_
0             1  The system shall refresh the display every 60 ...      PE
1             1  The application shall match the color of the s...      LF
2             1  If projected  the data must be readable.  On a...      US
3             1  The product shall be available during normal b...       A
4             1  If projected  the data must be understandable....      US
...         ...                                                ...     ...
2052          9  The database may trade off fidelity through ca...      FT
2053          9  The API shall have master topology replicating...      FT
2054          9  The system must parse, filter, transform and s...      FT
2055          9  The application shall employ real-user monitor...      FT
2056          9  The software should apply graceful degradation...      FT

[2057 rows x 3 columns]


In [2]:
# DATA PREPROCESSING
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re
import string

# 下载NLTK的停用词和词性标注器数据
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# lower
df['RequirementText'] = df['RequirementText'].str.lower()

# # Remove punctuation, leading and trailing spaces
# df['RequirementText'] = df['RequirementText'].apply(lambda x: re.sub(f"[{re.escape(string.punctuation)}]", "", x.strip()))

# # 分词
# df['RequirementText'] = df['RequirementText'].apply(word_tokenize)

# # stopwords
# stop_words = set(stopwords.words('english'))
# df['RequirementText'] = df['RequirementText'].apply(lambda x: [word for word in x if word not in stop_words])

# # stemming
# # stemmer = PorterStemmer()
# # df['RequirementText'] = df['RequirementText'].apply(lambda x: [stemmer.stem(word) for word in x])

# # lemmatization
# lemmatizer = WordNetLemmatizer()
# df['RequirementText'] = df['RequirementText'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

# # 合并词语为字符串
# df['RequirementText'] = df['RequirementText'].apply(lambda x: ' '.join(x))

# # Replace multiple spaces with a single space
# df['RequirementText'] = df['RequirementText'].apply(lambda x: re.sub(' +', ' ', x))

# 移除整个列中每个字符串首尾的全部空格
df['RequirementText'] = df['RequirementText'].str.strip()

# 查看处理后的数据集
print(df.head())

   ProjectID                                    RequirementText _class_
0          1  the system shall refresh the display every 60 ...      PE
1          1  the application shall match the color of the s...      LF
2          1  if projected  the data must be readable.  on a...      US
3          1  the product shall be available during normal b...       A
4          1  if projected  the data must be understandable....      US


[nltk_data] Downloading package stopwords to /home/li/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/li/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/li/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [4]:
class_distribution = df['_class_'].value_counts()
print(class_distribution)

_class_
F     444
PE    250
SE    235
US    219
LF    164
MN    162
O     131
A     120
SC     96
L      86
PO     78
FT     72
Name: count, dtype: int64


In [5]:
from sklearn.model_selection import train_test_split

class_mapping = {'F': 0, 'A': 1, 'L': 2, 'LF': 3, 'MN': 4, 'O': 5, 'PE': 6, 'SC': 7, 'SE': 8, 'US': 9, 'FT': 10, 'PO': 11}
df['_class_'] = df['_class_'].map(class_mapping)

y = df['_class_']
X = df['RequirementText']
                                  
# 假设 X 是特征，y 是目标变量（类别标签）
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, stratify=y)

In [6]:
import fasttext

# 将数据保存到文件中，以便 FastText 进行训练
train_data_path = 'train_data.txt'
test_data_path = 'test_data.txt'

with open(train_data_path, 'w', encoding='utf-8') as f:
    for text, label in zip(X_train, y_train):
        f.write(f'__label__{label} {text}\n')

with open(test_data_path, 'w', encoding='utf-8') as f:
    for text in X_test:
        f.write(f'{text}\n')

# 训练 FastText 模型
model = fasttext.train_supervised(input=train_data_path)

# 在测试集上进行预测
y_pred = [int(model.predict(text)[0][0][-1]) for text in X_test]

Read 0M words
Number of words:  4721
Number of labels: 12
Progress: 100.0% words/sec/thread: 1045513 lr:  0.000000 avg.loss:  2.217226 ETA:   0h 0m 0s


In [7]:
# 评估模型性能
print("Accuracy (FastText):", accuracy_score(y_test, y_pred))
print("Classification Report (FastText):\n", classification_report(y_test, y_pred))

Accuracy (FastText): 0.2669902912621359
Classification Report (FastText):
               precision    recall  f1-score   support

           0       0.25      1.00      0.40        44
           1       0.00      0.00      0.00        12
           2       0.50      0.11      0.18         9
           3       0.38      0.62      0.48        16
           4       0.00      0.00      0.00        16
           5       0.00      0.00      0.00        13
           6       0.00      0.00      0.00        25
           7       0.00      0.00      0.00        10
           8       0.00      0.00      0.00        24
           9       0.00      0.00      0.00        22
          10       0.00      0.00      0.00         7
          11       0.00      0.00      0.00         8

    accuracy                           0.27       206
   macro avg       0.09      0.14      0.09       206
weighted avg       0.10      0.27      0.13       206



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
model = fasttext.train_supervised(input=train_data_path, autotuneValidationFile=test_data_path)

# 在测试集上进行预测
y_pred = [int(model.predict(text)[0][0][-1]) for text in X_test]

# 评估模型性能
print("Accuracy (FastText):", accuracy_score(y_test, y_pred))
print("Classification Report (FastText):\n", classification_report(y_test, y_pred))

Progress:   1.7% Trials:   11 Best score:       nan ETA:   0h 4m54s