In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import random

In [2]:

# Choose the project (options: 'pytorch', 'tensorflow', 'keras', 'incubator-mxnet', 'caffe')
project = 'caffe'
path = f'datasets/{project}.csv'

# We use logspace for var_smoothing: [1e-12, 1e-11, ..., 1]
params = {
    'var_smoothing': np.logspace(-12, 0, 13)
}

out_csv_name = f'{project}_unprocessed_NB.csv'

datasets = pd.DataFrame()
print(type(datasets))

df_file = pd.read_csv(path)
df_file['text'] = df_file['Title'].fillna('') + " " + df_file['Body'].fillna('') + " " + df_file['Comments'].fillna('')  # 组合文本字段
#class或related为1则为性能相关
#df= df[(df['related'] == 1) | (df['class'] == 1)]
df_file['output']= (df_file['related'] == 1) | (df_file['class'] == 1)
df_file['output'] = df_file['output'].astype(int)  # 转换为 0/1 格式
new_df=df_file[['text','output']]
datasets=pd.concat([datasets,new_df],ignore_index=True)

#print(datasets)
#datasets=datasets.sample(frac=1).reset_index(drop=True)
# 定义特征和标签
x = datasets['text']
y = datasets['output']

# TF-IDF 特征提取
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=1000  # Adjust as needed
)


accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

REPEAT=30 # 进行 30 次实验，避免随机性影响

for _ in range(REPEAT):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, stratify=y, random_state=None)
    # 统计各类别在训练集和测试集中的比例
    print("训练集：")
    print(y_train.value_counts(normalize=True))  # `related=1` 的比例
    x_train_tfidf = vectorizer.fit_transform(x_train).toarray()
    x_test_tfidf = vectorizer.transform(x_test).toarray()
    # 朴素贝叶斯分类器
    clf = GaussianNB()
    grid = GridSearchCV(
        clf,
        params,
        cv=5,              # 5-fold CV (can be changed)
        scoring='roc_auc'  # Using roc_auc as the metric for selection
    )
    grid.fit(x_train_tfidf, y_train)
    # Retrieve the best model
    model = grid.best_estimator_
    model.fit(x_train_tfidf, y_train)
    
    # 预测
    y_pred = model.predict(x_test_tfidf)

    
    # 计算指标
    accuracy_list.append(accuracy_score(y_test, y_pred, zero_division=0))
    precision_list.append(precision_score(y_test, y_pred, zero_division=0))
    recall_list.append(recall_score(y_test, y_pred, zero_division=0))
    f1_list.append(f1_score(y_test, y_pred, zero_division=0))



# 计算平均值
accuracy_avg=np.mean(accuracy_list)
precision_avg = np.mean(precision_list)
recall_avg = np.mean(recall_list)
f1_avg = np.mean(f1_list)

print(f"Average Precision: {precision_avg:.4f}")
print(f"Average Recall: {recall_avg:.4f}")
print(f"Average F1 Score: {f1_avg:.4f}")

# 可视化结果
plt.figure(figsize=(10, 5))
plt.boxplot([precision_list, recall_list, f1_list], labels=['Precision', 'Recall', 'F1 Score'])
plt.title("Performance Metrics over 30 Experiments")
plt.show()

# Save final results to CSV (append mode)
try:
    # Attempt to check if the file already has a header
    existing_data = pd.read_csv(out_csv_name, nrows=1)
    header_needed = False
except:
    header_needed = True

df_log = pd.DataFrame(
    {
        'repeated_times': [REPEAT],
        'Accuracy': [accuracy_avg],
        'Precision': [precision_avg],
        'Recall': [recall_avg],
        'F1': [f1_avg],
    }
)

df_log.to_csv(out_csv_name, mode='a', header=header_needed, index=False)

print(f"\nResults have been saved to: {out_csv_name}")
    

<class 'pandas.core.frame.DataFrame'>
训练集：
output
0    0.885
1    0.115
Name: proportion, dtype: float64


TypeError: got an unexpected keyword argument 'zero_division'