In [1]:
%%time
import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import warnings

# Suppress every warning category for the remainder of the session.
warnings.filterwarnings('ignore')

# Load the Weibo tables; rumor rows are the positive class (1), true rows the negative class (0).
r = pd.read_csv('E:/6个数据集数据汇总/weibo_rumor_all.csv')
r['label'] = 1
t = pd.read_csv('E:/6个数据集数据汇总/weibo_true_all.csv')
t['label'] = 0

# True rows first, rumor rows second, re-indexed 0..n-1.
df = pd.concat([t, r], ignore_index=True)

# Min-max scale every column between the first two columns and the trailing
# label column.
# NOTE(review): the min/max are taken over the full table (before any
# train/test split), and a constant column yields 0/0 = NaN -- confirm both
# are acceptable for this experiment.
raw_features = df.iloc[:, 2:-1]
col_min = raw_features.min()
col_range = raw_features.max() - col_min
df1 = (raw_features - col_min) / col_range

# The 15 model inputs: m3_1..m3_2, m4_1..m4_4, m5_1..m5_9.
feature_cols = (
    ['m3_1', 'm3_2']
    + [f'm4_{i}' for i in range(1, 5)]
    + [f'm5_{i}' for i in range(1, 10)]
)
df7 = df1[feature_cols]
label = df['label']

# One dict of metrics is appended here per experiment round.
results = []

# Repeated hold-out experiment on the Weibo features.
#
# Each round: stratified 80/20 split seeded by the round index, a 5-fold
# grid search over XGBoost hyper-parameters scored by F1, then evaluation of
# the refit best model on the held-out 20%.
#
# The full grid is 2*3*3*3*2*2 = 216 candidates x 5 folds per round, so a full
# 50-round run is long. Results are therefore written to disk after every
# round; an interrupted run keeps everything finished so far instead of
# losing it all.
results_path = 'MMTD_results_weighted_no_oversample_5fold.csv'

# Hyper-parameter grid (identical for every round, so built once).
param_grid = {
    'n_estimators': [100, 150],
    'max_depth': [6, 8, 10],
    'min_child_weight': [1, 2, 4],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.8, 1.0],
    'reg_lambda': [0.1, 1.0]
}

for iteration in range(50):
    print(f'---------- Iteration {iteration+1} ----------')

    # Stratified train/test split; the round index doubles as the seed.
    train_x, test_x, train_y, test_y = train_test_split(
        df7, label, test_size=0.2, stratify=label, random_state=iteration
    )

    # No oversampling: weight the positive class by the negative/positive
    # ratio of the untouched training set.
    class_counts = train_y.value_counts()
    scale_weight = class_counts[0] / class_counts[1]

    xgb_model = XGBClassifier(
        use_label_encoder=False,
        eval_metric='logloss',
        scale_pos_weight=scale_weight
    )

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=iteration)

    grid = GridSearchCV(
        estimator=xgb_model,
        param_grid=param_grid,
        cv=cv,
        scoring='f1',
        n_jobs=-1,
        verbose=0
    )

    grid.fit(train_x, train_y)

    # best_estimator_ is the best candidate refit on the whole training set.
    best_model = grid.best_estimator_
    y_pred = best_model.predict(test_x)

    results.append({
        'Iteration': iteration + 1,
        'Accuracy': round(accuracy_score(test_y, y_pred), 3),
        'Recall': round(recall_score(test_y, y_pred), 3),
        'F1 Score': round(f1_score(test_y, y_pred), 3),
        'Precision': round(precision_score(test_y, y_pred), 3),
        'Best Params': grid.best_params_
    })

    # Checkpoint: rewrite the CSV with every round completed so far.
    pd.DataFrame(results).to_csv(results_path, index=False)

# Final save (same content as the last checkpoint; also covers zero rounds).
results_df = pd.DataFrame(results)
results_df.to_csv(results_path, index=False)


---------- Iteration 1 ----------


KeyboardInterrupt: 

In [10]:
%%time
import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import warnings

# Suppress every warning category for the remainder of the session.
warnings.filterwarnings('ignore')

# Load the PolitiFact feature tables.
r1 = pd.read_csv('E:/6个数据集数据汇总/politifact_fake.csv')
t = pd.read_csv('E:/6个数据集数据汇总/politifact_real.csv')

# Keep only the first 220 fake rows.
# NOTE(review): 220 is presumably chosen relative to the number of real rows
# -- confirm the intended class ratio.
# .copy() makes `r` an independent frame, so adding the label column below
# is not a chained assignment on a slice of `r1` (SettingWithCopy) and never
# touches `r1`.
r = r1.iloc[:220].copy()

# Fake rows are the positive class (1), real rows the negative class (0).
r['label'] = 1
t['label'] = 0
df = pd.concat([t, r], ignore_index=True)

# Min-max scale every column between the first two columns and the trailing
# label column.
# NOTE(review): the min/max are taken over the full table (before any
# train/test split), and a constant column yields 0/0 = NaN -- confirm both
# are acceptable for this experiment.
df1 = df.iloc[:, 2:-1].apply(lambda x: (x - x.min()) / (x.max() - x.min()))
df7 = df1[['m3_1', 'm3_2', 'm4_1', 'm4_2',
           'm4_3', 'm4_4', 'm5_1', 'm5_2',
           'm5_3', 'm5_4', 'm5_5', 'm5_6',
           'm5_7', 'm5_8', 'm5_9']]
label = df['label']

# One dict of metrics is appended here per experiment round.
results = []
# Repeated hold-out experiment on the PolitiFact features: 20 rounds, each a
# stratified 80/20 split plus a 5-fold accuracy-scored grid search.

# Fixed, empirically chosen positive-class weight (same in every round).
scale_weight = 5

# Larger-capacity models with L1/L2 regularisation; scale_pos_weight is
# passed through the grid as a single-value entry.
param_grid = {
    'n_estimators': [200, 300],
    'max_depth': [5, 10],
    'min_child_weight': [2, 4],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.8, 1.0],
    'reg_lambda': [0.1, 1.0],
    'reg_alpha': [0.1, 1.0],
    'scale_pos_weight': [scale_weight],
}

# Test-set metrics, in the column order used by the results table.
metric_fns = (
    ('Accuracy', accuracy_score),
    ('Recall', recall_score),
    ('F1 Score', f1_score),
    ('Precision', precision_score),
)

for iteration in range(20):
    print(f'---------- Iteration {iteration+1} ----------')

    # Stratified train/test split; the round index doubles as the seed.
    train_x, test_x, train_y, test_y = train_test_split(
        df7,
        label,
        test_size=0.2,
        stratify=label,
        random_state=iteration,
    )

    xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=iteration)

    # Model selection optimises accuracy.
    grid = GridSearchCV(
        estimator=xgb_model,
        param_grid=param_grid,
        cv=cv,
        scoring='accuracy',
        n_jobs=-1,
        verbose=0,
    )
    grid.fit(train_x, train_y)

    # Evaluate the refit best candidate on the held-out 20%.
    best_model = grid.best_estimator_
    y_pred = best_model.predict(test_x)

    row = {'Iteration': iteration + 1}
    for metric_name, metric_fn in metric_fns:
        row[metric_name] = round(metric_fn(test_y, y_pred), 3)
    row['Best Params'] = grid.best_params_
    results.append(row)

# Persist one row per round.
results_df = pd.DataFrame(results)
results_df.to_csv('MMTD_politifact_weighted_no_oversample_5fold1.csv', index=False)

---------- Iteration 1 ----------
---------- Iteration 2 ----------
---------- Iteration 3 ----------
---------- Iteration 4 ----------
---------- Iteration 5 ----------
---------- Iteration 6 ----------
---------- Iteration 7 ----------
---------- Iteration 8 ----------
---------- Iteration 9 ----------
---------- Iteration 10 ----------
---------- Iteration 11 ----------
---------- Iteration 12 ----------
---------- Iteration 13 ----------
---------- Iteration 14 ----------
---------- Iteration 15 ----------
---------- Iteration 16 ----------
---------- Iteration 17 ----------
---------- Iteration 18 ----------
---------- Iteration 19 ----------
---------- Iteration 20 ----------
CPU times: total: 35.1 s
Wall time: 9min 46s


In [15]:
%%time
import warnings

import numpy as np
import pandas as pd
import xgboost as xgb
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from xgboost import XGBClassifier

# Suppress every warning category for the remainder of the session.
warnings.filterwarnings('ignore')

# Load the GossipCop tables; fake rows are the positive class (1), real rows
# the negative class (0).
r = pd.read_csv('E:/6个数据集数据汇总/gossipcop_fake.csv')
r['label'] = 1
t = pd.read_csv('E:/6个数据集数据汇总/gossipcop_real.csv')
t['label'] = 0

# Real rows first, fake rows second, re-indexed 0..n-1.
df = pd.concat([t, r], ignore_index=True)

# Min-max scale every column between the first two columns and the trailing
# label column.
# NOTE(review): the min/max are taken over the full table (before any
# train/test split), and a constant column yields 0/0 = NaN -- confirm both
# are acceptable for this experiment.
unscaled = df.iloc[:, 2:-1]
lowest = unscaled.min()
highest = unscaled.max()
df1 = unscaled.sub(lowest, axis='columns').div(highest - lowest, axis='columns')

# The 15 model inputs: m3_1..m3_2, m4_1..m4_4, m5_1..m5_9.
selected_features = ['m3_1', 'm3_2']
selected_features += [f'm4_{k}' for k in range(1, 5)]
selected_features += [f'm5_{k}' for k in range(1, 10)]
df7 = df1[selected_features]
label = df['label']

# One dict of metrics is appended here per experiment round.
results = []

# Repeated hold-out experiment on the GossipCop features: 20 rounds, each a
# stratified 80/20 split plus a 5-fold F1-scored grid search with light
# random oversampling of the minority class.
#
# The oversampler is a pipeline step rather than a pre-processing pass.
# Oversampling the training set *before* cross-validation puts copies of the
# same minority rows on both sides of a fold split, which inflates the CV
# score and biases hyper-parameter selection. Inside an imblearn Pipeline the
# sampler runs only when fitting (per training fold, and once more for the
# final refit); validation folds and the test set are never resampled.

# Hyper-parameter grid, addressed to the 'clf' pipeline step.
param_grid = {
    'clf__n_estimators': [100, 150],
    'clf__max_depth': [5, 10],
    'clf__min_child_weight': [2, 4],
    'clf__learning_rate': [0.05, 0.1],
    'clf__subsample': [0.8, 1.0],
    'clf__reg_lambda': [0.1, 1.0]
}

for iteration in range(20):
    print(f'---------- Iteration {iteration+1} ----------')

    # Stratified train/test split; the round index doubles as the seed.
    train_x, test_x, train_y, test_y = train_test_split(
        df7, label, test_size=0.2, stratify=label, random_state=iteration
    )

    # Light oversampling: grow the minority class to 75% of the majority size.
    ros = RandomOverSampler(sampling_strategy=0.75, random_state=iteration)

    # Resample the training set once, only to measure the class ratio after
    # oversampling; that ratio becomes scale_pos_weight. The model itself is
    # fitted through the pipeline below, not on these resampled arrays.
    train_x_res, train_y_res = ros.fit_resample(train_x, train_y)
    resampled_counts = train_y_res.value_counts()
    scale_weight = resampled_counts[0] / resampled_counts[1]

    xgb_model = XGBClassifier(
        use_label_encoder=False,
        eval_metric='logloss',
        scale_pos_weight=scale_weight
    )

    # GridSearchCV clones the pipeline for every fit, so reusing `ros` here
    # is safe.
    model = Pipeline([('ros', ros), ('clf', xgb_model)])

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=iteration)

    grid = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=cv,
        scoring='f1',
        n_jobs=-1,
        verbose=0
    )

    # Fit on the ORIGINAL training set; resampling happens inside each fit.
    grid.fit(train_x, train_y)

    # best_estimator_ is the best pipeline refit on the whole (internally
    # oversampled) training set; predict() bypasses the sampler step.
    best_model = grid.best_estimator_
    y_pred = best_model.predict(test_x)

    # Drop the 'clf__' step prefix so the saved parameter names are the plain
    # XGBoost ones.
    best_params = {
        key.split('__', 1)[1]: value
        for key, value in grid.best_params_.items()
    }

    results.append({
        'Iteration': iteration + 1,
        'Accuracy': round(accuracy_score(test_y, y_pred), 3),
        'Recall': round(recall_score(test_y, y_pred), 3),
        'F1 Score': round(f1_score(test_y, y_pred), 3),
        'Precision': round(precision_score(test_y, y_pred), 3),
        'Best Params': best_params
    })

# Persist one row per round.
results_df = pd.DataFrame(results)
results_df.to_csv('MMTD_gossipcop_weighted_oversample_5fold.csv', index=False)


---------- Iteration 1 ----------
---------- Iteration 2 ----------
---------- Iteration 3 ----------
---------- Iteration 4 ----------
---------- Iteration 5 ----------
---------- Iteration 6 ----------
---------- Iteration 7 ----------
---------- Iteration 8 ----------
---------- Iteration 9 ----------
---------- Iteration 10 ----------
---------- Iteration 11 ----------
---------- Iteration 12 ----------
---------- Iteration 13 ----------
---------- Iteration 14 ----------
---------- Iteration 15 ----------
---------- Iteration 16 ----------
---------- Iteration 17 ----------
---------- Iteration 18 ----------
---------- Iteration 19 ----------
---------- Iteration 20 ----------
CPU times: total: 22min 52s
Wall time: 9min 59s
