In [4]:
import pandas as pd
train_data = pd.read_csv('./data/eastmoney_bpr_train.csv', delimiter='<')
valid_data = pd.read_csv('./data/eastmoney_bpr_valid.csv', delimiter='<')
test_data = pd.read_csv('./data/eastmoney_bpr_test.csv')

In [5]:
train_data.columns

Index(['item_title', 'item_author_cate', 'article_author',
       'article_source_cate', 'month', 'eastmoney_robo_journalism',
       'media_robo_journalism', 'SMA_robo_journalism', 'viral',
       'sentiment_score', 'topics_val1', 'topics_val2', 'topics_val3',
       'topics_val4', 'topics_val5', 'stock_code', 'IndustryCode1',
       'IndustryName1', 'IndustryCode2', 'IndustryName2',
       'item_author_reduced', 'article_author_reduced',
       'article_source_reduced', 'stock_code_index', 'item_author_cate_index',
       'article_author_index', 'article_source_cate_index', 'month_index',
       'IndustryCode1_index', 'IndustryCode2_index',
       'eastmoney_robo_journalism_index', 'media_robo_journalism_index',
       'SMA_robo_journalism_index', 'item_author_reduced_index',
       'article_author_reduced_index', 'article_source_reduced_index',
       'neg_item_title', 'neg_item_author_cate', 'neg_article_author',
       'neg_article_source_cate', 'neg_month', 'neg_eastmoney_robo_jo

# Use machine learning models to predict virality

## XGboost

In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ndcg_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV

# 划分特征和目标变量
train_x = train_data[['']]
train_y = train_data['viral']
valid_x = valid_data[['']]
valid_y = valid_data['viral']
test_x = test_data[['']]
test_y = test_data['viral']

# 初始化分类器
model1 = xgb.XGBClassifier()

# 参数网格
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.3, 0.5, 0.7]
    }

# 设置GridSearchCV
grid_search = GridSearchCV(model1, param_grid, scoring='accuracy', cv=3, verbose=3)
grid_search.fit(train_x, train_y)

# 使用最佳参数的模型
best_model = grid_search.best_estimator_

# 建立XGBoost模型
# 从GridSearchCV获得最佳参数设置
model1 = xgb.XGBClassifier(
    learning_rate=best_model.learning_rate,
    n_estimators=best_model.n_estimators,
    max_depth= best_model.max_depth,
    subsample=best_model.subsample,
    colsample_bytree=best_model.colsample_bytree
    )
model1.fit(train_x, train_y)

# 预测
test_pred = model1.predict(test_x)

Fitting 3 folds for each of 243 candidates, totalling 729 fits
[CV 1/3] END colsample_bytree=0.3, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8;, score=0.927 total time=   7.0s
[CV 2/3] END colsample_bytree=0.3, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8;, score=0.927 total time=   5.6s
[CV 3/3] END colsample_bytree=0.3, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8;, score=0.927 total time=   7.9s
[CV 1/3] END colsample_bytree=0.3, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.9;, score=0.927 total time=   5.1s
[CV 2/3] END colsample_bytree=0.3, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.9;, score=0.927 total time=   6.9s
[CV 3/3] END colsample_bytree=0.3, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.9;, score=0.927 total time=   5.1s
[CV 1/3] END colsample_bytree=0.3, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1.0;, score=0.927 total time=   5.0s
[CV 2/

In [None]:
# 评估：accuracy
print("Classification Report:\n", classification_report(test_y, test_pred))
conf_matrix = confusion_matrix(test_y, test_pred)
print("Confusion Matrix:\n", conf_matrix)

Classification Report:
               precision    recall  f1-score   support

         0.0       0.98      0.99      0.98   1886055
         1.0       0.69      0.50      0.58     95054

    accuracy                           0.97   1981109
   macro avg       0.83      0.75      0.78   1981109
weighted avg       0.96      0.97      0.96   1981109

Confusion Matrix:
 [[1864714   21341]
 [  47320   47734]]


In [None]:
# 评估：ndcg
print("NDCG@1:\n", ndcg_score(test_y, test_pred, k=1))
print("NDCG@5:\n", ndcg_score(test_y, test_pred, k=5))
print("NDCG@10:\n", ndcg_score(test_y, test_pred, k=10))
print("NDCG:\n", ndcg_score(test_y, test_pred))

Feature Importance

In [None]:
# 获取特征的重要性
feature_importance = model1.feature_importances_

# 将特征重要性与特征名称进行配对，并排序
feature_importance_dict = dict(zip(X1.columns, feature_importance))
sorted_feature_importance = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)

# 打印特征重要性
print("Feature importances:\n", sorted_feature_importance)

Feature importances:
 [('eastmoney_robo_journalism', 0.28605458), ('dominant_topic', 0.25442684), ('article_source_cate', 0.14622656), ('SMA_robo_journalism', 0.12381081), ('media_robo_journalism', 0.0749477), ('exclamation_mark', 0.03831342), ('colon_mark', 0.031960607), ('article_author', 0.022018924), ('month', 0.009027952), ('question_mark', 0.008393501), ('sentiment_score', 0.004819168)]


## Random Forest

Due to network issues, the process of choosing best hyparameters was run in three separate runs.

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# 数据准备
X3 = selected_data
y3 = eastmoney['viral']

# 划分训练集和测试集
X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, test_size=0.3, random_state=42)

# 参数范围设置
max_depth_options = [10, 20]
n_estimators_options = [50, 80, 100, 150, 200]

best_score = 0
best_params = {'max_depth': None, 'n_estimators': 100}

for max_depth in max_depth_options:
    for n_estimators in n_estimators_options:
        # 初始化随机森林分类器
        model = RandomForestClassifier(max_depth=max_depth, n_estimators=n_estimators, random_state=42)

        # 使用交叉验证计算评分
        scores = cross_val_score(model, X3_train, y3_train, cv=5, scoring='accuracy')
        average_score = np.mean(scores)

        print(f"Testing {n_estimators} trees with max depth of {max_depth}. Mean CV Accuracy: {average_score}")

        # 检查并更新最佳得分和参数
        if average_score > best_score:
            best_score = average_score
            best_params = {'max_depth': max_depth, 'n_estimators': n_estimators}

Testing 50 trees with max depth of 10. Mean CV Accuracy: 0.9609995441473013
Testing 80 trees with max depth of 10. Mean CV Accuracy: 0.9608366483388011
Testing 100 trees with max depth of 10. Mean CV Accuracy: 0.9609056573044837
Testing 150 trees with max depth of 10. Mean CV Accuracy: 0.9610417283783352
Testing 200 trees with max depth of 10. Mean CV Accuracy: 0.9611682808418918
Testing 50 trees with max depth of 20. Mean CV Accuracy: 0.9645174877188216
Testing 80 trees with max depth of 20. Mean CV Accuracy: 0.9645678924000126
Testing 100 trees with max depth of 20. Mean CV Accuracy: 0.9645996927562314
Testing 150 trees with max depth of 20. Mean CV Accuracy: 0.9646053173199325


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# 数据准备
X3 = selected_data
y3 = eastmoney['viral']

# 划分训练集和测试集
X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, test_size=0.3, random_state=42)

# 参数范围设置
max_depth_options = 30
n_estimators_options = [50, 80, 100, 150]

best_score = 0
best_params = {'max_depth': None, 'n_estimators': 100}

for n_estimators in n_estimators_options:
    # 初始化随机森林分类器
    model = RandomForestClassifier(max_depth=30, n_estimators=n_estimators, random_state=42)

    # 使用交叉验证计算评分
    scores = cross_val_score(model, X3_train, y3_train, cv=5, scoring='accuracy')
    average_score = np.mean(scores)

    print(f"Testing {n_estimators} trees with max depth of 30. Mean CV Accuracy: {average_score}")

Testing 50 trees with max depth of 30. Mean CV Accuracy: 0.9602871725386375
Testing 80 trees with max depth of 30. Mean CV Accuracy: 0.9603044788586083


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

X3 = selected_data
y3 = eastmoney['viral']

# 划分训练集和测试集
X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, test_size=0.3, random_state=42)

# 参数范围设置
max_depth_options = 30
n_estimators_options = [100, 150]

best_score = 0
best_params = {'max_depth': None, 'n_estimators': 100}

for n_estimators in n_estimators_options:
    # 初始化随机森林分类器
    model = RandomForestClassifier(max_depth=30, n_estimators=n_estimators, random_state=42)

    # 使用交叉验证计算评分
    scores = cross_val_score(model, X3_train, y3_train, cv=5, scoring='accuracy')
    average_score = np.mean(scores)

    print(f"Testing {n_estimators} trees with max depth of 30. Mean CV Accuracy: {average_score}")

Testing 100 trees with max depth of 30. Mean CV Accuracy: 0.9603362792253567
Testing 150 trees with max depth of 30. Mean CV Accuracy: 0.9603241647952935


In [None]:
# 使用最佳参数训练模型
best_model = RandomForestClassifier(max_depth=20, n_estimators=80, random_state=42)
best_model.fit(X3_train, y3_train)
y3_pred = best_model.predict(X3_test)

# 评估
print("Classification Report:\n", classification_report(y3_test, y3_pred))
conf_matrix = confusion_matrix(y3_test, y3_pred)
print("Confusion Matrix:\n", conf_matrix)

Classification Report:
               precision    recall  f1-score   support

         0.0       0.97      0.99      0.98   1886055
         1.0       0.70      0.46      0.56     95054

    accuracy                           0.96   1981109
   macro avg       0.84      0.73      0.77   1981109
weighted avg       0.96      0.96      0.96   1981109

Confusion Matrix:
 [[1866855   19200]
 [  50934   44120]]


In [None]:
# 获取特征的重要性
feature_importance = best_model.feature_importances_

# 将特征重要性与特征名称进行配对，并排序
feature_importance_dict = dict(zip(X3.columns, feature_importance))
sorted_feature_importance = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)

print("Feature importances:\n", sorted_feature_importance)

Feature importances:
 [('article_source_cate', 0.41732270291966256), ('sentiment_score', 0.12940403496400923), ('dominant_topic', 0.12905900539845616), ('article_author', 0.07496864730387429), ('SMA_robo_journalism', 0.05873889168302253), ('media_robo_journalism', 0.050267501669162495), ('month', 0.049556268619000206), ('colon_mark', 0.036353485346739024), ('eastmoney_robo_journalism', 0.03463149453425724), ('exclamation_mark', 0.014866902239271038), ('question_mark', 0.004831065322545256)]


## Logistic Regression

Add class weights

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.utils import class_weight
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
import pandas as pd

selected_data = eastmoney[columns]
selected_data.columns = selected_data.columns.str.strip()

# 对 dominant_topic 进行独热编码
encoder = OneHotEncoder(sparse_output=False)
dominant_topic_encoded = encoder.fit_transform(selected_data[['dominant_topic']])
encoded_df = pd.DataFrame(dominant_topic_encoded, columns=[f"topic_{i}" for i in range(dominant_topic_encoded.shape[1])])
selected_data = pd.concat([selected_data.drop('dominant_topic', axis=1), encoded_df], axis=1)

X2 = selected_data
y2 = eastmoney['viral']

# 标准化
scaler = StandardScaler()
sentiment_scores = X2['sentiment_score'].values.reshape(-1, 1)
X2['sentiment_score'] = scaler.fit_transform(sentiment_scores)

# 划分训练集和测试集
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.3, random_state=42)

# 计算类权重
class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(y2_train), y=y2_train)
weights = {i : class_weights[i] for i in range(len(class_weights))}

# 将类权重添加到逻辑回归模型
logistic_model = LogisticRegression(class_weight=weights)

# 可能选择更多特征
selector = RFE(logistic_model, n_features_to_select=5, step=1)
selector.fit(X2_train, y2_train)


# 定义要搜索的参数网格
param_grid = {
    'C': np.logspace(-4, 4, 10),
    'penalty': ['l2'],
    'solver': ['liblinear', 'lbfgs']
}

# 设置GridSearchCV
grid_search = GridSearchCV(logistic_model, param_grid, cv=3, scoring='accuracy', verbose=3)
grid_search.fit(X2_train, y2_train)

# 使用最佳参数的模型
best_model = grid_search.best_estimator_

# 预测
y2_pred = best_model.predict(X2_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV 1/3] END C=9.999999999999999e-05, penalty=l2, solver=liblinear;, score=0.851 total time=  15.7s
[CV 2/3] END C=9.999999999999999e-05, penalty=l2, solver=liblinear;, score=0.850 total time=  15.7s
[CV 3/3] END C=9.999999999999999e-05, penalty=l2, solver=liblinear;, score=0.850 total time=  15.7s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 1/3] END C=9.999999999999999e-05, penalty=l2, solver=lbfgs;, score=0.838 total time=  21.0s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 2/3] END C=9.999999999999999e-05, penalty=l2, solver=lbfgs;, score=0.838 total time=  21.9s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 3/3] END C=9.999999999999999e-05, penalty=l2, solver=lbfgs;, score=0.837 total time=  20.3s
[CV 1/3] END C=0.000774263682681127, penalty=l2, solver=liblinear;, score=0.853 total time=  16.2s
[CV 2/3] END C=0.000774263682681127, penalty=l2, solver=liblinear;, score=0.853 total time=  16.6s
[CV 3/3] END C=0.000774263682681127, penalty=l2, solver=liblinear;, score=0.853 total time=  16.6s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 1/3] END C=0.000774263682681127, penalty=l2, solver=lbfgs;, score=0.839 total time=  22.4s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 2/3] END C=0.000774263682681127, penalty=l2, solver=lbfgs;, score=0.839 total time=  21.9s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 3/3] END C=0.000774263682681127, penalty=l2, solver=lbfgs;, score=0.825 total time=  20.7s
[CV 1/3] END C=0.005994842503189409, penalty=l2, solver=liblinear;, score=0.852 total time=  16.3s
[CV 2/3] END C=0.005994842503189409, penalty=l2, solver=liblinear;, score=0.852 total time=  15.9s
[CV 3/3] END C=0.005994842503189409, penalty=l2, solver=liblinear;, score=0.852 total time=  16.3s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 1/3] END C=0.005994842503189409, penalty=l2, solver=lbfgs;, score=0.839 total time=  20.8s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 2/3] END C=0.005994842503189409, penalty=l2, solver=lbfgs;, score=0.839 total time=  22.0s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 3/3] END C=0.005994842503189409, penalty=l2, solver=lbfgs;, score=0.838 total time=  21.1s
[CV 1/3] END C=0.046415888336127774, penalty=l2, solver=liblinear;, score=0.852 total time=  16.3s
[CV 2/3] END C=0.046415888336127774, penalty=l2, solver=liblinear;, score=0.852 total time=  15.9s
[CV 3/3] END C=0.046415888336127774, penalty=l2, solver=liblinear;, score=0.852 total time=  16.6s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 1/3] END C=0.046415888336127774, penalty=l2, solver=lbfgs;, score=0.839 total time=  21.6s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 2/3] END C=0.046415888336127774, penalty=l2, solver=lbfgs;, score=0.831 total time=  20.8s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 3/3] END C=0.046415888336127774, penalty=l2, solver=lbfgs;, score=0.838 total time=  20.1s
[CV 1/3] END C=0.3593813663804626, penalty=l2, solver=liblinear;, score=0.852 total time=  16.6s
[CV 2/3] END C=0.3593813663804626, penalty=l2, solver=liblinear;, score=0.852 total time=  15.7s
[CV 3/3] END C=0.3593813663804626, penalty=l2, solver=liblinear;, score=0.852 total time=  16.1s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 1/3] END C=0.3593813663804626, penalty=l2, solver=lbfgs;, score=0.839 total time=  19.8s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 2/3] END C=0.3593813663804626, penalty=l2, solver=lbfgs;, score=0.839 total time=  21.5s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 3/3] END C=0.3593813663804626, penalty=l2, solver=lbfgs;, score=0.838 total time=  20.1s
[CV 1/3] END C=2.782559402207126, penalty=l2, solver=liblinear;, score=0.852 total time=  16.7s
[CV 2/3] END C=2.782559402207126, penalty=l2, solver=liblinear;, score=0.852 total time=  15.9s
[CV 3/3] END C=2.782559402207126, penalty=l2, solver=liblinear;, score=0.852 total time=  19.0s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 1/3] END C=2.782559402207126, penalty=l2, solver=lbfgs;, score=0.838 total time=  19.5s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 2/3] END C=2.782559402207126, penalty=l2, solver=lbfgs;, score=0.839 total time=  21.2s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 3/3] END C=2.782559402207126, penalty=l2, solver=lbfgs;, score=0.838 total time=  19.9s
[CV 1/3] END C=21.54434690031882, penalty=l2, solver=liblinear;, score=0.852 total time=  16.8s
[CV 2/3] END C=21.54434690031882, penalty=l2, solver=liblinear;, score=0.852 total time=  16.0s
[CV 3/3] END C=21.54434690031882, penalty=l2, solver=liblinear;, score=0.852 total time=  16.8s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 1/3] END C=21.54434690031882, penalty=l2, solver=lbfgs;, score=0.839 total time=  19.9s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 2/3] END C=21.54434690031882, penalty=l2, solver=lbfgs;, score=0.839 total time=  21.0s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 3/3] END C=21.54434690031882, penalty=l2, solver=lbfgs;, score=0.838 total time=  20.0s
[CV 1/3] END C=166.81005372000556, penalty=l2, solver=liblinear;, score=0.852 total time=  16.6s
[CV 2/3] END C=166.81005372000556, penalty=l2, solver=liblinear;, score=0.852 total time=  15.8s
[CV 3/3] END C=166.81005372000556, penalty=l2, solver=liblinear;, score=0.852 total time=  16.2s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 1/3] END C=166.81005372000556, penalty=l2, solver=lbfgs;, score=0.839 total time=  19.7s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 2/3] END C=166.81005372000556, penalty=l2, solver=lbfgs;, score=0.839 total time=  20.8s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 3/3] END C=166.81005372000556, penalty=l2, solver=lbfgs;, score=0.838 total time=  20.4s
[CV 1/3] END C=1291.5496650148827, penalty=l2, solver=liblinear;, score=0.852 total time=  16.6s
[CV 2/3] END C=1291.5496650148827, penalty=l2, solver=liblinear;, score=0.852 total time=  16.0s
[CV 3/3] END C=1291.5496650148827, penalty=l2, solver=liblinear;, score=0.852 total time=  16.6s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 1/3] END C=1291.5496650148827, penalty=l2, solver=lbfgs;, score=0.839 total time=  19.8s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 2/3] END C=1291.5496650148827, penalty=l2, solver=lbfgs;, score=0.839 total time=  21.0s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 3/3] END C=1291.5496650148827, penalty=l2, solver=lbfgs;, score=0.838 total time=  20.5s
[CV 1/3] END C=10000.0, penalty=l2, solver=liblinear;, score=0.852 total time=  17.0s
[CV 2/3] END C=10000.0, penalty=l2, solver=liblinear;, score=0.852 total time=  16.1s
[CV 3/3] END C=10000.0, penalty=l2, solver=liblinear;, score=0.852 total time=  16.5s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 1/3] END C=10000.0, penalty=l2, solver=lbfgs;, score=0.840 total time=  21.4s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 2/3] END C=10000.0, penalty=l2, solver=lbfgs;, score=0.839 total time=  22.3s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 3/3] END C=10000.0, penalty=l2, solver=lbfgs;, score=0.838 total time=  21.6s


In [None]:
# 评估
print("Classification Report:\n", classification_report(y2_test, y2_pred))
conf_matrix = confusion_matrix(y2_test, y2_pred)
print("Confusion Matrix:\n", conf_matrix)

Classification Report:
               precision    recall  f1-score   support

         0.0       0.99      0.85      0.92   1886055
         1.0       0.23      0.91      0.37     95054

    accuracy                           0.85   1981109
   macro avg       0.61      0.88      0.64   1981109
weighted avg       0.96      0.85      0.89   1981109

Confusion Matrix:
 [[1602520  283535]
 [   8921   86133]]


Feature importance

In [None]:
feature_importance = list(zip(X2_train.columns, selector.ranking_))
feature_importance.sort(key=lambda x: x[1])
print("Feature importances:\n", feature_importance)

Feature importances:
 [('eastmoney_robo_journalism', 1), ('SMA_robo_journalism', 1), ('colon_mark', 1), ('topic_0', 1), ('topic_4', 1), ('topic_3', 2), ('media_robo_journalism', 3), ('topic_2', 4), ('exclamation_mark', 5), ('question_mark', 6), ('topic_1', 7), ('month', 8), ('sentiment_score', 9), ('article_source_cate', 10), ('article_author', 11)]


Without class weights

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.utils import class_weight
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

selected_data = eastmoney[columns]
selected_data.columns = selected_data.columns.str.strip()

# 对 dominant_topic 进行独热编码
encoder = OneHotEncoder(sparse_output=False)
dominant_topic_encoded = encoder.fit_transform(selected_data[['dominant_topic']])
encoded_df = pd.DataFrame(dominant_topic_encoded, columns=[f"topic_{i}" for i in range(dominant_topic_encoded.shape[1])])
selected_data = pd.concat([selected_data.drop('dominant_topic', axis=1), encoded_df], axis=1)

X2 = selected_data
y2 = eastmoney['viral']

# 使用标准化
scaler = StandardScaler()
sentiment_scores = X2['sentiment_score'].values.reshape(-1, 1)
X2['sentiment_score'] = scaler.fit_transform(sentiment_scores)

# 划分训练集和测试集
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.3, random_state=42)

logistic_model = LogisticRegression(max_iter=1000)

# 可能选择更多特征
selector = RFE(logistic_model, n_features_to_select=5, step=1)
selector.fit(X2_train, y2_train)


# 定义要搜索的参数网格
param_grid = {
    'C': np.logspace(-4, 4, 10),
    'penalty': ['l2'],
    'solver': ['liblinear', 'lbfgs']
}

# 设置GridSearchCV
grid_search = GridSearchCV(logistic_model, param_grid, cv=3, scoring='accuracy', verbose=3)
grid_search.fit(X2_train, y2_train)

# 使用最佳参数的模型
best_model = grid_search.best_estimator_

# 预测
y2_pred = best_model.predict(X2_test)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV 1/3] END C=9.999999999999999e-05, penalty=l2, solver=liblinear;, score=0.952 total time=  17.6s
[CV 2/3] END C=9.999999999999999e-05, penalty=l2, solver=liblinear;, score=0.952 total time=  18.5s
[CV 3/3] END C=9.999999999999999e-05, penalty=l2, solver=liblinear;, score=0.952 total time=  17.4s
[CV 1/3] END C=9.999999999999999e-05, penalty=l2, solver=lbfgs;, score=0.952 total time=  30.7s
[CV 2/3] END C=9.999999999999999e-05, penalty=l2, solver=lbfgs;, score=0.952 total time=  51.9s
[CV 3/3] END C=9.999999999999999e-05, penalty=l2, solver=lbfgs;, score=0.952 total time= 1.0min
[CV 1/3] END C=0.000774263682681127, penalty=l2, solver=liblinear;, score=0.952 total time=  18.5s
[CV 2/3] END C=0.000774263682681127, penalty=l2, solver=liblinear;, score=0.952 total time=  18.6s
[CV 3/3] END C=0.000774263682681127, penalty=l2, solver=liblinear;, score=0.952 total time=  18.6s
[CV 1/3] END C=0.000774263682681127, penalty=l2, solve

In [None]:
# 评估
print("Classification Report:\n", classification_report(y2_test, y2_pred))
conf_matrix = confusion_matrix(y2_test, y2_pred)
print("Confusion Matrix:\n", conf_matrix)

Classification Report:
               precision    recall  f1-score   support

         0.0       0.95      1.00      0.98   1886055
         1.0       0.55      0.04      0.07     95054

    accuracy                           0.95   1981109
   macro avg       0.75      0.52      0.52   1981109
weighted avg       0.93      0.95      0.93   1981109

Confusion Matrix:
 [[1883050    3005]
 [  91393    3661]]


Feature importance

In [None]:
feature_importance = list(zip(X2_train.columns, selector.ranking_))
feature_importance.sort(key=lambda x: x[1])
print("Feature importances:\n", feature_importance)

Feature importances:
 [('eastmoney_robo_journalism', 1), ('SMA_robo_journalism', 1), ('colon_mark', 1), ('topic_3', 1), ('topic_4', 1), ('topic_0', 2), ('media_robo_journalism', 3), ('topic_1', 4), ('question_mark', 5), ('topic_2', 6), ('exclamation_mark', 7), ('sentiment_score', 8), ('month', 9), ('article_source_cate', 10), ('article_author', 11)]
