In [1]:
from utils import *
import warnings
warnings.filterwarnings('ignore')

# Load the two splits used throughout the notebook. The reader helpers are not
# defined here; presumably they come from the star import of `utils` above.
# NOTE(review): later cells index these with ['content'], ['label'] and
# ['name'], so they are expected to be DataFrame-like — confirm against utils.
train_data = read_train_data()
test_data = read_test_data()

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Each document's 'content' is joined into one whitespace-separated string so
# the vectorizer can tokenize it. Build the joined text once per split rather
# than re-running the same apply() for every fit/transform call.
# NOTE(review): this assumes 'content' holds sequences of token strings — confirm.
train_text = train_data['content'].apply(lambda tokens: ' '.join(tokens))
test_text = test_data['content'].apply(lambda tokens: ' '.join(tokens))

# Unigram + bigram TF-IDF, capped at 5000 features. This is the only
# configuration whose output is used downstream, so it is the only one fitted.
# NOTE(review): scikit-learn's default token_pattern keeps only tokens of two
# or more characters — confirm that dropping 1-character tokens is intended.
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)

# The vocabulary and IDF weights are learned from the training split only;
# the test split is projected with the already-fitted vectorizer.
train_tfidf_feat = tfidf.fit_transform(train_text)
test_tfidf_feat = tfidf.transform(test_text)

In [3]:
# Sanity check: the TF-IDF matrix should have one row per training document.
train_data['content'].shape, train_tfidf_feat.shape

((14000,), (14000, 5000))

In [4]:
# Number of test documents, for comparison with the training split above.
test_data['content'].shape

(10000,)

In [5]:
# Inspect the terms kept by the fitted vectorizer (single tokens and
# two-token n-grams, given ngram_range=(1, 2)).
tfidf.get_feature_names_out()

array(['1000', '1000 123', '1000 1318', ..., '998 4982', '998 5212',
       '998 831'], dtype=object)

# 线性模型

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_predict, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Search space for logistic regression: regularisation strength C and the
# solver's iteration cap.
param_grid = dict(
    C=[0.001, 0.01, 0.1, 1, 10, 100],
    max_iter=[100, 500, 1000],
)

# Exhaustive 5-fold grid search over the TF-IDF training features.
model = LogisticRegression()
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
grid_search.fit(train_tfidf_feat, train_data['label'])
best = grid_search.best_estimator_

# Out-of-fold predictions from a fresh, unfitted model that carries the winning
# settings (cv is left at the library default).
# NOTE(review): the hyperparameters were selected on these same rows, so the
# report below is an optimistic estimate of generalisation.
val_pred = cross_val_predict(
    estimator=LogisticRegression(**best.get_params()),
    X=train_tfidf_feat,
    y=train_data['label'],
)

# Per-class precision / recall / F1 on the out-of-fold predictions.
print(classification_report(train_data['label'], val_pred, digits=3))

              precision    recall  f1-score   support

           0      0.996     0.999     0.998     11836
           1      0.994     0.981     0.987      2164

    accuracy                          0.996     14000
   macro avg      0.995     0.990     0.993     14000
weighted avg      0.996     0.996     0.996     14000



In [7]:
# Produce the test-set results: train a new logistic regression with the
# selected settings on the full training matrix, then label the test rows.
m = LogisticRegression(**best.get_params()).fit(
    train_tfidf_feat,
    train_data['label'],
)
test_pred = m.predict(test_tfidf_feat)

# Writes the predictions into test_data in place (adds or overwrites 'label')
# and exports the name/label pairs without the index column.
test_data['label'] = test_pred
test_data[['name', 'label']].to_csv(
    'lr.csv',
    index=None,
)

# XGBoost 模型

In [10]:
import xgboost as xgb
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report

# Search space for the gradient-boosted trees: tree depth, number of boosting
# rounds and shrinkage.
param_grid = dict(
    max_depth=[3, 5, 7],
    n_estimators=[50, 100, 200],
    learning_rate=[0.1, 0.3, 0.5],
)

# Exhaustive 5-fold grid search on the TF-IDF training features.
# GridSearchCV is not imported in this cell; it relies on the import made in
# the logistic-regression cell earlier in the notebook.
xgb_model = xgb.XGBClassifier()
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5)
grid_search.fit(train_tfidf_feat, train_data['label'])
best = grid_search.best_estimator_

# Out-of-fold predictions from a fresh classifier configured with the winning
# settings (cv is left at the library default).
# NOTE(review): the hyperparameters were selected on these same rows, so the
# report below is an optimistic estimate of generalisation.
val_pred = cross_val_predict(
    estimator=xgb.XGBClassifier(**best.get_params()),
    X=train_tfidf_feat,
    y=train_data['label'],
)

# Per-class precision / recall / F1 on the out-of-fold predictions.
print(classification_report(train_data['label'], val_pred, digits=3))

              precision    recall  f1-score   support

           0      0.995     0.997     0.996     11836
           1      0.985     0.971     0.978      2164

    accuracy                          0.993     14000
   macro avg      0.990     0.984     0.987     14000
weighted avg      0.993     0.993     0.993     14000



In [11]:
# Train a new XGBoost classifier with the selected settings on the full
# training matrix, then label the test rows.
m = xgb.XGBClassifier(**best.get_params()).fit(
    train_tfidf_feat,
    train_data['label'],
)
test_pred = m.predict(test_tfidf_feat)

# Writes the predictions into test_data in place (adds or overwrites 'label')
# and exports the name/label pairs without the index column.
test_data['label'] = test_pred
test_data[['name', 'label']].to_csv(
    'xgb.csv',
    index=None,
)