In [3]:
import pandas as pd
import jieba
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import VotingClassifier

In [4]:
# Load the JSON-lines corpus; each record supplies a 'text' and a 'label' field.
df = pd.read_json('data.json', lines=True)
text = df['text'].to_list()
label = df['label'].to_list()

# Segment every document with jieba and re-join the tokens with single spaces,
# so the downstream vectorizer can split on whitespace.
jieba_text = [" ".join(jieba.cut(document)) for document in tqdm(text)]

# Map the string class names onto integer ids.
label_to_idx = {"human": 0, "baichuan": 1, "qwen": 2}
idx = [label_to_idx[name] for name in label]

# 80/20 hold-out split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    jieba_text,
    idx,
    test_size=0.2,
    random_state=2023,
)

In [5]:
# Token n-gram window used for the TF-IDF features (1-grams up to 5-grams).
min_ngram, max_ngram = 1, 5

vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    ngram_range=(min_ngram, max_ngram),
)

# The vocabulary is fitted on the training split only and then reused for the
# test split.
# NOTE: X_train / X_test are rebound here from token strings to TF-IDF
# matrices, so re-running this cell requires re-running the split cell first.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)
print(X_train.shape, X_test.shape)

(16039, 5485023) (4010, 5485023)


In [6]:
def create_models(random_state=None):
    """Build an unfitted soft-voting ensemble of three SGD classifiers.

    Every member uses the ``modified_huber`` loss. The members differ in
    their seed, and additionally:

    * ``sgd1`` -- max_iter=5000, tol=1e-3.
    * ``sgd2`` -- same as ``sgd1`` plus ``class_weight="balanced"``.
    * ``sgd3`` -- max_iter=10000, tol=5e-4, ``early_stopping=True``.

    Args:
        random_state: Seed for ``sgd1``, or None. ``sgd2`` and ``sgd3`` are
            seeded with ``random_state + 1000`` and ``random_state + 2000``
            respectively; None is passed through unchanged to all three.

    Returns:
        A ``VotingClassifier`` with ``voting='soft'`` wrapping the three
        members under the names ``'sgd1'``, ``'sgd2'`` and ``'sgd3'``.
    """

    def seed_plus(offset):
        # Derive a distinct seed per member while keeping None as None.
        return None if random_state is None else random_state + offset

    members = [
        (
            'sgd1',
            SGDClassifier(
                loss="modified_huber",
                max_iter=5000,
                tol=1e-3,
                random_state=random_state,
            ),
        ),
        (
            'sgd2',
            SGDClassifier(
                loss="modified_huber",
                max_iter=5000,
                tol=1e-3,
                random_state=seed_plus(1000),
                class_weight="balanced",
            ),
        ),
        (
            'sgd3',
            SGDClassifier(
                loss="modified_huber",
                max_iter=10000,
                tol=5e-4,
                random_state=seed_plus(2000),
                early_stopping=True,
            ),
        ),
    ]

    # No per-member weights are supplied to the voting classifier.
    return VotingClassifier(estimators=members, voting='soft', verbose=0)

In [7]:
# Build the ensemble with a fixed base seed so the run is repeatable.
ensemble = create_models(random_state=0)

# Train on the TF-IDF training matrix, then label the held-out split.
ensemble.fit(X_train, y_train)
preds_test = ensemble.predict(X_test)

In [8]:
# Dump "<true label>\t<predicted label>" per test sample and count exact
# matches along the way.
right_num = 0
with open("result.csv", 'w', encoding='utf-8') as w:
    for expected, predicted in zip(y_test, preds_test):
        w.write(f"{expected}\t{predicted}\n")
        if expected == predicted:
            right_num += 1

# Hold-out accuracy: fraction of test samples predicted correctly.
print(right_num / len(preds_test))

0.8413965087281795
