Stage 1: Use machine labeling on the unlabeled dataset. Among them, there are 70 samples labeled by four people, 100 samples labeled by two people, and 190 samples set aside as the final test set. Therefore, the number of samples to be labeled is 598, and after removing one duplicate, there are 597 samples.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

df=pd.read_csv('raw_with_170labels_version2.csv')
df.head()
duplicate_texts = df[df.duplicated(subset='text', keep=False)]

print(f"total {len(duplicate_texts)} duplicate_texts")
display(duplicate_texts.head(10))

total 0 duplicate_texts


Unnamed: 0,text,label


In [2]:
df.shape

(957, 2)

In [3]:
df_train = df.iloc[:170].copy()
df_test = df.iloc[170:].copy()
df_train.shape, df_test.shape

((170, 2), (787, 2))

In [4]:
df_train["label"].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0.0,91
1.0,55
2.0,24


Change parameter: vectorizer = TfidfVectorizer(max_features=-, ngram_range=(1, -)), choose which one is the best

In [None]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedKFold
import nltk
import spacy

nltk.download('stopwords')
from nltk.corpus import stopwords

nlp = spacy.load('en_core_web_sm')

df_train = df.iloc[:170].copy()
df_unlabeled = df.iloc[170:].copy()

def preprocess_text(text):
    text = text.lower()
    text = ''.join([char for char in text if char.isalpha() or char.isspace()])
    stop_words = set(stopwords.words('english'))
    doc = nlp(text)
    text = ' '.join([token.lemma_ for token in doc if token.text not in stop_words])

    return text
df_train['processed_text'] = df_train['text'].apply(preprocess_text)
df_unlabeled['processed_text'] = df_unlabeled['text'].apply(preprocess_text)
X = df_train['processed_text']
y = df_train['label']

vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 3))
X_tfidf = vectorizer.fit_transform(X)

smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_tfidf, y)

logreg = LogisticRegression(max_iter=1000)
svm = SVC(kernel='linear', random_state=42, probability=True)  # SVM with probability enabled

param_grid_logreg = {'C': [0.1, 1, 10], 'solver': ['liblinear', 'saga']}
grid_search_logreg = GridSearchCV(logreg, param_grid_logreg, cv=5, n_jobs=-1)
grid_search_logreg.fit(X_resampled, y_resampled)
best_logreg = grid_search_logreg.best_estimator_

param_grid_svm = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf'], 'gamma': ['scale', 'auto']}
grid_search_svm = GridSearchCV(svm, param_grid_svm, cv=5, n_jobs=-1)
grid_search_svm.fit(X_resampled, y_resampled)
best_svm = grid_search_svm.best_estimator_

voting_clf1 = VotingClassifier(estimators=[('logreg', best_logreg), ('svm', best_svm)], voting='soft')

voting_clf1.fit(X_resampled, y_resampled)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

val_preds, val_true = [], []

for train_idx, val_idx in skf.split(X_resampled, y_resampled):
    X_train, X_val = X_resampled[train_idx], X_resampled[val_idx]
    y_train, y_val = y_resampled.iloc[train_idx], y_resampled.iloc[val_idx]

    voting_clf1.fit(X_train, y_train)
    preds = voting_clf1.predict(X_val)

    val_preds.extend(preds)
    val_true.extend(y_val)

print(classification_report(val_true, val_preds, digits=4))

X_unlabeled_tfidf = vectorizer.transform(df_unlabeled['processed_text'])

probs = voting_clf1.predict_proba(X_unlabeled_tfidf)  # shape: [n_samples, n_classes]

confidences = probs.max(axis=1)

preds = voting_clf1.predict(X_unlabeled_tfidf)

df_unlabeled['pseudo_label'] = preds
df_unlabeled['confidence'] = confidences

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


              precision    recall  f1-score   support

         0.0     0.7500    0.9560    0.8406        91
         1.0     0.9420    0.7143    0.8125        91
         2.0     1.0000    0.9670    0.9832        91

    accuracy                         0.8791       273
   macro avg     0.8973    0.8791    0.8788       273
weighted avg     0.8973    0.8791    0.8788       273



In [25]:
high_conf_df = df_unlabeled[df_unlabeled['confidence'] >= 0.7]
low_conf_df = df_unlabeled[df_unlabeled['confidence'] <= 0.5]
print(f"\nNumber of pseudo-labeled samples with confidence greater than 0.7: {len(high_conf_df)} / {len(df_unlabeled)}")
print(f"\nNumber of pseudo-labeled samples with confidence less than or equal to 0.5: {len(low_conf_df)} / {len(df_unlabeled)}")


Number of pseudo-labeled samples with confidence greater than 0.7: 168 / 787

Number of pseudo-labeled samples with confidence less than or equal to 0.5: 222 / 787


In [None]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedKFold
import nltk
import spacy

nltk.download('stopwords')
from nltk.corpus import stopwords

nlp = spacy.load('en_core_web_sm')

df_train = df.iloc[:170].copy()
df_unlabeled = df.iloc[170:].copy()

def preprocess_text(text):
    text = text.lower()
    text = ''.join([char for char in text if char.isalpha() or char.isspace()])
    stop_words = set(stopwords.words('english'))
    doc = nlp(text)
    text = ' '.join([token.lemma_ for token in doc if token.text not in stop_words])

    return text

df_train['processed_text'] = df_train['text'].apply(preprocess_text)
df_unlabeled['processed_text'] = df_unlabeled['text'].apply(preprocess_text)

X = df_train['processed_text']
y = df_train['label']

vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))  
X_tfidf = vectorizer.fit_transform(X)

smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_tfidf, y)

logreg = LogisticRegression(max_iter=1000)
svm = SVC(kernel='linear', random_state=42, probability=True)  # SVM with probability enabled

param_grid_logreg = {'C': [0.1, 1, 10], 'solver': ['liblinear', 'saga']}
grid_search_logreg = GridSearchCV(logreg, param_grid_logreg, cv=5, n_jobs=-1)
grid_search_logreg.fit(X_resampled, y_resampled)
best_logreg = grid_search_logreg.best_estimator_

param_grid_svm = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf'], 'gamma': ['scale', 'auto']}
grid_search_svm = GridSearchCV(svm, param_grid_svm, cv=5, n_jobs=-1)
grid_search_svm.fit(X_resampled, y_resampled)
best_svm = grid_search_svm.best_estimator_

voting_clf1 = VotingClassifier(estimators=[('logreg', best_logreg), ('svm', best_svm)], voting='soft')

voting_clf1.fit(X_resampled, y_resampled)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

val_preds, val_true = [], []

for train_idx, val_idx in skf.split(X_resampled, y_resampled):
    X_train, X_val = X_resampled[train_idx], X_resampled[val_idx]
    y_train, y_val = y_resampled.iloc[train_idx], y_resampled.iloc[val_idx]

    voting_clf1.fit(X_train, y_train)
    preds = voting_clf1.predict(X_val)

    val_preds.extend(preds)
    val_true.extend(y_val)

print(classification_report(val_true, val_preds, digits=4))

X_unlabeled_tfidf = vectorizer.transform(df_unlabeled['processed_text'])

probs = voting_clf1.predict_proba(X_unlabeled_tfidf)  # shape: [n_samples, n_classes]

confidences = probs.max(axis=1)

preds = voting_clf1.predict(X_unlabeled_tfidf)

df_unlabeled['pseudo_label1'] = preds
df_unlabeled['confidence1'] = confidences

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


              precision    recall  f1-score   support

         0.0     0.7944    0.9341    0.8586        91
         1.0     0.9221    0.7802    0.8452        91
         2.0     1.0000    0.9780    0.9889        91

    accuracy                         0.8974       273
   macro avg     0.9055    0.8974    0.8976       273
weighted avg     0.9055    0.8974    0.8976       273



In [27]:
high_conf_df1 = df_unlabeled[df_unlabeled['confidence1'] >= 0.7]
low_conf_df1 = df_unlabeled[df_unlabeled['confidence1'] <= 0.5]
print(f"\nNumber of pseudo-labeled samples with confidence greater than 0.7: {len(high_conf_df1)} / {len(df_unlabeled)}")
print(f"\nNumber of pseudo-labeled samples with confidence less than or equal to 0.5: {len(low_conf_df1)} / {len(df_unlabeled)}")


Number of pseudo-labeled samples with confidence greater than 0.7: 246 / 787

Number of pseudo-labeled samples with confidence less than or equal to 0.5: 177 / 787


In [None]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedKFold
import nltk
import spacy


nltk.download('stopwords')
from nltk.corpus import stopwords


nlp = spacy.load('en_core_web_sm')


df_train = df.iloc[:170].copy()
df_unlabeled = df.iloc[170:].copy()


def preprocess_text(text):
    text = text.lower()
    text = ''.join([char for char in text if char.isalpha() or char.isspace()])
    stop_words = set(stopwords.words('english'))
    doc = nlp(text)
    text = ' '.join([token.lemma_ for token in doc if token.text not in stop_words])

    return text

df_train['processed_text'] = df_train['text'].apply(preprocess_text)
df_unlabeled['processed_text'] = df_unlabeled['text'].apply(preprocess_text)

X = df_train['processed_text']
y = df_train['label']

vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 1))
X_tfidf = vectorizer.fit_transform(X)

smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_tfidf, y)

logreg = LogisticRegression(max_iter=1000)
svm = SVC(kernel='linear', random_state=42, probability=True)  # SVM with probability enabled

param_grid_logreg = {'C': [0.1, 1, 10], 'solver': ['liblinear', 'saga']}
grid_search_logreg = GridSearchCV(logreg, param_grid_logreg, cv=5, n_jobs=-1)
grid_search_logreg.fit(X_resampled, y_resampled)
best_logreg = grid_search_logreg.best_estimator_
param_grid_svm = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf'], 'gamma': ['scale', 'auto']}
grid_search_svm = GridSearchCV(svm, param_grid_svm, cv=5, n_jobs=-1)
grid_search_svm.fit(X_resampled, y_resampled)
best_svm = grid_search_svm.best_estimator_
voting_clf1 = VotingClassifier(estimators=[('logreg', best_logreg), ('svm', best_svm)], voting='soft')

voting_clf1.fit(X_resampled, y_resampled)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

val_preds, val_true = [], []

for train_idx, val_idx in skf.split(X_resampled, y_resampled):
    X_train, X_val = X_resampled[train_idx], X_resampled[val_idx]
    y_train, y_val = y_resampled.iloc[train_idx], y_resampled.iloc[val_idx]

    voting_clf1.fit(X_train, y_train)
    preds = voting_clf1.predict(X_val)

    val_preds.extend(preds)
    val_true.extend(y_val)

print(classification_report(val_true, val_preds, digits=4))

X_unlabeled_tfidf = vectorizer.transform(df_unlabeled['processed_text'])

probs = voting_clf1.predict_proba(X_unlabeled_tfidf)  # shape: [n_samples, n_classes]

confidences = probs.max(axis=1)

preds = voting_clf1.predict(X_unlabeled_tfidf)

df_unlabeled['pseudo_label2'] = preds
df_unlabeled['confidence2'] = confidences

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


              precision    recall  f1-score   support

         0.0     0.7959    0.8571    0.8254        91
         1.0     0.8506    0.8132    0.8315        91
         2.0     1.0000    0.9670    0.9832        91

    accuracy                         0.8791       273
   macro avg     0.8822    0.8791    0.8800       273
weighted avg     0.8822    0.8791    0.8800       273



In [29]:
high_conf_df2 = df_unlabeled[df_unlabeled['confidence2'] >= 0.7]
low_conf_df2 = df_unlabeled[df_unlabeled['confidence2'] <= 0.5]
print(f"\nNumber of pseudo-labeled samples with confidence greater than 0.7: {len(high_conf_df2)} / {len(df_unlabeled)}")
print(f"\nNumber of pseudo-labeled samples with confidence less than or equal to 0.5: {len(low_conf_df2)} / {len(df_unlabeled)}")


Number of pseudo-labeled samples with confidence greater than 0.7: 332 / 787

Number of pseudo-labeled samples with confidence less than or equal to 0.5: 145 / 787


In [None]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedKFold
import nltk
import spacy

nltk.download('stopwords')
from nltk.corpus import stopwords

nlp = spacy.load('en_core_web_sm')

df_train = df.iloc[:170].copy()
df_unlabeled = df.iloc[170:].copy()

def preprocess_text(text):

    text = text.lower()

    text = ''.join([char for char in text if char.isalpha() or char.isspace()])

    stop_words = set(stopwords.words('english'))
    doc = nlp(text)
    text = ' '.join([token.lemma_ for token in doc if token.text not in stop_words])

    return text

df_train['processed_text'] = df_train['text'].apply(preprocess_text)
df_unlabeled['processed_text'] = df_unlabeled['text'].apply(preprocess_text)

X = df_train['processed_text']
y = df_train['label']

vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))  # 增加n-grams范围
X_tfidf = vectorizer.fit_transform(X)

smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_tfidf, y)

logreg = LogisticRegression(max_iter=1000)
svm = SVC(kernel='linear', random_state=42, probability=True)  # SVM with probability enabled

param_grid_logreg = {'C': [0.1, 1, 10], 'solver': ['liblinear', 'saga']}
grid_search_logreg = GridSearchCV(logreg, param_grid_logreg, cv=5, n_jobs=-1)
grid_search_logreg.fit(X_resampled, y_resampled)
best_logreg = grid_search_logreg.best_estimator_

param_grid_svm = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf'], 'gamma': ['scale', 'auto']}
grid_search_svm = GridSearchCV(svm, param_grid_svm, cv=5, n_jobs=-1)
grid_search_svm.fit(X_resampled, y_resampled)
best_svm = grid_search_svm.best_estimator_

voting_clf1 = VotingClassifier(estimators=[('logreg', best_logreg), ('svm', best_svm)], voting='soft')

voting_clf1.fit(X_resampled, y_resampled)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

val_preds, val_true = [], []

for train_idx, val_idx in skf.split(X_resampled, y_resampled):
    X_train, X_val = X_resampled[train_idx], X_resampled[val_idx]
    y_train, y_val = y_resampled.iloc[train_idx], y_resampled.iloc[val_idx]

    voting_clf1.fit(X_train, y_train)
    preds = voting_clf1.predict(X_val)

    val_preds.extend(preds)
    val_true.extend(y_val)

print(classification_report(val_true, val_preds, digits=4))

X_unlabeled_tfidf = vectorizer.transform(df_unlabeled['processed_text'])

probs = voting_clf1.predict_proba(X_unlabeled_tfidf)  # shape: [n_samples, n_classes]

confidences = probs.max(axis=1)

preds = voting_clf1.predict(X_unlabeled_tfidf)

df_unlabeled['pseudo_label3'] = preds
df_unlabeled['confidence3'] = confidences

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


              precision    recall  f1-score   support

         0.0     0.7925    0.9231    0.8528        91
         1.0     0.9103    0.7802    0.8402        91
         2.0     1.0000    0.9780    0.9889        91

    accuracy                         0.8938       273
   macro avg     0.9009    0.8938    0.8940       273
weighted avg     0.9009    0.8938    0.8940       273



In [31]:
high_conf_df2 = df_unlabeled[df_unlabeled['confidence3'] >= 0.7]
low_conf_df2 = df_unlabeled[df_unlabeled['confidence3'] <= 0.5]
print(f"\nNumber of pseudo-labeled samples with confidence greater than 0.7: {len(high_conf_df2)} / {len(df_unlabeled)}")
print(f"\nNumber of pseudo-labeled samples with confidence less than or equal to 0.5: {len(low_conf_df2)} / {len(df_unlabeled)}")


Number of pseudo-labeled samples with confidence greater than 0.7: 261 / 787

Number of pseudo-labeled samples with confidence less than or equal to 0.5: 153 / 787
