# Reddit 댓글 규칙 위반 여부 분류

## 1. Import Library

In [1]:
import os

In [2]:
SSD_PATH = '/Volumes/PortableSSD'
PROJECT_PATH = f'{SSD_PATH}/Projects/kaggle-project'
RESULTS_PATH = f'{SSD_PATH}/Projects/results'
MODELS_PATH = f'{SSD_PATH}/Projects/models'

In [3]:
os.makedirs(RESULTS_PATH, exist_ok=True)
os.makedirs(MODELS_PATH, exist_ok=True)

In [4]:
import re
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd
import torch
from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from datasets import Dataset

from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, average_precision_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

## 2. Load Data

In [6]:
# 데이터 로드
train_df = pd.read_csv('../data/jigsaw_agile/train.csv')
test_df = pd.read_csv('../data/jigsaw_agile/test.csv')

# 데이터 크기 확인
print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

Train shape: (2029, 9)
Test shape: (10, 8)


In [7]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2029 entries, 0 to 2028
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   row_id              2029 non-null   int64 
 1   body                2029 non-null   object
 2   rule                2029 non-null   object
 3   subreddit           2029 non-null   object
 4   positive_example_1  2029 non-null   object
 5   positive_example_2  2029 non-null   object
 6   negative_example_1  2029 non-null   object
 7   negative_example_2  2029 non-null   object
 8   rule_violation      2029 non-null   int64 
dtypes: int64(2), object(7)
memory usage: 142.8+ KB


In [8]:
train_df.head()

Unnamed: 0,row_id,body,rule,subreddit,positive_example_1,positive_example_2,negative_example_1,negative_example_2,rule_violation
0,0,Banks don't want you to know this! Click here ...,"No Advertising: Spam, referral links, unsolici...",Futurology,If you could tell your younger self something ...,hunt for lady for jack off in neighbourhood ht...,Watch Golden Globe Awards 2017 Live Online in ...,"DOUBLE CEE x BANDS EPPS - ""BIRDS""\n\nDOWNLOAD/...",0
1,1,SD Stream [ ENG Link 1] (http://www.sportsstre...,"No Advertising: Spam, referral links, unsolici...",soccerstreams,[I wanna kiss you all over! Stunning!](http://...,LOLGA.COM is One of the First Professional Onl...,#Rapper \n🚨Straight Outta Cross Keys SC 🚨YouTu...,[15 Amazing Hidden Features Of Google Search Y...,0
2,2,Lol. Try appealing the ban and say you won't d...,No legal advice: Do not offer or request legal...,pcmasterrace,Don't break up with him or call the cops. If ...,It'll be dismissed: https://en.wikipedia.org/w...,Where is there a site that still works where y...,Because this statement of his is true. It isn'...,1
3,3,she will come your home open her legs with an...,"No Advertising: Spam, referral links, unsolici...",sex,Selling Tyrande codes for 3€ to paypal. PM. \n...,tight pussy watch for your cock get her at thi...,NSFW(obviously) http://spankbang.com/iy3u/vide...,Good News ::Download WhatsApp 2.16.230 APK for...,1
4,4,code free tyrande --->>> [Imgur](http://i.imgu...,"No Advertising: Spam, referral links, unsolici...",hearthstone,wow!! amazing reminds me of the old days.Well...,seek for lady for sex in around http://p77.pl/...,must be watch movie https://sites.google.com/s...,We're streaming Pokemon Veitnamese Crystal RIG...,1


In [9]:
# 클래스 불균형 확인
train_df["rule_violation"].value_counts()

rule_violation
1    1031
0     998
Name: count, dtype: int64

## 3. Data Preprocessing

In [10]:
def preprocess_text(text):
    # 소문자로 통일
    text = str(text).lower()

    # url이 포함된 경우 [URL] 토큰으로 치환
    text = re.sub(r'http\S+|www\S+', '[URL]', text)

    # 특수문자 제거
    text = re.sub(r'[^\w\s[\]]', '', text)

    # 공백 제거
    text = ' '.join(text.split())
    return text

In [11]:
body_processed = train_df["body"].apply(preprocess_text)
rule_processed = train_df["rule"].apply(preprocess_text)

# 타겟 컬럼 분리
X = body_processed + " [RULE] " + rule_processed
y = train_df["rule_violation"]

# 테스트 데이터에도 동일한 전처리 적용
body_processed_test = test_df["body"].apply(preprocess_text)
rule_processed_test = test_df["rule"].apply(preprocess_text)

X_test = body_processed_test + " [RULE] " + rule_processed_test

# 전처리된 샘플 데이터를 확인합니다.
print(f"Processed data sample: {X.iloc[0][:100]}")

Processed data sample: banks dont want you to know this click here to know more [RULE] no advertising spam referral links u


In [12]:
# 데이터 분할
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [13]:
print(X)

0       banks dont want you to know this click here to...
1       sd stream [ eng link 1] [URL] [RULE] no advert...
2       lol try appealing the ban and say you wont do ...
3       she will come your home open her legs with and...
4       code free tyrande [imgur][URL] for you and you...
                              ...                        
2024    please edit your post so it is readable these ...
2025    yes and in a right to work state they can even...
2026    hd streams eng hd[ watch herepc mobile ][URL] ...
2027    no not when doing so obviously presents a safe...
2028    ca is an at fault state so they will not be ab...
Length: 2029, dtype: object


## 성능 평가

In [14]:
# TF-IDF 시행
tfidf = TfidfVectorizer(
    max_features=2000,
    min_df=2,
    max_df=0.8,
    ngram_range=(1,3),
    sublinear_tf=True,
    use_idf=True
)

X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf = tfidf.transform(X_val)

print(f'TF-IDF 벡터 shape:')
print(f'  Train: {X_train_tfidf.shape}')
print(f'  Test: {X_val_tfidf.shape}')

TF-IDF 벡터 shape:
  Train: (1623, 2000)
  Test: (406, 2000)


In [15]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

models_sklearn = {
    "Logistic Regression": LogisticRegression(max_iter=2000, random_state=42, n_jobs=-1),
    "LinearSVC": LinearSVC(class_weight="balanced"),
    "SGD-Log": SGDClassifier(loss="log_loss", class_weight="balanced", max_iter=2000, random_state=42, n_jobs=-1),
}

for name, sk_model in models_sklearn.items():
    sk_model.fit(X_train_tfidf, y_train)
    y_tr_pred = sk_model.predict(X_train_tfidf)
    y_val_pred = sk_model.predict(X_val_tfidf)

    train_accuracy = accuracy_score(y_train, y_tr_pred)
    test_accuracy = accuracy_score(y_val, y_val_pred)
    test_f1 = f1_score(y_val, y_val_pred)

    print(f"Model: {name}")
    print(f"Train_accuracy: {train_accuracy:.3f}")
    print(f"Val_accuracy: {test_accuracy:.3f}")
    print(f"Val_f1_score: {test_f1:.3f}")
    print()

Model: Logistic Regression
Train_accuracy: 0.848
Val_accuracy: 0.751
Val_f1_score: 0.767

Model: LinearSVC
Train_accuracy: 0.954
Val_accuracy: 0.754
Val_f1_score: 0.756

Model: SGD-Log
Train_accuracy: 0.942
Val_accuracy: 0.754
Val_f1_score: 0.760



## 토큰화

사전학습 모델에 적용시키기 위해 데이터를 토큰화하는 작업을 진행합니다.

In [16]:
def finetune_model(model_name, X_train, y_train, X_val, y_val, epochs=1):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=2,
        id2label={0:"not_violation", 1:"violation"},
        label2id={"not_violation":0, "violation":1}
    )

    tok_tr = tokenizer(list(X_train), truncation=True, max_length=128)
    tok_val = tokenizer(list(X_val), truncation=True, max_length=128)
    
    tok_tr["labels"] = y_train.tolist()
    tok_val["labels"] = y_val.tolist()
    
    ds_tr = Dataset.from_dict(tok_tr)
    ds_val = Dataset.from_dict(tok_val)

    collator = DataCollatorWithPadding(tokenizer=tokenizer)

    def metrics(p):
        logits, labels = p
        probs = torch.softmax(torch.tensor(logits), dim=-1)[:,1].numpy()
        preds = (probs >= 0.5).astype(int)
        return {
            "accuracy":  accuracy_score(labels, preds),
            "f1":        f1_score(labels, preds),
            "roc_auc":   roc_auc_score(labels, probs),
            "pr_auc":    average_precision_score(labels, probs),
        }

    args = TrainingArguments(
        output_dir=f'{RESULTS_PATH}/{model_name.replace("/", "_")}',
        num_train_epochs=epochs,
        per_device_train_batch_size=8, 
        per_device_eval_batch_size=16,
        gradient_accumulation_steps=2,
        learning_rate=3e-5,
        eval_strategy='epoch', 
        save_strategy='epoch',   
        load_best_model_at_end=True,
        metric_for_best_model='f1',
        weight_decay=0.01,
        fp16=False,
        logging_steps=100,
        seed=42,
        use_cpu=True,
        no_cuda=True,
        report_to='none',
        disable_tqdm=False,
    )
    
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=ds_tr,
        eval_dataset=ds_val,  
        tokenizer=tokenizer,
        data_collator=collator,
        compute_metrics=metrics
    )
    
    trainer.train()

    pred = trainer.predict(ds_val)
    probs = torch.softmax(torch.tensor(pred.predictions), dim=-1)[:,1].numpy()
    preds = (probs >= 0.5).astype(int)
    
    return trainer, tokenizer, ds_val, probs, preds

In [17]:
# 사전학습 모델로 성능을 테스트합니다.
transformer_model_ids = [
    "distilroberta-base",
    "bert-base-uncased",
    "distilbert-base-uncased",
]

results = {}

for tf_model in transformer_model_ids:
    trainer, tokenizer, ds_val, probs, preds = finetune_model(
    tf_model, X_train, y_train, X_val, y_val, epochs=3
)
    results[tf_model] = {
        'f1': f1_score(y_val, preds),
        'accuracy': accuracy_score(y_val, preds),
        'roc_auc': roc_auc_score(y_val, preds)
    }
    
    print(f"Model: {tf_model}")
    print(f" - F1: {results[tf_model]['f1']:.4f}")
    print(f" - Accuracy: {results[tf_model]['accuracy']:.4f}")
    print(f" - ROC-AUC: {results[tf_model]['roc_auc']:.4f}")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,Roc Auc,Pr Auc
1,0.6363,0.657542,0.674877,0.597561,0.820485,0.818123
2,0.4578,0.426182,0.802956,0.811321,0.888932,0.896025
3,0.3567,0.446559,0.807882,0.814286,0.889102,0.894935


Model: distilroberta-base
 - F1: 0.8143
 - Accuracy: 0.8079
 - ROC-AUC: 0.8075


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,Roc Auc,Pr Auc
1,0.6543,0.566654,0.694581,0.740586,0.806408,0.809173
2,0.5235,0.478174,0.778325,0.801762,0.866238,0.870502
3,0.3804,0.450988,0.800493,0.796992,0.875534,0.881406


Model: bert-base-uncased
 - F1: 0.8018
 - Accuracy: 0.7783
 - ROC-AUC: 0.7767


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,Roc Auc,Pr Auc
1,0.6411,0.546046,0.719212,0.761506,0.803835,0.795592
2,0.4724,0.481119,0.761084,0.758105,0.853131,0.844643
3,0.3394,0.497732,0.756158,0.759124,0.854709,0.850061


Model: distilbert-base-uncased
 - F1: 0.7615
 - Accuracy: 0.7192
 - ROC-AUC: 0.7167


## 테스트 데이터셋 확률 예측

## 제출