# Reddit 댓글 규칙 위반 여부 분류

## 1. Import Library

In [1]:
import re
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd
import torch
from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from datasets import Dataset

from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from sklearn.model_selection import train_test_split

## 2. Load Data

In [2]:
# 데이터 로드
train_df = pd.read_csv('../data/jigsaw_agile/train.csv')
test_df = pd.read_csv('../data/jigsaw_agile/test.csv')

# 데이터 크기 확인
print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

Train shape: (2029, 9)
Test shape: (10, 8)


In [3]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2029 entries, 0 to 2028
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   row_id              2029 non-null   int64 
 1   body                2029 non-null   object
 2   rule                2029 non-null   object
 3   subreddit           2029 non-null   object
 4   positive_example_1  2029 non-null   object
 5   positive_example_2  2029 non-null   object
 6   negative_example_1  2029 non-null   object
 7   negative_example_2  2029 non-null   object
 8   rule_violation      2029 non-null   int64 
dtypes: int64(2), object(7)
memory usage: 142.8+ KB


In [4]:
train_df.head()

Unnamed: 0,row_id,body,rule,subreddit,positive_example_1,positive_example_2,negative_example_1,negative_example_2,rule_violation
0,0,Banks don't want you to know this! Click here ...,"No Advertising: Spam, referral links, unsolici...",Futurology,If you could tell your younger self something ...,hunt for lady for jack off in neighbourhood ht...,Watch Golden Globe Awards 2017 Live Online in ...,"DOUBLE CEE x BANDS EPPS - ""BIRDS""\n\nDOWNLOAD/...",0
1,1,SD Stream [ ENG Link 1] (http://www.sportsstre...,"No Advertising: Spam, referral links, unsolici...",soccerstreams,[I wanna kiss you all over! Stunning!](http://...,LOLGA.COM is One of the First Professional Onl...,#Rapper \n🚨Straight Outta Cross Keys SC 🚨YouTu...,[15 Amazing Hidden Features Of Google Search Y...,0
2,2,Lol. Try appealing the ban and say you won't d...,No legal advice: Do not offer or request legal...,pcmasterrace,Don't break up with him or call the cops. If ...,It'll be dismissed: https://en.wikipedia.org/w...,Where is there a site that still works where y...,Because this statement of his is true. It isn'...,1
3,3,she will come your home open her legs with an...,"No Advertising: Spam, referral links, unsolici...",sex,Selling Tyrande codes for 3€ to paypal. PM. \n...,tight pussy watch for your cock get her at thi...,NSFW(obviously) http://spankbang.com/iy3u/vide...,Good News ::Download WhatsApp 2.16.230 APK for...,1
4,4,code free tyrande --->>> [Imgur](http://i.imgu...,"No Advertising: Spam, referral links, unsolici...",hearthstone,wow!! amazing reminds me of the old days.Well...,seek for lady for sex in around http://p77.pl/...,must be watch movie https://sites.google.com/s...,We're streaming Pokemon Veitnamese Crystal RIG...,1


In [5]:
# 클래스 불균형 확인
train_df["rule_violation"].value_counts()

rule_violation
1    1031
0     998
Name: count, dtype: int64

## 3. Data Preprocessing

In [6]:
def preprocess_text(text):
    # 소문자로 통일
    text = str(text).lower()

    # url이 포함된 경우 [URL] 토큰으로 치환
    text = re.sub(r'http\S+|www\S+', '[URL]', text)

    # 특수문자 제거
    text = re.sub(r'[^\w\s[\]]', '', text)

    # 공백 제거
    text = ' '.join(text.split())
    return text

In [7]:
body_processed = train_df["body"].apply(preprocess_text)
rule_processed = train_df["rule"].apply(preprocess_text)

# 타겟 컬럼 분리
X = body_processed + " [RULE] " + rule_processed
y = train_df["rule_violation"]

# 테스트 데이터에도 동일한 전처리 적용
body_processed_test = test_df["body"].apply(preprocess_text)
rule_processed_test = test_df["rule"].apply(preprocess_text)

X_test = body_processed_test + " [RULE] " + rule_processed_test

# 전처리된 샘플 데이터를 확인합니다.
print(f"Processed data sample: {X.iloc[0][:100]}")

Processed data sample: banks dont want you to know this click here to know more [RULE] no advertising spam referral links u


In [8]:
# 데이터 분할
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

## 모델 생성

In [9]:
def compare_models(model_names, X_test, y_test, sample_size=200):
    device = 0 if torch.cuda.is_available() else -1
    
    results = {}
    for model_name in model_names:
        print(f"Model: {model_name}")

        try:
            pipe = pipeline('text-classification', model=model_name, return_all_scores=True)

            predictions = []
            for text in X_test[:sample_size]:
                result = pipe(text, truncation=True)
                label = 1 if result[0]['label'] in ['TOXIC', 'LABEL_1'] else 0
                predictions.append(label)

            metrics = {
                "auccuracy": accuracy_score(y_test[:sample_size], predictions),
                "f1": f1_score(y_test[:sample_size], predictions)
            }
            results[model_name] = metrics

        except Exception as e:
            print(f"Error with {model_name}")

    return pd.DataFrame(results).T

In [10]:
model_names = [
    'unitary/toxic-bert',
    'distilroberta-base',
    'bert-base-uncased',
    'distilbert-base-uncased',
]

comparison = compare_models(model_names, X_val, y_val, sample_size=200)
print(comparison)

Model: unitary/toxic-bert


Device set to use mps:0


Error with unitary/toxic-bert
Model: distilroberta-base


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use mps:0


Error with distilroberta-base
Model: bert-base-uncased


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use mps:0


Error with bert-base-uncased
Model: distilbert-base-uncased


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use mps:0


Error with distilbert-base-uncased
Empty DataFrame
Columns: []
Index: []


## 토큰화

In [11]:
def finetune_model(model_name, X_train, y_train, X_val, y_val, epochs=3):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=2
    )

    tok_tr = tokenizer(list(X_train), truncation=True, max_length=256)
    tok_val = tokenizer(list(X_val), truncation=True, max_length=256)
    
    tok_tr["labels"] = list(map(int, y_train))
    tok_val["labels"] = list(map(int, y_val))
    
    ds_tr = Dataset.from_dict(tok_tr)
    ds_val = Dataset.from_dict(tok_val)

    collator = DataCollatorWithPadding(tokenizer=tok)

    args = TrainingArguments(
        output_dir=f'./results/{model_name.replace("/", "_")}',
        num_train_epochs=epochs,
        per_device_train_batch_size=16, 
        per_device_eval_batch_size=16,
        learning_rate=2e-5,
        evaluation_strategy='epoch', 
        save_strategy='epoch',   
        load_best_model_at_end=True,
        fp16=torch.cuda.is_available(),
        logging_steps=50,
        seed=42,
    )
    
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=ds_tr,
        eval_dataset=ds_val,  
        tokenizer=tokenizer,       
    )
    
    trainer.train()
    
    return trainer

In [12]:
for model_name in model_names:
    finetune_model(model_name, X_train, y_train, X_val, y_val)

RuntimeError: Error(s) in loading state_dict for Linear:
	size mismatch for bias: copying a param with shape torch.Size([6]) from checkpoint, the shape in current model is torch.Size([2]).