In [126]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

In [127]:
# NLTK에서 불용어 리스트 다운로드
nltk.download('stopwords')
nltk.download('punkt')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

데이터 확인하기

In [128]:
df = pd.read_csv('/content/spam.csv')
df

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


불용어 처리, 특수 문자 제거 등 전처리 포함하여 tokenizing하기

In [129]:
#불용어 제거
stop_words = set(stopwords.words('english'))

# 문장 분리
def split_sentences(text):
    sentences = nltk.sent_tokenize(text)
    return sentences

# 문장부호 제거
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))


In [130]:
df['v2'] = df['v2'].apply(lambda x: ' '.join([word.lower() for word in word_tokenize(x) if word.lower() not in stop_words]))
df['v2_sentences'] = df['v2'].apply(split_sentences)
df['v2_sentences'] = df['v2_sentences'].apply(lambda sentences: [remove_punctuation(sentence) for sentence in sentences])
df['v2'] = df['v2'].apply(remove_punctuation)

df.head()

Unnamed: 0,v1,v2,v2_sentences
0,ham,go jurong point crazy available bugis n grea...,[go jurong point crazy available bugis n gre...
1,ham,ok lar joking wif u oni,[ok lar joking wif u oni ]
2,spam,free entry 2 wkly comp win fa cup final tkts 2...,[free entry 2 wkly comp win fa cup final tkts ...
3,ham,u dun say early hor u c already say,[u dun say early hor u c already say ]
4,ham,nah nt think goes usf lives around though,[nah nt think goes usf lives around though]


In [131]:
#토큰화
def tokenize_text(text):
    tokens = nltk.word_tokenize(text)
    return tokens

df['v2_tokens'] = df['v2'].apply(tokenize_text)
df.head()

Unnamed: 0,v1,v2,v2_sentences,v2_tokens
0,ham,go jurong point crazy available bugis n grea...,[go jurong point crazy available bugis n gre...,"[go, jurong, point, crazy, available, bugis, n..."
1,ham,ok lar joking wif u oni,[ok lar joking wif u oni ],"[ok, lar, joking, wif, u, oni]"
2,spam,free entry 2 wkly comp win fa cup final tkts 2...,[free entry 2 wkly comp win fa cup final tkts ...,"[free, entry, 2, wkly, comp, win, fa, cup, fin..."
3,ham,u dun say early hor u c already say,[u dun say early hor u c already say ],"[u, dun, say, early, hor, u, c, already, say]"
4,ham,nah nt think goes usf lives around though,[nah nt think goes usf lives around though],"[nah, nt, think, goes, usf, lives, around, tho..."


In [132]:
df['v1'] = df['v1'].apply(lambda x: 0 if x == 'ham' else 1)

df.head()

Unnamed: 0,v1,v2,v2_sentences,v2_tokens
0,0,go jurong point crazy available bugis n grea...,[go jurong point crazy available bugis n gre...,"[go, jurong, point, crazy, available, bugis, n..."
1,0,ok lar joking wif u oni,[ok lar joking wif u oni ],"[ok, lar, joking, wif, u, oni]"
2,1,free entry 2 wkly comp win fa cup final tkts 2...,[free entry 2 wkly comp win fa cup final tkts ...,"[free, entry, 2, wkly, comp, win, fa, cup, fin..."
3,0,u dun say early hor u c already say,[u dun say early hor u c already say ],"[u, dun, say, early, hor, u, c, already, say]"
4,0,nah nt think goes usf lives around though,[nah nt think goes usf lives around though],"[nah, nt, think, goes, usf, lives, around, tho..."


One-hot encoding, Word2Vec, CBOW, Skip-gram, GloVe 등의 방법으로 임베딩하기

In [135]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import DataLoader, TensorDataset
import torch
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, fbeta_score
from sklearn.utils.class_weight import compute_class_weight
import torch.nn as nn
import torch.optim as optim
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
import torch.nn.functional as F
import matplotlib.pyplot as plt
from transformers import BertTokenizer


# BERT 토크나이저 로드
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# 모델 정의
class CNNBiLSTM(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, output_dim, dropout):
        super(CNNBiLSTM, self).__init__()

        # CNN layer
        self.conv = nn.Conv1d(in_channels=1, out_channels=100, kernel_size=3)
        self.pool = nn.MaxPool1d(kernel_size=2)

        # BiLSTM layer
        self.lstm = nn.LSTM(100, hidden_dim, bidirectional=True, batch_first=True)

        # Fully connected layer
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

        # Dropout layer
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # CNN layer
        x = x.unsqueeze(1)
        x = self.conv(x)
        x = F.relu(x)
        x = self.pool(x)

        # LSTM layer
        x = x.permute(0, 2, 1)
        x, _ = self.lstm(x)

        # Global max pooling
        x = F.adaptive_max_pool1d(x.permute(0, 2, 1), 1).squeeze(2)

        # Fully connected layer
        x = self.fc(x)

        return x

corpus = " ".join([" ".join(tokens) for tokens in df['v2_tokens'].values])
tokenized_corpus = [word_tokenize(sentence.lower()) for sentence in nltk.sent_tokenize(corpus)]
word2vec_model = Word2Vec(sentences=tokenized_corpus, vector_size=100, window=5, min_count=1, workers=4)

# 텍스트를 숫자로 변환 (Word2Vec 임베딩을 사용)
def text_to_tensor(text, word2vec_model):
    word_vectors = [word2vec_model.wv[word] for word in text if word in word2vec_model.wv]

    if word_vectors:
        return torch.tensor(np.mean(word_vectors, axis=0))
    else:
        return torch.randn(100)

# 데이터셋 생성
def create_dataset(data, word2vec_model):
    input_tensors = []
    labels = []

    for index, row in tqdm(data.iterrows(), total=len(data)):
        combined_text = f"{row['v2_sentences']} {row['v2_tokens']}"
        label = row["v1"]

        input_tensor = text_to_tensor(combined_text, word2vec_model)
        input_tensors.append(input_tensor)
        labels.append(label)

    input_tensors = torch.stack(input_tensors)
    labels = torch.tensor(labels)

    return TensorDataset(input_tensors, labels)

# 클래스 가중치 계산 함수
def compute_class_weights(labels):
    class_weights = compute_class_weight('balanced', classes=np.unique(labels), y=labels)
    return torch.tensor(class_weights, dtype=torch.float32)

# Define cross-validation
num_folds = 3
skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)

# Store metrics for each fold
fold_accuracies = []
fold_precisions = []
fold_recalls = []
fold_f1_scores = []
fold_f2_scores = []

# Hyperparameters
embedding_dim = 100
hidden_dim = 64
output_dim = 2
dropout = 0.5
lr = 0.1

# Epoch values for hyperparameter tuning
epoch_values = [3]
validation_accuracies = []

# Threshold 조정 및 클래스 가중치 조정
threshold = 0.6  # Threshold 조정 (조절 가능)
class_weight_factor = 2  # 클래스 가중치 조정 (조절 가능)

# Iterate over different epoch values
for num_epochs in epoch_values:
    print(f"Number of Epochs: {num_epochs}")

    # Iterate over folds
    for fold, (train_index, val_index) in enumerate(skf.split(df['v2_tokens'], df['v1'])):
        print(f'Fold {fold + 1}/{num_folds}')

        train_df = df.iloc[train_index]
        val_df = df.iloc[val_index]

        train_dataset = create_dataset(train_df, word2vec_model)
        val_dataset = create_dataset(val_df, word2vec_model)

        batch_size = 8
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

        model = CNNBiLSTM(embedding_dim, hidden_dim, output_dim, dropout)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(device)

        class_weights = compute_class_weights(train_df['v1']).to(device)

        criterion = nn.CrossEntropyLoss(weight=class_weights)
        optimizer = optim.Adam(model.parameters(), lr=lr)

        for epoch in range(num_epochs):
            model.train()
            total_loss = 0
            for inputs, labels in tqdm(train_loader):
                inputs, labels = inputs.to(device), labels.to(device)
                optimizer.zero_grad()
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
                total_loss += loss.item()

            average_loss = total_loss / len(train_loader)
            print(f"Epoch {epoch + 1}, Average Loss: {average_loss}")

        model.eval()
        val_predictions = []
        val_true_labels = []

        with torch.no_grad():
            for inputs, labels in tqdm(val_loader):
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                probabilities = F.softmax(outputs, dim=1)[:, 1]  # Positive 클래스의 확률
                predicted_labels = (probabilities > threshold).long()  # Threshold 조정
                val_predictions.extend(predicted_labels.cpu().numpy())
                val_true_labels.extend(labels.cpu().numpy())

        val_accuracy = accuracy_score(val_true_labels, val_predictions)
        val_precision = precision_score(val_true_labels, val_predictions)
        val_recall = recall_score(val_true_labels, val_predictions)

        fold_accuracies.append(val_accuracy)

        print(f"Validation Accuracy: {val_accuracy}")

    average_accuracy = sum(fold_accuracies) / num_folds

    validation_accuracies.append(average_accuracy)



Number of Epochs: 3
Fold 1/3


100%|██████████| 3714/3714 [00:02<00:00, 1304.58it/s]
100%|██████████| 1858/1858 [00:01<00:00, 1313.57it/s]
100%|██████████| 465/465 [00:12<00:00, 38.02it/s]


Epoch 1, Average Loss: 0.8590849813320474


100%|██████████| 465/465 [00:10<00:00, 43.21it/s]


Epoch 2, Average Loss: 0.829824242812972


100%|██████████| 465/465 [00:11<00:00, 42.17it/s]


Epoch 3, Average Loss: 0.8248052030962001


100%|██████████| 233/233 [00:01<00:00, 192.18it/s]
  _warn_prf(average, modifier, msg_start, len(result))


Validation Accuracy: 0.8659849300322928
Fold 2/3


100%|██████████| 3715/3715 [00:01<00:00, 2327.25it/s]
100%|██████████| 1857/1857 [00:00<00:00, 2341.13it/s]
100%|██████████| 465/465 [01:09<00:00,  6.64it/s]


Epoch 1, Average Loss: 1.0449012789200522


100%|██████████| 465/465 [01:02<00:00,  7.41it/s]


Epoch 2, Average Loss: 1.021842248297061


100%|██████████| 465/465 [00:55<00:00,  8.39it/s]


Epoch 3, Average Loss: 1.1095260021016402


100%|██████████| 233/233 [00:04<00:00, 52.25it/s]
  _warn_prf(average, modifier, msg_start, len(result))


Validation Accuracy: 0.8659127625201939
Fold 3/3


100%|██████████| 3715/3715 [00:01<00:00, 2133.80it/s]
100%|██████████| 1857/1857 [00:00<00:00, 2352.58it/s]
100%|██████████| 465/465 [00:35<00:00, 13.11it/s]


Epoch 1, Average Loss: 1.3012294703064786


100%|██████████| 465/465 [00:43<00:00, 10.77it/s]


Epoch 2, Average Loss: 1.167793506354798


100%|██████████| 465/465 [00:42<00:00, 10.82it/s]


Epoch 3, Average Loss: 1.3450192503150433


100%|██████████| 233/233 [00:01<00:00, 132.05it/s]
  _warn_prf(average, modifier, msg_start, len(result))


Validation Accuracy: 0.8659127625201939
