In [34]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        os.path.join(dirname, filename)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [35]:
!cd /kaggle/input/word2vec-nlp-tutorial
!ls

In [36]:
import zipfile
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer

# 加载压缩数据
def load_zipped_data(zip_path, file_name):
    with zipfile.ZipFile(zip_path) as z:
        with z.open(file_name) as f:
            return pd.read_csv(f, sep='\t', encoding='utf-8')

train_data = load_zipped_data("/kaggle/input/word2vec-nlp-tutorial/labeledTrainData.tsv.zip", "labeledTrainData.tsv")
test_data = load_zipped_data("/kaggle/input/word2vec-nlp-tutorial/testData.tsv.zip", "testData.tsv")

print(f"Shap of train: {train_data.shape}\nexample:\n{train_data.head(2)}")
print(f"\nShap of test: {test_data.shape}\nexample:\n{test_data.head(2)}")

train_data['review'] = train_data['review'].str.replace('<br />', ' ')
test_data['review'] = test_data['review'].str.replace('<br />', ' ')

train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_data['review'].values,
    train_data['sentiment'].values,
    test_size=0.2,
    random_state=42,
    stratify=train_data['sentiment'].values 
)

Shap of train: (25000, 3)
example:
       id  sentiment                                             review
0  5814_8          1  With all this stuff going down at the moment w...
1  2381_9          1  \The Classic War of the Worlds\" by Timothy Hi...

Shap of test: (25000, 2)
example:
         id                                             review
0  12311_10  Naturally in a film who's main themes are of m...
1    8348_2  This movie is a disaster within a disaster fil...


In [37]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from datasets import load_dataset

## Model

In [None]:
class BERT_MLP(nn.Module):
    def __init__(self, bert_model, hidden_size=512, num_classes=2):
        super(BERT_MLP, self).__init__()
        self.bert = bert_model
        
        for param in self.bert.parameters():
            param.requires_grad = False 
            
        for layer in self.bert.encoder.layer[-4:]:
            for param in layer.parameters():
                param.requires_grad = True
        
        self.dropout = nn.Dropout(0.1)
        self.fc1 = nn.Linear(768, hidden_size)
        self.fc2 = nn.Linear(hidden_size, num_classes)
        self.relu = nn.ReLU()

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden = outputs.last_hidden_state  # (batch, seq_len, 768)
        pooled = torch.mean(last_hidden, dim=1)  # (batch, 768)
        
        x = self.fc1(pooled)
        x = self.relu(x)
        x = self.dropout(x) 
        x = self.fc2(x)
        return x

In [39]:
class ReviewDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [40]:
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_data['review'].values, train_data['sentiment'].values, test_size=0.2, random_state=42
)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [41]:
bert_model = BertModel.from_pretrained('bert-base-uncased')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = BERT_MLP(bert_model).to(device)

max_len = 256  
optimizer = AdamW([
    {'params': model.bert.parameters(), 'lr': 2e-5},  
    {'params': model.fc1.parameters(), 'lr': 1e-4},   
    {'params': model.fc2.parameters(), 'lr': 1e-4}
])
train_dataset = ReviewDataset(train_texts, train_labels, tokenizer, max_len)
val_dataset = ReviewDataset(val_texts, val_labels, tokenizer, max_len)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64)


In [None]:

from tqdm import tqdm

def train_epoch(model, data_loader, optimizer, criterion):
    model = model.train()
    losses = []
    correct_predictions = 0

    loop = tqdm(data_loader, desc="Training", leave=False)

    for batch in loop:
        optimizer.zero_grad()
        
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
       

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        losses.append(loss.item())
        
        _, preds = torch.max(outputs, dim=1)
        correct_predictions += torch.sum(preds == labels)
        
        loss.backward()
        optimizer.step()
        loop.set_postfix(loss=loss.item())

    return losses, correct_predictions.double() / len(data_loader.dataset), sum(losses) / len(losses)

def eval_model(model, data_loader, criterion):
    model = model.eval()
    losses = []
    correct_predictions = 0

    loop = tqdm(data_loader, desc="Evaluating", leave=False)

    with torch.no_grad():
        for batch in loop:
            input_ids = batch['input_ids'].squeeze(1).to(device)
            attention_mask = batch['attention_mask'].squeeze(1).to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(outputs, labels)
            losses.append(loss.item())
            
            # 预测
            _, preds = torch.max(outputs, dim=1)
            correct_predictions += torch.sum(preds == labels)
            loop.set_postfix(loss=loss.item())

    return losses, correct_predictions.double() / len(data_loader.dataset), sum(losses) / len(losses)

In [43]:



criterion = nn.CrossEntropyLoss()

num_epochs = 1


for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    
    train_losses, train_acc, train_loss = train_epoch(model, train_loader, optimizer, criterion)
    
    val_losses, val_acc, val_loss = eval_model(model, val_loader, criterion)

Epoch 1/1


                                                                        

In [None]:

model.eval()



batch_size = 128
all_preds = []

with torch.no_grad():
    for i in tqdm(range(0, len(test_data), batch_size)):
        batch_reviews = test_data['review'].iloc[i:i+batch_size].tolist()
        
        # Tokenize
        encoded = tokenizer(batch_reviews, padding=True, truncation=True, max_length=512, return_tensors="pt")
        input_ids = encoded['input_ids'].to(device)
        attention_mask = encoded['attention_mask'].to(device)
        
        # Forward
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs, dim=1).cpu().numpy()
        
        all_preds.extend(preds)

# 保存到csv
submission = pd.DataFrame({
    "id": test_data["id"],
    "sentiment": all_preds
})
submission.to_csv("/kaggle/working/mlp.csv", index=False, quoting=3)


100%|██████████| 196/196 [08:06<00:00,  2.48s/it]
