## 准备数据

In [1]:
import os
import urllib.error
import urllib.request as request
import zipfile
from pathlib import Path

# Primary download location of the SMS Spam Collection archive.
url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
# Local file the downloaded archive is written to.
zip_path = "sms_spam_collection.zip"
# Directory the archive is extracted into.
extracted_path = "sms_spam_collection"
# Final location of the data file once it has been renamed with a .tsv suffix.
data_file_path = Path(extracted_path, "SMSSpamCollection.tsv")

def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):
    """Fetch the SMS spam archive, unpack it and rename the data file.

    When ``data_file_path`` already exists nothing is downloaded; a notice is
    printed instead.

    Args:
        url: Location of the zip archive, opened with ``urllib.request.urlopen``.
        zip_path: File the downloaded archive bytes are written to.
        extracted_path: Directory the archive is extracted into.
        data_file_path: ``pathlib.Path`` that the extracted ``SMSSpamCollection``
            file is renamed to.

    Returns:
        None.
    """
    if data_file_path.exists():
        print(f"{data_file_path} already exists. Skipping download and extraction.")
    else:
        # Download: copy the whole response body into the local zip file.
        with request.urlopen(url) as remote, open(zip_path, "wb") as archive_file:
            archive_file.write(remote.read())

        # Extract every archive member into the target directory.
        with zipfile.ZipFile(zip_path, "r") as archive:
            archive.extractall(extracted_path)

        # The archive member has no extension; move it to the requested path.
        os.rename(Path(extracted_path, "SMSSpamCollection"), data_file_path)
        print(f"File downloaded and saved as {data_file_path}")

# Try the primary URL first and fall back to the backup mirror on network errors.
# The exception classes are reached through the top-level `urllib` name, which is
# only bound by `import urllib.error`; the aliased `import urllib.request as request`
# does not bind `urllib`, so without that import a failed download would surface
# as a NameError and the backup URL would never be tried.
try:
    download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)
except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:
    print(f"Primary URL failed: {e}. Trying backup URL...")
    url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip"
    download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)

sms_spam_collection\SMSSpamCollection.tsv already exists. Skipping download and extraction.


In [2]:
import pandas as pd
# Read the tab-separated file, which has no header row, into two named columns.
df = pd.read_csv(
    data_file_path,
    sep="\t",
    header=None,
    names=["Label", "Text"],
)
df

Unnamed: 0,Label,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [3]:
# Count how many messages carry each label to see how balanced the classes are.
print(df["Label"].value_counts())

ham     4825
spam     747
Name: Label, dtype: int64


In [4]:
def create_balanced_dataset(df):
    """Downsample the ``ham`` rows so both labels occur equally often.

    Args:
        df: DataFrame with a ``Label`` column holding ``'ham'`` / ``'spam'``.

    Returns:
        A new DataFrame with the sampled ``ham`` rows first, followed by all
        ``spam`` rows. Original index labels are kept.
    """
    spam_rows = df[df["Label"] == "spam"]
    ham_rows = df[df["Label"] == "ham"]

    # Fixed seed so the same ham messages are drawn on every run.
    sampled_ham = ham_rows.sample(len(spam_rows), random_state=123)

    return pd.concat([sampled_ham, spam_rows])

# Build the balanced frame and print its label counts.
balanced_df = create_balanced_dataset(df)
print(balanced_df["Label"].value_counts())

ham     747
spam    747
Name: Label, dtype: int64


In [5]:
# Encode the string labels as integers (ham -> 0, spam -> 1).
# The lookup falls back to the value it was given, so running this cell again on
# the already-encoded column leaves it unchanged. A plain dict `.map` would turn
# the 0/1 values into NaN on a second run because they are not keys of the dict.
label_to_id = {"ham": 0, "spam": 1}
balanced_df["Label"] = balanced_df["Label"].map(
    lambda label: label_to_id.get(label, label)
)
balanced_df

Unnamed: 0,Label,Text
4307,0,Awww dat is sweet! We can think of something t...
4138,0,Just got to &lt;#&gt;
4831,0,"The word ""Checkmate"" in chess comes from the P..."
4461,0,This is wishing you a great day. Moji told me ...
5440,0,Thank you. do you generally date the brothas?
...,...,...
5537,1,Want explicit SEX in 30 secs? Ring 02073162414...
5540,1,ASKED 3MOBILE IF 0870 CHATLINES INCLU IN FREE ...
5547,1,Had your contract mobile 11 Mnths? Latest Moto...
5566,1,REMINDER FROM O2: To get 2.50 pounds free call...


In [6]:
def random_split(df, train_frac, validation_frac):
    """Shuffle ``df`` and cut it into train, validation and test parts.

    Args:
        df: DataFrame to split; it is not modified.
        train_frac: Fraction of rows assigned to the training part.
        validation_frac: Fraction of rows assigned to the validation part.

    Returns:
        Tuple ``(train_df, validation_df, test_df)``. The rows come from one
        shuffled copy whose index was reset, and the test part receives every
        row left after the first two cuts.
    """
    # Seeded shuffle of all rows, then renumber them 0..n-1.
    shuffled = df.sample(frac=1, random_state=123).reset_index(drop=True)

    # Both sizes are truncated towards zero.
    total_rows = len(shuffled)
    train_stop = int(total_rows * train_frac)
    validation_stop = train_stop + int(total_rows * validation_frac)

    return (
        shuffled.iloc[:train_stop],
        shuffled.iloc[train_stop:validation_stop],
        shuffled.iloc[validation_stop:],
    )

# 70% of the rows for training, 10% for validation; the remainder is the test split.
train_df, validation_df, test_df = random_split(balanced_df, 0.7, 0.1)

# Write each split to its own CSV file without the index column.
train_df.to_csv("train.csv", index=False)
validation_df.to_csv("validation.csv", index=False)
test_df.to_csv("test.csv", index=False)

In [7]:
import tiktoken

# Load the "gpt2" encoding from tiktoken.
tokenizer = tiktoken.get_encoding("gpt2")
# Print the token ids for the "<|endoftext|>" string, which is passed in
# `allowed_special` for this call. The recorded output is [50256], the same
# value SpamDataset uses as its default `pad_token_id`.
print(tokenizer.encode("<|endoftext|>",allowed_special={"<|endoftext|>"}))

[50256]


In [8]:
import torch
from torch.utils.data import Dataset

class SpamDataset(Dataset):
    """Dataset of tokenized, fixed-length texts read from a CSV file.

    The CSV is loaded with pandas. Its ``Text`` column is encoded with
    ``tokenizer.encode`` and its ``Label`` column supplies the targets. Every
    encoded text is padded with ``pad_token_id`` up to ``max_length``.
    """

    def __init__(self, csv_file, tokenizer, max_length=None, pad_token_id=50256):
        """Read ``csv_file``, encode its texts and bring them to one length.

        Args:
            csv_file: Path of the CSV file to read.
            tokenizer: Object whose ``encode(text)`` returns a list of token ids.
            max_length: Target sequence length. ``None`` means "use the longest
                encoded text"; otherwise longer sequences are truncated to it.
            pad_token_id: Token id appended to sequences shorter than
                ``max_length``.
        """
        self.data = pd.read_csv(csv_file)
        self.encoded_texts = list(map(tokenizer.encode, self.data["Text"]))

        if max_length is None:
            self.max_length = self._longest_encoded_length()
        else:
            self.max_length = max_length

        fixed_length = []
        for token_ids in self.encoded_texts:
            # Truncation only applies when the caller supplied a length.
            if max_length is not None:
                token_ids = token_ids[: self.max_length]
            missing = self.max_length - len(token_ids)
            fixed_length.append(token_ids + [pad_token_id] * missing)
        self.encoded_texts = fixed_length

    def __getitem__(self, index):
        """Return ``(token_ids, label)`` for row ``index`` as two long tensors."""
        row = self.data.iloc[index]
        token_ids = self.encoded_texts[index]
        return (
            torch.tensor(token_ids, dtype=torch.long),
            torch.tensor(row["Label"], dtype=torch.long),
        )

    def __len__(self):
        """Number of rows read from the CSV file."""
        return self.data.shape[0]

    def _longest_encoded_length(self):
        """Length of the longest entry in ``encoded_texts`` (0 when there are none)."""
        return max(map(len, self.encoded_texts), default=0)

In [9]:
# With max_length=None the dataset uses the length of its longest encoded text.
train_dataset = SpamDataset(
    csv_file="train.csv",
    tokenizer=tokenizer,
    max_length=None,
)

print(train_dataset.max_length)

120


In [10]:
# Validation and test reuse the training max_length: longer texts are truncated
# to it and shorter ones are padded up to it.
val_dataset = SpamDataset(
    csv_file="validation.csv",
    tokenizer=tokenizer,
    max_length=train_dataset.max_length,
)
test_dataset = SpamDataset(
    csv_file="test.csv",
    tokenizer=tokenizer,
    max_length=train_dataset.max_length,
)

In [11]:
from torch.utils.data import DataLoader

# Settings shared by all three loaders.
num_workers = 0
batch_size = 8

# Seed torch's global RNG before the loaders are created.
torch.manual_seed(123)

# Training batches are shuffled and an incomplete final batch is dropped.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,
    num_workers=num_workers,
)

# The evaluation loaders are not shuffled and keep a final partial batch.
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
    drop_last=False,
    num_workers=num_workers,
)

test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    drop_last=False,
    num_workers=num_workers,
)

In [12]:
print("Train loader")
# Walk through every training batch without doing any work; once the loop ends,
# `input_batch` and `target_batch` still refer to the last batch that was yielded.
for input_batch, target_batch in train_loader:
    pass

# Shapes of that last batch.
print('Input batch dimensions:', input_batch.shape)
print('Label batch dimensions:', target_batch.shape)

Train loader
Input batch dimensions: torch.Size([8, 120])
Label batch dimensions: torch.Size([8])


In [13]:
# Report how many batches each loader yields per pass.
print(f"{len(train_loader)} training batches")
print(f"{len(val_loader)} validation batches")
print(f"{len(test_loader)} test batches")

130 training batches
19 validation batches
38 test batches


In [14]:
pip install transformers -U

Looking in indexes: https://mirrors.aliyun.com/pypi/simple
Collecting tokenizers<0.21,>=0.20


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
chromadb 0.5.23 requires bcrypt>=4.0.1, which is not installed.
chromadb 0.5.23 requires opentelemetry-api>=1.2.0, which is not installed.
chromadb 0.5.23 requires tenacity>=8.2.3, which is not installed.



  Downloading https://mirrors.aliyun.com/pypi/packages/67/40/bd86347e7178a489476a922f004b396335d4f7ceab40ef01dbbf47dbae64/tokenizers-0.20.3-cp38-none-win_amd64.whl (2.4 MB)
Installing collected packages: tokenizers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.19.1
    Uninstalling tokenizers-0.19.1:
      Successfully uninstalled tokenizers-0.19.1
Successfully installed tokenizers-0.20.3


In [15]:
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification

# Model name (as listed on the Hugging Face Hub)
model_name = "gpt2"  # can be swapped for "gpt2-medium", etc.

# Load the tokenizer
# NOTE(review): this rebinds `tokenizer`, the name earlier cells bound to the
# tiktoken encoding and passed to SpamDataset; re-running those cells after this
# one would hand them this object instead.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# tokenizer.pad_token = tokenizer.eos_token  # set the padding token
# Load the classification model (a classification head is added automatically)
# model = GPT2ForSequenceClassification.from_pretrained(
#     model_name,
#     num_labels=2,  # number of classes (binary classification in this example)
#     pad_token_id=tokenizer.eos_token_id
# )


ImportError: tokenizers>=0.20,<0.21 is required for a normal functioning of this module, but found tokenizers==0.19.0.
Try: `pip install transformers -U` or `pip install -e '.[dev]'` if you're working with git main

In [None]:
pip install tokenizers==0.19.0

Looking in indexes: https://mirrors.aliyun.com/pypi/simple
Collecting tokenizers==0.19.0
  Downloading https://mirrors.aliyun.com/pypi/packages/4f/aa/62429287c247b3ca0c627f356505c31734e895662943ced4daba84a1bfbf/tokenizers-0.19.0-cp38-none-win_amd64.whl (2.2 MB)
Installing collected packages: tokenizers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.20.3
    Uninstalling tokenizers-0.20.3:
      Successfully uninstalled tokenizers-0.20.3
Successfully installed tokenizers-0.19.1
Note: you may need to restart the kernel to use updated packages.


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
chromadb 0.5.23 requires bcrypt>=4.0.1, which is not installed.
chromadb 0.5.23 requires opentelemetry-api>=1.2.0, which is not installed.
chromadb 0.5.23 requires tenacity>=8.2.3, which is not installed.
transformers 4.46.3 requires tokenizers<0.21,>=0.20, but you have tokenizers 0.19.0 which is incompatible.
