In [1]:
from scipy.sparse import hstack, csr_matrix
from tqdm import tqdm
import numpy as np
import pandas as pd
import re

from collections import Counter
from collections import defaultdict
from scipy import sparse
from scipy.stats import uniform

import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import f1_score

sns.set_style("whitegrid")

# Part 1: Representing Text Data

In [2]:
df_train = pd.read_csv("data/train.csv")
df_dev = pd.read_csv("data/dev.csv")
df_test = pd.read_csv("data/test.csv")

## Task 1.1: Tokenization

In [3]:
stop_words = {
    'ourselves', 'hers', 'between', 'yourself', 'but', 'again', 'there', 'about', 
    'once', 'during', 'out', 'very', 'having', 'with', 'they', 'own', 'an', 'be', 
    'some', 'for', 'do', 'its', 'yours', 'such', 'into', 'of', 'most', 'itself', 
    'other', 'off', 'is', 's', 'am', 'or', 'who', 'as', 'from', 'him', 'each', 
    'the', 'themselves', 'until', 'below', 'are', 'we', 'these', 'your', 'his', 
    'through', 'don', 'nor', 'me', 'were', 'her', 'more', 'himself', 'this', 'down', 
    'should', 'our', 'their', 'while', 'above', 'both', 'up', 'to', 'ours', 'had', 
    'she', 'all', 'no', 'when', 'at', 'any', 'before', 'them', 'same', 'and', 
    'been', 'have', 'in', 'will', 'on', 'does', 'yourselves', 'then', 'that', 
    'because', 'what', 'over', 'why', 'so', 'can', 'did', 'not', 'now', 'under', 
    'he', 'you', 'herself', 'has', 'just', 'where', 'too', 'only', 'myself', 
    'which', 'those', 'i', 'after', 'few', 'whom', 't', 'being', 'if', 'theirs', 
    'my', 'against', 'a', 'by', 'doing', 'it', 'how', 'further', 'was', 'here', 'than',
    '<h>', '</h>'
}

stop_punctuations = '''!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~—'''

In [4]:
def tokenize(text):
    # Split the text by whitespace and return the list of tokens
    return text.split()

def better_tokenize(input_str, punctuations=stop_punctuations, stopwords=stop_words):
    # Remove line breaks and filter out <> enclosed text
    cleaned_str = re.sub(r'\n|\<[^<>]*\>', '', input_str)

    # Remove punctuations
    for punctuation in punctuations:
        cleaned_str = cleaned_str.replace(punctuation, '')

    # Convert to lowercase and split into words
    words = cleaned_str.lower().split()

    # Filter out stopwords
    filtered_words = filter(lambda word: word not in stopwords, words)

    return list(filtered_words)

In [5]:
input_str = "Your sample text goes here."
tokens = better_tokenize(input_str)
print(tokens)

['sample', 'text', 'goes']


## Task 1.2: Building the Term-Document Matrix

In [6]:
def build_term_freq_dict(dataframe, text_col, min_freq=2, tokenize_function=better_tokenize):
    term_counter = Counter()

    # Iterate through each document and update term frequencies
    for document in dataframe[text_col]:
        terms = tokenize_function(document)
        term_counter.update(terms)

    # Retain terms that meet the minimum frequency threshold
    filtered_terms = {term: freq for term, freq in term_counter.items() if freq >= min_freq}

    return filtered_terms

# Example usage
term_dictionary = build_term_freq_dict(df_train, 'text', 10)

In [7]:
# def createCompressedSparseRowMatrix(documents, vocab={}, frequencyFilter=term_dictionary):
#     rowStart = [0]
#     colIndices = []
#     values = []
# 
#     print("Constructing compressed sparse row (CSR) matrix...")
#     for document in tqdm(documents):
#         for word in document:
#             if word in frequencyFilter:  # Exclude terms with low frequency
#                 wordIndex = vocab[word] if word in vocab else len(vocab)
#                 vocab[word] = wordIndex
#                 colIndices.append(wordIndex)
#                 values.append(1)
#         rowStart.append(len(colIndices))
# 
#     csrMatrix = sparse.csr_matrix((values, colIndices, rowStart), dtype=int)
#     return csrMatrix, vocab

In [8]:
def createCompressedSparseRowMatrix(documents, vocab=None, frequencyFilter=term_dictionary):
    rowStart = [0]
    colIndices = []
    values = []
    new_vocab = {}
    if vocab is None:
        print("Constructing compressed sparse row (CSR) matrix...")
        for document in tqdm(documents):
            for word in document:
                if word in frequencyFilter:  # Exclude terms with low frequency
                    wordIndex = new_vocab[word] if word in new_vocab else len(new_vocab)
                    new_vocab[word] = wordIndex
                    colIndices.append(wordIndex)
                    values.append(1)
            rowStart.append(len(colIndices))

        csrMatrix = sparse.csr_matrix((values, colIndices, rowStart), dtype=int)
    else:
        new_vocab = vocab.copy()
        print("Constructing compressed sparse row (CSR) matrix...")
        for document in tqdm(documents):
            for word in document:
                if word in frequencyFilter and word in new_vocab:  # 仅处理存在于词汇表中的单词
                    wordIndex = new_vocab[word]
                    colIndices.append(wordIndex)
                    values.append(1)
            rowStart.append(len(colIndices))
            
        csrMatrix = sparse.csr_matrix((values, colIndices, rowStart), shape=(len(documents), len(new_vocab)), dtype=int)
    return csrMatrix, new_vocab

In [9]:
# Prepare the document list
document_list = []
print("Creating document list for training...")
for idx, record in tqdm(df_train.iterrows(), total=df_train.shape[0]):
    document_list.append(better_tokenize(record["text"]))

# Generate the CSR matrix
training_matrix, training_vocab = createCompressedSparseRowMatrix(document_list, vocab=None, frequencyFilter=term_dictionary)

# Add a bias column to the matrix
bias_column = np.ones((training_matrix.shape[0], 1))
training_matrix_with_bias = sparse.hstack([training_matrix, bias_column]).tocsr()

Creating document list for training...


100%|██████████| 7328/7328 [00:00<00:00, 18073.91it/s]


Constructing compressed sparse row (CSR) matrix...


100%|██████████| 7328/7328 [00:00<00:00, 215323.06it/s]


# Part 2: Logistic Regression in numpy

In [10]:
def sigmoid(X):
    # Using np.exp for element-wise exponential
    return np.divide(1, np.add(1, np.exp(np.negative(X))))

def log_likelihood(X, y, beta):
    sum = 0
    for i in range(X.shape[0]):  # 遍历样本
        x_i = X[i].toarray().flatten()
        sum += y[i] * np.dot(beta, x_i) - np.log10(1 + np.exp(np.dot(beta, x_i)))
    return sum

def compute_gradient(x, y, beta):
    # Using np.subtract and np.outer for vectorized operations
    prediction_error = np.subtract(sigmoid(np.dot(beta, x)), y)
    gradient = np.multiply(prediction_error, x)
    return gradient

In [11]:
def logistic_regression(X, y, learning_rate=5e-5, num_step=1000, is_plot=False):
    # Initialize variables
    vocab_size = X.shape[1]  # Size of the vocabulary
    beta_params = np.zeros(vocab_size)  # Initializing beta parameters
    prev_log_likelihood = log_likelihood(X, y, beta_params)  # Initial log-likelihood

    # Initialize plot-related lists if plotting is enabled
    steps, log_likelihoods = ([], []) if is_plot else (None, None)

    print("Running Logistic Regression...")
    for step in tqdm(range(num_step)):
        # Update beta using a gradient step
        current_index = step % vocab_size
        X_sample = X[current_index].toarray().ravel()
        y_sample = y[current_index]
        gradient = compute_gradient(X_sample, y_sample, beta_params)
        beta_params -= learning_rate * gradient

        # Record log-likelihood and step count at every 100th step
        if is_plot and step % 100 == 0:
            steps.append(step)
            log_likelihoods.append(log_likelihood(X, y, beta_params))

    # Return results based on whether plotting is enabled
    return (beta_params, steps, log_likelihoods) if is_plot else beta_params

In [12]:
def predict(text, model_coefficients, vocabulary=training_vocab):
    feature_vector = np.zeros(len(vocabulary) + 1)
    
    word_counts = Counter(better_tokenize(text))
    for word, count in word_counts.items():
        if word in vocabulary:
            feature_vector[vocabulary[word]] = count
    feature_vector[-1] = 1  # Add bias term

    prediction = sigmoid(np.dot(model_coefficients, feature_vector))
    return int(prediction >= 0.152)

## Task 2.1: Plot log-likelihood

In [13]:
# Mapping labels
label_map = {0: 0, 1: 1}
train_labels = np.array([label_map[label] for label in df_train['label']])

# Perform logistic regression
# regression_beta, iteration_steps, log_likelihoods = logistic_regression(
regression_beta = logistic_regression(
    X=training_matrix_with_bias,
    y=train_labels,
    num_step=10000,
    # is_plot=True
    is_plot=False
)


Running Logistic Regression...


100%|██████████| 10000/10000 [00:00<00:00, 14541.97it/s]


In [14]:
# # Plotting the relationship between steps and log-likelihood
# fig, plot_axis = plt.subplots(figsize=(10, 6))
# sns.lineplot(x=iteration_steps, y=log_likelihoods, ax=plot_axis)
# plot_axis.set_xlabel("Steps")
# plot_axis.set_ylabel("Log-likelihood (per 100 steps)")
# plot_axis.set_title("Loss v.s. Log-likelihood for Full Train Data")
# plt.show()

## Task 2.2: Make prediction on validation dataset

In [15]:
# train the beta
beta = logistic_regression(X = training_matrix_with_bias,
                           y = train_labels, 
                           learning_rate = 5e-5, 
                           num_step = 500000)

Running Logistic Regression...


100%|██████████| 500000/500000 [00:32<00:00, 15210.61it/s]


In [16]:
# make prediction
y_test = [label_map[p] for p in df_dev["label"]]
y_pred = []
print("Starting prediction on validation dataset...")
for i in tqdm(range(len(df_dev))):
    y_pred.append(predict(df_dev["text"][i], beta))

Starting prediction on validation dataset...


100%|██████████| 1047/1047 [00:00<00:00, 22029.00it/s]


In [17]:
f1_score(y_test, y_pred)

0.3626373626373627

## Task 2.3: Make prediction on test dataset

In [18]:
# Reverse label dictionary
label_reverse_map = {0: 0, 1: 1}

# Function to predict labels
def make_predictions(data):
    predictions = []
    print("Predicting labels for test dataset...")
    for index, row in tqdm(data.iterrows(), total=len(data)):
        text_content = str(row['text'])
        predictions.append(predict(text_content, beta))
    return predictions

# Generate predictions
predicted_labels = make_predictions(df_test)

# Create result dataframe and save to CSV
result_df = pd.DataFrame({
    'par_id': df_test['par_id'],
    'label': [label_reverse_map[label] for label in predicted_labels]
})
# result_df.to_csv("part2_result.csv", index=False)

Predicting labels for test dataset...


100%|██████████| 2094/2094 [00:00<00:00, 12673.39it/s]


# Part 3: Logistic Regression in PyTorch

In [19]:
def to_sparse_tensor(sparse_matrix: csr_matrix):
    sparse_matrix = sparse_matrix.tocoo()  # 将矩阵转换为COO格式
    indices = np.vstack((sparse_matrix.row, sparse_matrix.col))  # 获取坐标
    indices = torch.LongTensor(indices)  # 将坐标转换为torch张量
    values = torch.FloatTensor(sparse_matrix.data)  # 将值转换为torch张量
    shape = torch.Size(sparse_matrix.shape)  # 获取矩阵的形状
    return torch.sparse.FloatTensor(indices, values, shape)

In [20]:
class LogisticRegressionPyTorch(nn.Module):
    def __init__(self, input_features):
        super(LogisticRegressionPyTorch, self).__init__()
        self.linear = nn.Linear(input_features, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.linear(x)
        x = self.sigmoid(x)
        return x

In [21]:
dev_labels = np.array([label_map[label] for label in df_dev['label']])
# Prepare the document list
document_list = []
print("Creating document list for dev...")
for idx, record in tqdm(df_dev.iterrows(), total=df_dev.shape[0]):
    document_list.append(better_tokenize(record["text"]))

# Generate the CSR matrix
dev_matrix, dev_vocab = createCompressedSparseRowMatrix(document_list, vocab=training_vocab, frequencyFilter=term_dictionary)

# Add a bias column to the matrix
bias_column = np.ones((dev_matrix.shape[0], 1))
dev_matrix_with_bias = sparse.hstack([dev_matrix, bias_column]).tocsr()

# 使用to_sparse_tensor函数将稀疏矩阵转换为PyTorch的稀疏张量
X_train = to_sparse_tensor(training_matrix_with_bias)
X_dev = to_sparse_tensor(dev_matrix_with_bias)

# 接下来，将标签也转换为张量
y_train = torch.tensor(train_labels, dtype=torch.float32).unsqueeze(1)
y_dev = torch.tensor(dev_labels, dtype=torch.float32).unsqueeze(1)

Creating document list for dev...


100%|██████████| 1047/1047 [00:00<00:00, 19366.44it/s]


Constructing compressed sparse row (CSR) matrix...


100%|██████████| 1047/1047 [00:00<00:00, 189621.15it/s]
  return torch.sparse.FloatTensor(indices, values, shape)


# 任务1：1000步训练并每20步报告损失

In [22]:
# 创建TensorDataset和DataLoader以进行批处理
dataset = TensorDataset(X_train, y_train)
batch_size = 64  # 或者您选择的其他批次大小
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# 初始化模型
model = LogisticRegressionPyTorch(X_train.shape[1])
criterion = nn.BCELoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

def train_model_1000_steps(model, dataloader, criterion, optimizer, total_steps=1000, report_interval=20):
    model.train()
    step = 0
    while step < total_steps:
        for inputs, labels in dataloader:
            if step >= total_steps:
                break
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if step % report_interval == 0:
                print(f'Step {step}, Loss: {loss.item()}')
            step += 1

# 调用函数训练模型
train_model_1000_steps(model, dataloader, criterion, optimizer)

Step 0, Loss: 0.6903110146522522
Step 20, Loss: 0.38507160544395447
Step 40, Loss: 0.33181700110435486
Step 60, Loss: 0.3208434581756592
Step 80, Loss: 0.4340411424636841
Step 100, Loss: 0.3424816429615021
Step 120, Loss: 0.23766645789146423
Step 140, Loss: 0.42486169934272766
Step 160, Loss: 0.170060932636261
Step 180, Loss: 0.4934781789779663
Step 200, Loss: 0.1974051594734192
Step 220, Loss: 0.1549217700958252
Step 240, Loss: 0.3302558362483978
Step 260, Loss: 0.2734345495700836
Step 280, Loss: 0.24036286771297455
Step 300, Loss: 0.2562035918235779
Step 320, Loss: 0.40244612097740173
Step 340, Loss: 0.19411596655845642
Step 360, Loss: 0.32827919721603394
Step 380, Loss: 0.3956741690635681
Step 400, Loss: 0.2102421671152115
Step 420, Loss: 0.2836611270904541
Step 440, Loss: 0.21319742500782013
Step 460, Loss: 0.2656530737876892
Step 480, Loss: 0.2881655693054199
Step 500, Loss: 0.3474210500717163
Step 520, Loss: 0.2889809310436249
Step 540, Loss: 0.22974801063537598
Step 560, Loss: 0

# 任务2：至少1个epoch的训练并计算损失和F1分数

In [23]:
def train_model_1_epoch(model, dataloader, criterion, optimizer, evaluation_interval=50):
    model.train()
    step = 0
    for epoch in range(1):  # 1个epoch
        for inputs, labels in dataloader:
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if step % evaluation_interval == 0:
                model.eval()
                with torch.no_grad():
                    dev_outputs = model(X_dev).squeeze()
                    dev_predictions = (dev_outputs >= 0.5).float()
                    f1 = f1_score(y_dev.numpy(), dev_predictions.numpy())
                    print(f'Step {step}, Loss: {loss.item()}, F1 Score: {f1:.4f}')
                model.train()
            step += 1

# 调用函数训练模型
train_model_1_epoch(model, dataloader, criterion, optimizer)

Step 0, Loss: 0.23298797011375427, F1 Score: 0.0404
Step 50, Loss: 0.30274245142936707, F1 Score: 0.0600
Step 100, Loss: 0.2189456671476364, F1 Score: 0.0600


# 任务3：添加正则化(L2惩罚)

In [24]:
l2_values = [0, 0.001, 0.1]
for l2 in l2_values:
    model = LogisticRegressionPyTorch(X_train.shape[1])
    optimizer = optim.SGD(model.parameters(), lr=0.1, weight_decay=l2)
    print(f"Training with L2 penalty = {l2}")
    train_model_1_epoch(model, dataloader, criterion, optimizer)

Training with L2 penalty = 0
Step 0, Loss: 0.7011913061141968, F1 Score: 0.0336
Step 50, Loss: 0.32451537251472473, F1 Score: 0.0000
Step 100, Loss: 0.40147462487220764, F1 Score: 0.0000
Training with L2 penalty = 0.001
Step 0, Loss: 0.6821038722991943, F1 Score: 0.0600
Step 50, Loss: 0.31333431601524353, F1 Score: 0.0000
Step 100, Loss: 0.3402955234050751, F1 Score: 0.0000
Training with L2 penalty = 0.1
Step 0, Loss: 0.692952036857605, F1 Score: 0.0345
Step 50, Loss: 0.4544515907764435, F1 Score: 0.0000
Step 100, Loss: 0.2909027636051178, F1 Score: 0.0000


# 任务4：使用不同的优化器

In [25]:
optimizers = {
    'RMSprop': optim.RMSprop(model.parameters(), lr=0.01),
    'AdamW': optim.AdamW(model.parameters(), lr=0.01)
}

for opt_name, optimizer in optimizers.items():
    model = LogisticRegressionPyTorch(X_train.shape[1])
    print(f"Training with optimizer {opt_name}")
    train_model_1_epoch(model, dataloader, criterion, optimizer)

Training with optimizer RMSprop
Step 0, Loss: 0.7103024125099182, F1 Score: 0.1521
Step 50, Loss: 0.7111021876335144, F1 Score: 0.1521
Step 100, Loss: 0.708824872970581, F1 Score: 0.1521
Training with optimizer AdamW
Step 0, Loss: 0.6967037916183472, F1 Score: 0.1372
Step 50, Loss: 0.6896767020225525, F1 Score: 0.1372
Step 100, Loss: 0.687341570854187, F1 Score: 0.1372


# 任务5：不同的分词方法对比

In [27]:
def simple_tokenize(input_str):
    # Convert to lowercase and split into words based on whitespace
    return input_str.lower().split()

# Prepare the document list using simple_tokenize
simple_document_list = []
print("Creating document list for training with simple_tokenize...")
for idx, record in tqdm(df_train.iterrows(), total=df_train.shape[0]):
    simple_document_list.append(simple_tokenize(record["text"]))

# Generate the CSR matrix using simple_tokenize
simple_training_matrix, simple_training_vocab = createCompressedSparseRowMatrix(simple_document_list, vocab=training_vocab, frequencyFilter=term_dictionary)

# Add a bias column to the matrix
simple_bias_column = np.ones((simple_training_matrix.shape[0], 1))
simple_training_matrix_with_bias = sparse.hstack([simple_training_matrix, simple_bias_column]).tocsr()

# Convert the new matrix to a PyTorch sparse tensor
simple_X_train = to_sparse_tensor(simple_training_matrix_with_bias)

def train_and_evaluate(X_train, y_train, batch_size, model, criterion, optimizer, num_epochs=1, evaluation_interval=50):
    # 创建TensorDataset和DataLoader以进行批处理
    dataset = TensorDataset(X_train, y_train)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # 存储评估结果
    f1_scores = []

    # 训练模型
    step = 0
    for epoch in range(num_epochs):
        model.train()
        for inputs, labels in dataloader:
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # 每隔一定步数进行评估
            if step % evaluation_interval == 0:
                model.eval()
                with torch.no_grad():
                    dev_outputs = model(X_dev).squeeze()
                    dev_predictions = (dev_outputs >= 0.5).float()
                    f1 = f1_score(y_dev.numpy(), dev_predictions.numpy())
                    f1_scores.append(f1)
                    print(f'Epoch {epoch}, Step {step}, Loss: {loss.item()}, F1 Score: {f1:.4f}')
                model.train()
            step += 1

    return f1_scores

# 使用better_tokenize的训练数据
f1_scores_better = train_and_evaluate(X_train, y_train, batch_size, model, criterion, optimizer)

# 使用simple_tokenize的训练数据
model_simple = LogisticRegressionPyTorch(simple_X_train.shape[1])  # 假设simple_X_train是简单分词的结果
optimizer_simple = optim.SGD(model_simple.parameters(), lr=0.1)  # 使用相同的学习率
f1_scores_simple = train_and_evaluate(simple_X_train, y_train, batch_size, model_simple, criterion, optimizer_simple)

Creating document list for training with simple_tokenize...


100%|██████████| 7328/7328 [00:00<00:00, 18719.26it/s]


Constructing compressed sparse row (CSR) matrix...


100%|██████████| 7328/7328 [00:00<00:00, 174289.96it/s]


Epoch 0, Step 0, Loss: 0.6953454613685608, F1 Score: 0.1372
Epoch 0, Step 50, Loss: 0.6914268732070923, F1 Score: 0.1372
Epoch 0, Step 100, Loss: 0.6905796527862549, F1 Score: 0.1372
Epoch 0, Step 0, Loss: 0.7059181928634644, F1 Score: 0.1242
Epoch 0, Step 50, Loss: 0.33203646540641785, F1 Score: 0.0000
Epoch 0, Step 100, Loss: 0.31209224462509155, F1 Score: 0.0000


# 任务6：学习率的影响

In [28]:
learning_rates = [0.001, 0.1, 1]
for lr in learning_rates:
    model = LogisticRegressionPyTorch(X_train.shape[1])
    optimizer = optim.SGD(model.parameters(), lr=lr)
    print(f"Training with learning rate {lr}")
    train_model_1_epoch(model, dataloader, criterion, optimizer)

Training with learning rate 0.001
Step 0, Loss: 0.6954245567321777, F1 Score: 0.1088
Step 50, Loss: 0.6852775812149048, F1 Score: 0.0912
Step 100, Loss: 0.6572492718696594, F1 Score: 0.0000
Training with learning rate 0.1
Step 0, Loss: 0.6867693066596985, F1 Score: 0.0200
Step 50, Loss: 0.3838656544685364, F1 Score: 0.0000
Step 100, Loss: 0.18682701885700226, F1 Score: 0.0000
Training with learning rate 1
Step 0, Loss: 0.6797922849655151, F1 Score: 0.0000
Step 50, Loss: 0.25589367747306824, F1 Score: 0.0000
Step 100, Loss: 0.2588452100753784, F1 Score: 0.0777
