In [3]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
import numpy as np
import pandas as pd
from torch.optim import AdamW
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset, random_split
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import re

Model 1 - BERT-base-uncased

In [48]:
data = pd.read_csv("/Users/annikalu/Desktop/Capstone_Project/Winter/df_2023 - df_2023.csv") 
data.dropna(subset=['text'], inplace=True) 
data['text'] = data['text'].astype(str) 

In [50]:
def clean_text(text):
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text, re.I|re.A)
    text = text.strip()
    return text

data['text'] = data['text'].apply(clean_text)

sentiment_mapping = {"positive": 0, "negative": 1, "neutral": 2}
data['sentiment'] = data['sentiment'].map(sentiment_mapping)

train_texts, val_texts, train_labels, val_labels = train_test_split(data['text'], data['sentiment'], test_size=0.1, random_state=42)


In [51]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts.tolist(), truncation=True, padding=True, max_length=512)

In [52]:
train_dataset = TensorDataset(torch.tensor(train_encodings['input_ids']), 
                              torch.tensor(train_encodings['attention_mask']), 
                              torch.tensor(train_labels.tolist()))  

val_dataset = TensorDataset(torch.tensor(val_encodings['input_ids']), 
                            torch.tensor(val_encodings['attention_mask']), 
                            torch.tensor(val_labels.tolist()))  


train_loader = DataLoader(train_dataset, shuffle=True, batch_size=8)
val_loader = DataLoader(val_dataset, batch_size=8)


In [53]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [54]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [55]:
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)

In [56]:
epochs = 4
total_steps = len(train_loader) * epochs

In [57]:
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0, 
                                            num_training_steps=total_steps)


In [58]:
early_stopping_patience = 2
early_stopping_counter = 0
best_val_accuracy = 0

In [59]:
for epoch in range(epochs):

    model.train()
    total_train_loss = 0
    
    for batch in train_loader:
        batch = [item.to(device) for item in batch]
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[2]}
        
        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = outputs.loss
        total_train_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_loader)
    print(f"Epoch {epoch+1}, Average Training Loss: {avg_train_loss}")

    model.eval()
    total_eval_accuracy = 0
    
    for batch in val_loader:
        batch = [item.to(device) for item in batch]
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[2]}
        
        with torch.no_grad():
            outputs = model(**inputs)
        
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        total_eval_accuracy += (predictions == inputs['labels']).cpu().numpy().mean()
    
    avg_val_accuracy = total_eval_accuracy / len(val_loader)
    print(f"Epoch {epoch+1}, Validation Accuracy: {avg_val_accuracy}")

    if avg_val_accuracy > best_val_accuracy:
        best_val_accuracy = avg_val_accuracy
        early_stopping_counter = 0
        torch.save(model.state_dict(), 'best_model_state.bin')
    else:
        early_stopping_counter += 1
        if early_stopping_counter >= early_stopping_patience:
            print("Early stopping triggered.")
            break

print("Training complete!")


Epoch 1, Average Training Loss: 0.7309195413941243
Epoch 1, Validation Accuracy: 0.7267857142857144
Epoch 2, Average Training Loss: 0.6294722505768792
Epoch 2, Validation Accuracy: 0.7660714285714285
Epoch 3, Average Training Loss: 0.4601857930910392
Epoch 3, Validation Accuracy: 0.7714285714285715
Epoch 4, Average Training Loss: 0.29011209302994073
Epoch 4, Validation Accuracy: 0.7767857142857143
Training complete!


Model 2 - RoBERTa base

In [60]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification


In [61]:
data = pd.read_csv("/Users/annikalu/Desktop/Capstone_Project/Winter/df_2023 - df_2023.csv")  # 包含columns: ["text", "label"]
data.dropna(subset=['text'], inplace=True)  
data['text'] = data['text'].astype(str) 

In [62]:
def clean_text(text):
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text, re.I|re.A)
    text = text.strip()
    return text

data['text'] = data['text'].apply(clean_text)
sentiment_mapping = {"positive": 0, "negative": 1, "neutral": 2}
data['sentiment'] = data['sentiment'].map(sentiment_mapping)

train_texts, val_texts, train_labels, val_labels = train_test_split(data['text'], data['sentiment'], test_size=0.1)

In [63]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True, max_length=512, return_tensors="pt")
val_encodings = tokenizer(val_texts.tolist(), truncation=True, padding=True, max_length=512, return_tensors="pt")

train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], torch.tensor(train_labels.tolist()))
val_dataset = TensorDataset(val_encodings['input_ids'], val_encodings['attention_mask'], torch.tensor(val_labels.tolist()))

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [64]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=3)
model.to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)

train_loader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=8)
val_loader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=8)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [65]:
epochs = 4
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

In [66]:
for epoch in range(epochs):
    model.train()
    total_train_loss = 0
    for batch in train_loader:
        batch = tuple(b.to(device) for b in batch)
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[2]}
        
        model.zero_grad()
        outputs = model(**inputs)
        loss = outputs.loss
        total_train_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()
    
    print(f"Epoch {epoch+1}, Average Training Loss: {total_train_loss / len(train_loader)}")

    model.eval()
    total_eval_accuracy = 0
    for batch in val_loader:
        batch = tuple(b.to(device) for b in batch)
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[2]}
        
        with torch.no_grad():
            outputs = model(**inputs)
        
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        total_eval_accuracy += (predictions == inputs['labels']).cpu().numpy().mean()
    
    avg_val_accuracy = total_eval_accuracy / len(val_loader)
    print(f"Epoch {epoch+1}, Validation Accuracy: {avg_val_accuracy}")



Epoch 1, Average Training Loss: 0.7570660212000863
Epoch 1, Validation Accuracy: 0.6107142857142858
Epoch 2, Average Training Loss: 0.6387593933060521
Epoch 2, Validation Accuracy: 0.6910714285714287
Epoch 3, Average Training Loss: 0.5205966463220901
Epoch 3, Validation Accuracy: 0.6821428571428572
Epoch 4, Average Training Loss: 0.35721185937768124
Epoch 4, Validation Accuracy: 0.6642857142857144


Model 3 - XLNet

In [67]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from transformers import XLNetTokenizer, XLNetForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from transformers import get_linear_schedule_with_warmup
import numpy as np
import re

In [68]:
data = pd.read_csv("/Users/annikalu/Desktop/Capstone_Project/Winter/df_2023 - df_2023.csv")  # 包含columns: ["text", "label"]
data.dropna(subset=['text'], inplace=True)  
data['text'] = data['text'].astype(str) 

In [69]:
def clean_text(text):
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text, re.I|re.A)
    text = text.strip()
    return text

data['text'] = data['text'].apply(clean_text)
sentiment_mapping = {"positive": 0, "negative": 1, "neutral": 2}
data['sentiment'] = data['sentiment'].map(sentiment_mapping)

train_texts, val_texts, train_labels, val_labels = train_test_split(data['text'], data['sentiment'], test_size=0.1)

In [70]:
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True, max_length=512, return_tensors="pt")
val_encodings = tokenizer(val_texts.tolist(), truncation=True, padding=True, max_length=512, return_tensors="pt")

train_dataset = TensorDataset(torch.tensor(train_encodings['input_ids']), 
                              torch.tensor(train_encodings['attention_mask']), 
                              torch.tensor(train_labels.tolist()))
val_dataset = TensorDataset(torch.tensor(val_encodings['input_ids']), 
                            torch.tensor(val_encodings['attention_mask']), 
                            torch.tensor(val_labels.tolist()))

spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

  train_dataset = TensorDataset(torch.tensor(train_encodings['input_ids']),
  torch.tensor(train_encodings['attention_mask']),
  val_dataset = TensorDataset(torch.tensor(val_encodings['input_ids']),
  torch.tensor(val_encodings['attention_mask']),


In [72]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels=3)
model.to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)

pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [73]:
train_loader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=8)
val_loader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=8)

epochs = 4
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

In [74]:
for epoch in range(epochs):
    model.train()
    total_train_loss = 0
    for batch in train_loader:
        batch = tuple(b.to(device) for b in batch)
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[2]}
        
        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = outputs.loss
        total_train_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()
    
    print(f"Epoch {epoch+1}, Average Training Loss: {total_train_loss / len(train_loader)}")

    model.eval()
    total_eval_accuracy = 0
    for batch in val_loader:
        batch = tuple(b.to(device) for b in batch)
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[2]}
        
        with torch.no_grad():
            outputs = model(**inputs)
        
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        total_eval_accuracy += (predictions == inputs['labels']).cpu().numpy().mean()
    
    avg_val_accuracy = total_eval_accuracy / len(val_loader)
    print(f"Epoch {epoch+1}, Validation Accuracy: {avg_val_accuracy}")

print("Training complete!")

Epoch 1, Average Training Loss: 0.7699176620264523
Epoch 1, Validation Accuracy: 0.6446428571428572
Epoch 2, Average Training Loss: 0.6520734876394272
Epoch 2, Validation Accuracy: 0.6982142857142858
Epoch 3, Average Training Loss: 0.546017056605855
Epoch 3, Validation Accuracy: 0.6857142857142857
Epoch 4, Average Training Loss: 0.4073847284937491
Epoch 4, Validation Accuracy: 0.7035714285714285
Training complete!


Model-Final Version

In [85]:
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_cosine_schedule_with_warmup
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
import numpy as np

In [86]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [87]:
train_loader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=8)  # 尝试不同的batch_size
val_loader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=8)

In [88]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [89]:
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)  # 细调学习率和eps

CosineAnnealingLR

In [90]:
epochs = 4
total_steps = len(train_loader) * epochs
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

In [91]:
early_stopping_patience = 3  # 增加patience值
early_stopping_counter = 0
best_val_accuracy = 0

In [92]:
for epoch in range(epochs):
    model.train()
    total_train_loss = 0
    for batch in train_loader:
        batch = [item.to(device) for item in batch]
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
        
        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = outputs.loss
        total_train_loss += loss.item()
        
        loss.backward()
        optimizer.step()
        scheduler.step() 
    
    print(f"Epoch {epoch+1}, Average Training Loss: {total_train_loss / len(train_loader)}")
    

    model.eval()
    total_eval_accuracy = 0
    for batch in val_loader:
        batch = [item.to(device) for item in batch]
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
        
        with torch.no_grad():
            outputs = model(**inputs)
        
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        total_eval_accuracy += (predictions == inputs['labels']).cpu().numpy().mean()
    
    avg_val_accuracy = total_eval_accuracy / len(val_loader)
    print(f"Epoch {epoch+1}, Validation Accuracy: {avg_val_accuracy}")
    

    if avg_val_accuracy > best_val_accuracy:
        best_val_accuracy = avg_val_accuracy
        early_stopping_counter = 0
        torch.save(model.state_dict(), 'best_model_state.bin')
    else:
        early_stopping_counter += 1
        if early_stopping_counter >= early_stopping_patience:
            print("Early stopping triggered.")
            break

print("Training complete!")

Epoch 1, Average Training Loss: 0.7526052743196487
Epoch 1, Validation Accuracy: 0.8035714285714286
Epoch 2, Average Training Loss: 0.608486216576373
Epoch 2, Validation Accuracy: 0.8160714285714287
Epoch 3, Average Training Loss: 0.4697254466228798
Epoch 3, Validation Accuracy: 0.8071428571428572
Epoch 4, Average Training Loss: 0.38130947123052644
Epoch 4, Validation Accuracy: 0.7892857142857144
Training complete!


In [93]:
model.eval()
predictions = []
actuals = []
for batch in val_loader:
    batch = [item.to(device) for item in batch]
    inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}
    labels = batch[2]
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    pred_labels = torch.argmax(logits, dim=-1)
    predictions.extend(pred_labels.cpu().numpy())
    actuals.extend(labels.cpu().numpy())


errors = [(pred, actual, idx) for idx, (pred, actual) in enumerate(zip(predictions, actuals)) if pred != actual]


In [94]:
errors

[(0, 2, 2),
 (2, 0, 4),
 (0, 2, 5),
 (0, 2, 14),
 (2, 0, 20),
 (2, 0, 24),
 (2, 0, 35),
 (0, 2, 37),
 (2, 0, 38),
 (2, 0, 39),
 (2, 0, 41),
 (2, 0, 44),
 (2, 0, 51),
 (0, 2, 55),
 (0, 2, 65),
 (0, 2, 68),
 (2, 0, 78),
 (0, 2, 88),
 (2, 0, 90),
 (2, 1, 95),
 (2, 0, 99),
 (0, 2, 100),
 (0, 2, 107)]

In [95]:
for pred, actual, idx in errors:
    print(f"Index: {idx}, Predicted: {pred}, Actual: {actual}")
    print(f"Text: {val_texts.iloc[idx]}")
    print("---------")

Index: 2, Predicted: 0, Actual: 2
Text: Comments on Issue  Record Issues in Scope Circumvention and Covered Merchandise Inquiries for Companion AD and CVD OrdersWe note that there are still some ambiguities in Commerces practice that have caused distress and inconvenience to stakeholders and are not fully addressed by Commerces proposed rule For example while Commerce has launched a large number of anticircumvention inquiries it has not adequately published the initiation or determination announcements of the cases as it has done so on the Dumping and Subsidy Notification website We hope that Commerce will take note of this aspect so that relevant stakeholders can more easily search for anticircumvention cases Comments on Issue  Examples of Proposed Rules for Scope Clarification at Any Stage This revision significantly expands the jurisdiction of Commerce in defining the scope of antidumping and countervailing duty thereby intensifying uncertainty and compliance risks Specifically we i

In [102]:
train_data = pd.DataFrame({'text': train_texts, 'sentiment': train_labels})
val_data = pd.DataFrame({'text': val_texts, 'sentiment': val_labels})

In [103]:
train_sentiment_counts = train_data['sentiment'].value_counts()
print("Training set sentiment distribution:")
print(train_sentiment_counts)

Training set sentiment distribution:
sentiment
2    581
0    360
1     35
Name: count, dtype: int64


In [104]:
val_sentiment_counts = val_data['sentiment'].value_counts()
print("Validation set sentiment distribution:")
print(val_sentiment_counts)

Validation set sentiment distribution:
sentiment
2    68
0    40
1     1
Name: count, dtype: int64


In [106]:
total_data_points = len(data['text'])
total_data_points

1085

In [96]:
import torch
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

model.eval()
predictions = []
actuals = []
for batch in val_loader:
    batch = [item.to(device) for item in batch]
    inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}
    labels = batch[2]
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    pred_labels = torch.argmax(logits, dim=-1)
    predictions.extend(pred_labels.cpu().numpy())
    actuals.extend(labels.cpu().numpy())


accuracy = accuracy_score(actuals, predictions)
print(f"Accuracy: {accuracy:.4f}")

report = classification_report(actuals, predictions, target_names=['positive', 'negative', 'neutral'])
print(report)

precision = precision_score(actuals, predictions, average='weighted')
recall = recall_score(actuals, predictions, average='weighted')
f1 = f1_score(actuals, predictions, average='weighted')

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Accuracy: 0.7890
              precision    recall  f1-score   support

    positive       0.74      0.70      0.72        40
    negative       0.00      0.00      0.00         1
     neutral       0.82      0.85      0.83        68

    accuracy                           0.79       109
   macro avg       0.52      0.52      0.52       109
weighted avg       0.78      0.79      0.78       109

Precision: 0.7800
Recall: 0.7890
F1 Score: 0.7841


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Prediction

In [8]:
import torch
from transformers import BertForSequenceClassification

In [9]:
model_path = "/Users/annikalu/Desktop/Capstone_Project/Winter/best_model_state.bin"
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
model.load_state_dict(torch.load(model_path, map_location=torch.device('cuda' if torch.cuda.is_available() else 'cpu')))
model.eval()  # 設置模型為評估模式
print("Model parameters loaded")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model parameters loaded


In [10]:
model_path = "/Users/annikalu/Desktop/Capstone_Project/Winter/complete_model.pth"
torch.save(model, model_path)
print(f"Complete model saved to {model_path}")

# 加載完整模型
model = torch.load(model_path, map_location=torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
model.eval()
print("Complete model loaded")

Complete model saved to /Users/annikalu/Desktop/Capstone_Project/Winter/complete_model.pth
Complete model loaded


In [1]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset
import pandas as pd
import json
import re

In [2]:
model_path = "/Users/annikalu/Desktop/Capstone_Project/Winter/best_model_state.bin"
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
model.load_state_dict(torch.load(model_path, map_location=torch.device('cuda' if torch.cuda.is_available() else 'cpu')))
model.eval()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print("Model parameters loaded")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model parameters loaded


In [3]:
def load_jsonl(filename):
    with open(filename, 'r') as f:
        lines = f.readlines()
    data = [json.loads(line) for line in lines if '2023' not in line] 
    return data

unlabeled_data_path = "/Users/annikalu/Desktop/Capstone_Project/Winter/WTO_0123.jsonl"
unlabeled_data = load_jsonl(unlabeled_data_path)

In [14]:
def unified_preprocess(text):
    text = re.sub(r'http[s]?://\S+', '', text)
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text, re.I|re.A)
    text = text.strip()
    text = text.lower()
    return text

In [15]:
for item in unlabeled_data:
    item['text'] = unified_preprocess(item['text'])

In [17]:
texts = [item['text'] for item in unlabeled_data]

In [18]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [19]:
encodings = tokenizer(texts, truncation=True, padding=True, max_length=512)


In [37]:
dataset = TensorDataset(
    torch.tensor(encodings['input_ids'], dtype=torch.long),
    torch.tensor(encodings['attention_mask'], dtype=torch.long)
)
dataloader = DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=16)

results = []

In [38]:
with torch.no_grad():
    for batch in dataloader:
        input_ids, attention_mask = [item.to(device) for item in batch]
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1).cpu().numpy()
        results.extend(predictions)

In [39]:
for i, item in enumerate(unlabeled_data):
    item['sentiment'] = int(results[i])

In [40]:
df = pd.DataFrame(unlabeled_data)
output_csv_path = "/Users/annikalu/Desktop/Capstone_Project/Winter/labeled_data.csv"
df.to_csv(output_csv_path, index=False)

In [36]:
import pandas as pd
import json
import re
import fitz  # PyMuPDF
import requests
from io import BytesIO
from docx import Document
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
import numpy as np

In [7]:
processed_data_path = "/Users/annikalu/Desktop/Capstone_Project/Winter/blank_text_ vader.csv"
processed_df = pd.read_csv(processed_data_path)
processed_df = pd.DataFrame(processed_df)

In [13]:
unlabeled_df = pd.DataFrame(unlabeled_data)

In [15]:
pdf_data = unlabeled_df[unlabeled_df['id'].isin(processed_df['id'])]

In [23]:
pdf_data = pdf_data.dropna(subset=['downLoadUrl'])

In [31]:
df = pd.DataFrame(pdf_data)
output_csv_path = "/Users/annikalu/Desktop/Capstone_Project/Winter/urls_data.csv"
df.to_csv(output_csv_path, index=False)

In [37]:
def unified_preprocess(text):
    text = re.sub(r'http[s]?://\S+', '', text)
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text, re.I|re.A)
    text = text.strip()
    text = text.lower()
    return text

In [38]:
def extract_text_from_pdf(url):
    try:
        response = requests.get(url)
        pdf_document = fitz.open(stream=BytesIO(response.content), filetype="pdf")
        text = ""
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
            text += page.get_text()
        return text
    except Exception as e:
        print(f"Error reading PDF from {url}: {e}")
        return np.nan

def extract_text_from_csv(url):
    try:
        response = requests.get(url)
        csv_content = response.content.decode('utf-8')
        csv_data = pd.read_csv(BytesIO(csv_content.encode('utf-8')))
        text = csv_data.to_string()
        return text
    except Exception as e:
        print(f"Error reading CSV from {url}: {e}")
        return np.nan

def extract_text_from_doc(url):
    try:
        response = requests.get(url)
        doc = Document(BytesIO(response.content))
        text = "\n".join([para.text for para in doc.paragraphs])
        return text
    except Exception as e:
        print(f"Error reading DOC from {url}: {e}")
        return np.nan

def extract_text_from_txt(url):
    try:
        response = requests.get(url)
        text = response.content.decode('utf-8')
        return text
    except Exception as e:
        print(f"Error reading TXT from {url}: {e}")
        return np.nan

def process_file(url):
    if url.endswith('.pdf'):
        text = extract_text_from_pdf(url)
    elif url.endswith('.csv'):
        text = extract_text_from_csv(url)
    elif url.endswith('.doc') or url.endswith('.docx'):
        text = extract_text_from_doc(url)
    elif url.endswith('.txt'):
        text = extract_text_from_txt(url)
    else:
        print(f"Unsupported file type: {url}")
        return np.nan
    
    if isinstance(text, str):
        return unified_preprocess(text)
    else:
        return text

In [39]:
pdf_data['text'] = pdf_data['downLoadUrl'].apply(lambda url: process_file(url))

MuPDF error: format error: No default Layer config

Unsupported file type: https://s3.amazonaws.com/voxgov-static/extracted/D230621DB601930ABB35D1C346722113892ADE5CE1C50AD6BA4EE1E0C311F763.xlsx
MuPDF error: format error: No default Layer config

MuPDF error: format error: object out of range (614 0 R); xref size 581

MuPDF error: format error: object out of range (581 0 R); xref size 581

MuPDF error: format error: object out of range (582 0 R); xref size 581

MuPDF error: format error: object out of range (581 0 R); xref size 581

MuPDF error: format error: object out of range (614 0 R); xref size 581

MuPDF error: format error: object out of range (583 0 R); xref size 581

MuPDF error: format error: object out of range (614 0 R); xref size 581

MuPDF error: format error: object out of range (614 0 R); xref size 581

MuPDF error: format error: object out of range (583 0 R); xref size 581

MuPDF error: format error: object out of range (583 0 R); xref size 581

MuPDF error: format erro

In [40]:
pdf_data = pdf_data.dropna(subset=['text'])
pdf_data = pdf_data[pdf_data['text'].astype(bool)]

In [47]:
df_1 = pd.DataFrame(pdf_data)
output_csv_path = "/Users/annikalu/Desktop/Capstone_Project/AWS_EC2/after_urls_data.csv"
df_1.to_csv(output_csv_path, index=False)

In [41]:
print(pdf_data['text'].head())

84     audit\nseptember  \nreport \noffice of audits ...
87     f street nw  floor   \nwashington dc  \ntel   ...
94     no  \n \n \nin the united states court of appe...
95     climate \nimpact \nof plastics\njuly \nmckinse...
108    september   \n \nms michelle arsenault \nnatio...
Name: text, dtype: object


In [43]:
model_path = "/Users/annikalu/Desktop/Capstone_Project/Winter/best_model_state.bin"
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
model.load_state_dict(torch.load(model_path, map_location=torch.device('cuda' if torch.cuda.is_available() else 'cpu')))
model.eval()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print("Model parameters loaded")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model parameters loaded


In [48]:
# 提取文本数据
texts = pdf_data['text'].tolist()

# Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

encodings = tokenizer(texts, truncation=True, padding=True, max_length=512)

# 创建 dataset 和 dataloader
dataset = TensorDataset(
    torch.tensor(encodings['input_ids'], dtype=torch.long),
    torch.tensor(encodings['attention_mask'], dtype=torch.long)
)

In [None]:
batch_size = 4 
num_batches = (len(dataset) + batch_size - 1) // batch_size

def analyze_batch(start, end):
    sub_dataset = TensorDataset(
        dataset.tensors[0][start:end],
        dataset.tensors[1][start:end]
    )
    dataloader = DataLoader(sub_dataset, batch_size=batch_size, shuffle=False)

    batch_results = []
    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask = [item.to(device) for item in batch]
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1).cpu().numpy()
            batch_results.extend(predictions)
    return batch_results

results = []
for i in range(num_batches):
    start = i * batch_size
    end = min((i + 1) * batch_size, len(dataset))
    results.extend(analyze_batch(start, end))

pdf_data['sentiment'] = results