In [None]:
import torch
import torch.nn as nn
from transformers import BertModel, BertTokenizer
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score
from transformers import AdamW
import pandas as pd
import logging
import os

In [None]:
# Configure root logging for the notebook: INFO level, each record rendered as
# "<timestamp> - <LEVEL> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class COVIDTweetClassifier(nn.Module):
    """BERT encoder plus handcrafted features, classified into sentiment logits.

    The hidden state at sequence position 0 of the BERT output is concatenated
    with the handcrafted feature vector, passed through a
    Linear -> ReLU -> Dropout block and then a final linear layer.

    Args:
        num_classes: Number of output logits.
        dropout: Dropout probability applied after the hidden layer.
        model_name: Hugging Face checkpoint name used for both the encoder and
            the tokenizer. Defaults to the checkpoint the notebook was written
            for, so existing calls are unaffected.
    """

    # Length of the vector returned by ``extract_features``.
    NUM_HANDCRAFTED_FEATURES = 3

    def __init__(self, num_classes=5, dropout=0.3, model_name='bert-base-uncased'):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        # Read the encoder width from its config rather than hard-coding it, so
        # the first linear layer always matches the loaded checkpoint.
        hidden_size = self.bert.config.hidden_size
        self.feature_layer = nn.Sequential(
            nn.Linear(hidden_size + self.NUM_HANDCRAFTED_FEATURES, 512),
            nn.ReLU(),
            nn.Dropout(dropout)
        )
        self.classifier = nn.Linear(512, num_classes)
        # Not used by ``forward``; kept as an attribute so existing code that
        # reads ``model.tokenizer`` keeps working.
        self.tokenizer = BertTokenizer.from_pretrained(model_name)

    def forward(self, input_ids, attention_mask, features):
        """Return logits for a batch.

        Args:
            input_ids: Token ids forwarded to BERT.
            attention_mask: Attention mask forwarded to BERT.
            features: Handcrafted features, concatenated with the BERT vector
                along dim 1 (so it must be 2-D with matching batch size).

        Returns:
            The output of the final linear layer (one row of ``num_classes``
            logits per batch element).
        """
        # Keyword arguments keep the call independent of BERT's positional order.
        bert_output = self.bert(
            input_ids=input_ids, attention_mask=attention_mask
        ).last_hidden_state[:, 0, :]
        combined = torch.cat([bert_output, features], dim=1)
        return self.classifier(self.feature_layer(combined))

    @staticmethod
    def extract_features(text):
        """Handcrafted features matching training feature extraction.

        Returns a float32 tensor of length 3:
        whitespace-separated word count divided by 100, the number of '!' plus
        '?' characters, and 1/0 for whether the substring 'http' occurs.
        """
        return torch.tensor([
            len(text.split()) / 100,        # Normalized word count
            text.count('!') + text.count('?'),  # Punctuation intensity
            1 if 'http' in text else 0     # URL presence
        ], dtype=torch.float32)

class TweetDataset(Dataset):
    """Map-style dataset that tokenizes one tweet per item.

    Args:
        texts: Indexable collection of tweet strings.
        features: Indexable collection of per-tweet handcrafted features
            (``features[idx]`` is returned as-is).
        labels: Indexable collection of integer class ids.
        tokenizer: Object exposing ``encode_plus`` (a Hugging Face tokenizer).
        max_len: Length every sequence is padded/truncated to.
    """

    def __init__(self, texts, features, labels, tokenizer, max_len=128):
        self.texts = texts
        self.features = features
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, idx):
        """Return a dict with 'input_ids', 'attention_mask', 'features', 'labels'."""
        encoding = self.tokenizer.encode_plus(
            self.texts[idx],
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            # squeeze(0) removes only the leading batch axis added by
            # return_tensors='pt'; a bare squeeze() would also collapse the
            # sequence axis when max_len == 1.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'features': self.features[idx],
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

    def __len__(self):
        return len(self.texts)



In [None]:
def load_data(data_path):
    """Read a tweet CSV and return texts, handcrafted features and labels.

    The CSV is read as latin1 and must contain 'OriginalTweet' and 'Sentiment'
    columns.

    Args:
        data_path: Path to the CSV file.

    Returns:
        Tuple ``(texts, features, labels)``: a list of stripped tweet strings,
        a float32 tensor with one row of handcrafted features per tweet, and a
        list of class ids (0 = 'Extremely Negative' ... 4 = 'Extremely Positive').

    Raises:
        ValueError: If a 'Sentiment' value is not one of the five known labels.
    """
    label_map = {
        'Extremely Negative': 0, 'Negative': 1,
        'Neutral': 2, 'Positive': 3, 'Extremely Positive': 4
    }
    df = pd.read_csv(data_path, encoding='latin1')
    df['text'] = df['OriginalTweet'].str.strip()
    df['label'] = df['Sentiment'].map(label_map)

    # An unknown sentiment maps to NaN, which would otherwise surface much
    # later as a broken label tensor; fail here with a clear message instead.
    unmapped = df['label'].isna()
    if unmapped.any():
        unknown = sorted(df.loc[unmapped, 'Sentiment'].astype(str).unique())
        raise ValueError(f"Unrecognised Sentiment value(s) in {data_path}: {unknown}")

    # Stack the per-tweet tensors directly: building a tensor from a list of
    # numpy arrays is the slow path PyTorch warns about.
    feature_rows = [COVIDTweetClassifier.extract_features(text) for text in df['text']]
    if feature_rows:
        features = torch.stack(feature_rows)
    else:
        features = torch.empty((0, 3), dtype=torch.float32)
    return df['text'].tolist(), features, df['label'].tolist()


In [None]:

def train(train_path='Corona_NLP_train.csv', test_path='Corona_NLP_test.csv',
          epochs=3, patience=2, checkpoint_path='covid_model.pth'):
    """Fine-tune ``COVIDTweetClassifier`` and save the best-scoring weights.

    After every epoch the model is scored on the data loaded from ``test_path``
    (weighted F1); the weights are saved whenever that score improves, and
    training stops early after ``patience`` consecutive epochs without
    improvement.

    Args:
        train_path: CSV used for training.
        test_path: CSV used for the per-epoch evaluation.
        epochs: Maximum number of epochs (default 3, kept low for runtime).
        patience: Non-improving epochs tolerated before stopping.
        checkpoint_path: Where the best state dict is written.

    Returns:
        The best weighted F1 observed.
    """
    # Load data
    train_texts, train_features, train_labels = load_data(train_path)
    test_texts, test_features, test_labels = load_data(test_path)

    # Prepare datasets
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    train_dataset = TweetDataset(train_texts, train_features, train_labels, tokenizer)
    test_dataset = TweetDataset(test_texts, test_features, test_labels, tokenizer)

    # Training setup
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = COVIDTweetClassifier().to(device)
    model = nn.DataParallel(model)  # Enable multi-GPU support
    optimizer = AdamW(model.parameters(), lr=2e-5)
    criterion = nn.CrossEntropyLoss()  # built once instead of once per batch
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=64)

    # Early stopping state
    best_f1 = 0
    patience_counter = 0

    # Training loop
    for epoch in range(epochs):
        model.train()
        logging.info(f'Starting epoch {epoch+1}')
        for batch in train_loader:
            inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(**inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

        # Validation
        # NOTE(review): the same split drives checkpoint selection/early
        # stopping and is reported as the "test" score, so that score is
        # optimistic. Consider holding out part of the training data instead.
        model.eval()
        all_preds, all_labels = [], []
        with torch.no_grad():
            for batch in test_loader:
                inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
                outputs = model(**inputs)
                all_preds.extend(outputs.argmax(dim=1).cpu().numpy())
                all_labels.extend(batch['labels'].numpy())

        f1 = f1_score(all_labels, all_preds, average='weighted')
        logging.info(f"Epoch {epoch+1} | Test F1: {f1:.3f}")

        # Check for early stopping
        if f1 > best_f1:
            best_f1 = f1
            patience_counter = 0
            # Save the wrapped module's weights so the checkpoint keys carry no
            # DataParallel 'module.' prefix; loaders that strip that prefix
            # still accept the file because stripping is then a no-op.
            torch.save(model.module.state_dict(), checkpoint_path)
            logging.info('Model improved and saved.')
        else:
            patience_counter += 1
            logging.info('No improvement.')
            if patience_counter >= patience:
                logging.info('Early stopping triggered.')
                break

    return best_f1

if __name__ == '__main__':
    train()


  return df['text'].tolist(), torch.tensor(features), df['label'].tolist()
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
import torch
class TweetPredictor:
    """Load a saved ``COVIDTweetClassifier`` checkpoint and classify single tweets.

    Args:
        model_path: Path to the state dict written by training.
        max_len: Token length each tweet is padded/truncated to.
    """

    def __init__(self, model_path='covid_model.pth', max_len=128):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.max_len = max_len
        self.model = COVIDTweetClassifier().to(self.device)

        # weights_only=True avoids the torch.load security warning.
        state_dict = torch.load(model_path, map_location=self.device, weights_only=True)
        self.model.load_state_dict(self._strip_module_prefix(state_dict))
        self.model.eval()

        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.class_map = {
            0: 'Extremely Negative', 1: 'Negative',
            2: 'Neutral', 3: 'Positive', 4: 'Extremely Positive'
        }

    @staticmethod
    def _strip_module_prefix(state_dict, prefix='module.'):
        """Return a copy of ``state_dict`` with a leading ``prefix`` removed from keys.

        Checkpoints saved from an ``nn.DataParallel`` wrapper have every key
        prefixed with 'module.'. Only a *leading* prefix is removed, so a key
        that merely contains the substring elsewhere is left intact.
        """
        return {
            (key[len(prefix):] if key.startswith(prefix) else key): value
            for key, value in state_dict.items()
        }

    def predict(self, text):
        """Return the sentiment name predicted for ``text``."""
        features = COVIDTweetClassifier.extract_features(text).to(self.device)
        encoding = self.tokenizer.encode_plus(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        ).to(self.device)

        with torch.no_grad():
            # features is 1-D; add the batch axis the model concatenates on.
            output = self.model(encoding['input_ids'], encoding['attention_mask'], features.unsqueeze(0))

        return self.class_map[output.argmax().item()]

# Example: build a predictor from the default checkpoint path and print the
# predicted sentiment name for one tweet.
if __name__ == '__main__':
    predictor = TweetPredictor()
    print(predictor.predict("Vaccine distribution is going great!"))

Extremely Positive


In [None]:
pip install onnxruntime

import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix


def evaluate_model(model, test_loader, device):
    """Run ``model`` over ``test_loader`` and compute classification metrics.

    Args:
        model: Module called as ``model(**inputs)`` with every batch entry
            except 'labels'; its output is arg-maxed over dim 1.
        test_loader: Iterable of dict batches containing a 'labels' tensor.
        device: Device the batch inputs are moved to.

    Returns:
        Tuple ``(accuracy, precision, recall, f1, report, confusion_matrix)``
        where precision/recall/F1 are weighted averages, ``report`` is the
        text classification report and the confusion matrix is 5x5.
    """
    class_names = ['Extremely Negative', 'Negative', 'Neutral', 'Positive', 'Extremely Positive']
    class_ids = list(range(len(class_names)))

    model.eval()  # Set model to evaluation mode
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in test_loader:
            inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
            outputs = model(**inputs)
            preds = outputs.argmax(dim=1)
            all_preds.extend(preds.cpu().numpy())
            # Labels are only compared on the host, so skip the device round-trip.
            all_labels.extend(batch['labels'].cpu().numpy())

    acc = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, average='weighted')
    recall = recall_score(all_labels, all_preds, average='weighted')
    f1 = f1_score(all_labels, all_preds, average='weighted')
    # Passing labels explicitly keeps target_names aligned (and the matrix 5x5)
    # even when some class never appears in this evaluation set.
    cls_report = classification_report(all_labels, all_preds,
                                       labels=class_ids,
                                       target_names=class_names)
    conf_matrix = confusion_matrix(all_labels, all_preds, labels=class_ids)
    return acc, precision, recall, f1, cls_report, conf_matrix

def main_evaluation(test_path='Corona_NLP_test.csv', model_path='covid_model.pth'):
    """Load the saved classifier, evaluate it on ``test_path`` and print metrics.

    Args:
        test_path: CSV with the evaluation tweets.
        model_path: State dict produced by training.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    test_texts, test_features, test_labels = load_data(test_path)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    test_dataset = TweetDataset(test_texts, test_features, test_labels, tokenizer)
    test_loader = DataLoader(test_dataset, batch_size=64)

    # Load the trained model
    model = COVIDTweetClassifier().to(device)
    # weights_only=True matches how TweetPredictor loads the same file and
    # avoids the torch.load FutureWarning.
    state_dict = torch.load(model_path, map_location=device, weights_only=True)
    # Drop only a leading 'module.' (added when saving from nn.DataParallel).
    state_dict = {
        (k[len('module.'):] if k.startswith('module.') else k): v
        for k, v in state_dict.items()
    }
    model.load_state_dict(state_dict)
    model.eval()

    acc, precision, recall, f1, cls_report, conf_matrix = evaluate_model(model, test_loader, device)

    print(f"Test Accuracy: {acc:.3f}")
    print(f"Test Precision: {precision:.3f}")
    print(f"Test Recall: {recall:.3f}")
    print(f"Test F1 Score: {f1:.3f}")
    print("\nClassification Report:\n", cls_report)
    print("\nConfusion Matrix:\n", conf_matrix)

if __name__ == '__main__':
    main_evaluation()


  state_dict = torch.load('covid_model.pth', map_location=device)


Test Accuracy: 0.852
Test Precision: 0.855
Test Recall: 0.852
Test F1 Score: 0.853

Classification Report:
                     precision    recall  f1-score   support

Extremely Negative       0.90      0.84      0.87       592
          Negative       0.82      0.86      0.84      1041
           Neutral       0.94      0.85      0.89       619
          Positive       0.80      0.84      0.82       947
Extremely Positive       0.86      0.88      0.87       599

          accuracy                           0.85      3798
         macro avg       0.87      0.85      0.86      3798
      weighted avg       0.85      0.85      0.85      3798


Confusion Matrix:
 [[499  89   1   3   0]
 [ 52 893  22  72   2]
 [  1  45 524  49   0]
 [  2  58  11 794  82]
 [  0   2   0  71 526]]


In [None]:
pip install onnx

Collecting onnx
  Downloading onnx-1.17.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (16 kB)
Downloading onnx-1.17.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.0/16.0 MB[0m [31m94.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: onnx
Successfully installed onnx-1.17.0


In [None]:
import torch

def export_to_onnx(model, dummy_input, onnx_file_path='covid_model.onnx'):
    """Export ``model`` to an ONNX file and print where it was written.

    Args:
        model: Module to export; it is switched to eval mode first.
        dummy_input: Example inputs handed to ``torch.onnx.export``.
        onnx_file_path: Destination file.
    """
    input_names = ['input_ids', 'attention_mask', 'features']
    output_names = ['output']
    # Axis 0 of every input and output is declared dynamic under the name
    # 'batch_size'; no other axis is marked dynamic.
    dynamic_axes = {name: {0: 'batch_size'} for name in input_names + output_names}

    model.eval()
    torch.onnx.export(
        model,
        dummy_input,
        onnx_file_path,
        input_names=input_names,
        output_names=output_names,
        dynamic_axes=dynamic_axes,
        opset_version=14,
    )
    print(f"Model exported to {onnx_file_path}")

# Example usage: rebuild the classifier, restore the trained weights and export it.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = COVIDTweetClassifier().to(device)
# weights_only=True matches how TweetPredictor loads the same file and avoids
# the torch.load FutureWarning.
state_dict = torch.load('covid_model.pth', map_location=device, weights_only=True)
# Drop only a leading 'module.' (added when saving from nn.DataParallel).
state_dict = {
    (k[len('module.'):] if k.startswith('module.') else k): v
    for k, v in state_dict.items()
}
model.load_state_dict(state_dict)

# Placeholder inputs with the shapes/dtypes used elsewhere in this notebook:
# a batch of one, 128 token positions, 3 handcrafted features.
dummy_input_ids = torch.zeros(1, 128, dtype=torch.int64, device=device)
dummy_attention_mask = torch.zeros(1, 128, dtype=torch.int64, device=device)
dummy_features = torch.zeros(1, 3, dtype=torch.float32, device=device)

export_to_onnx(model, (dummy_input_ids, dummy_attention_mask, dummy_features))


  state_dict = torch.load('covid_model.pth', map_location=device)


Model exported to covid_model.onnx


In [None]:
import onnxruntime as ort
import numpy as np
from transformers import BertTokenizer

class ONNXTweetPredictor:
    """Classify single tweets with the exported ONNX model via onnxruntime.

    Args:
        onnx_model_path: Path to the exported ``.onnx`` file.
        providers: Execution providers passed to ``ort.InferenceSession``.
            Defaults to CPU only; pass e.g. ``['CUDAExecutionProvider',
            'CPUExecutionProvider']`` to use a GPU build.
    """

    def __init__(self, onnx_model_path='covid_model.onnx', providers=None):
        # Name the providers explicitly: some onnxruntime builds refuse to
        # create a session when several providers are available and none is
        # chosen (see the InferenceSession docs).
        if providers is None:
            providers = ['CPUExecutionProvider']
        self.session = ort.InferenceSession(onnx_model_path, providers=providers)
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.class_map = {
            0: 'Extremely Negative', 1: 'Negative',
            2: 'Neutral', 3: 'Positive', 4: 'Extremely Positive'
        }

    def predict(self, text):
        """Return the sentiment name predicted for ``text``."""
        encoding = self.tokenizer.encode_plus(
            text,
            # Must equal the sequence length of the dummy input used at export
            # time (128 in this notebook); only the batch axis was exported as
            # dynamic.
            max_length=128,
            padding='max_length',
            truncation=True,
            return_tensors='np'
        )

        # The session is fed int64 ids/mask and a float32 (1, 3) feature row.
        input_ids = encoding['input_ids'].astype(np.int64)
        attention_mask = encoding['attention_mask'].astype(np.int64)
        features = (
            COVIDTweetClassifier.extract_features(text)
            .numpy()
            .astype(np.float32)
            .reshape(1, -1)
        )

        outputs = self.session.run(
            None,
            {
                'input_ids': input_ids,
                'attention_mask': attention_mask,
                'features': features
            }
        )

        # Cast the numpy integer to a plain int before the dict lookup.
        return self.class_map[int(np.argmax(outputs[0]))]

# Example usage: open the default ONNX file and print the predicted sentiment
# name for one tweet.
onnx_predictor = ONNXTweetPredictor()
print(onnx_predictor.predict("Vaccine distribution is going great!"))


Extremely Positive


# Random Forest

Some of the literature reports that random forests can give good results on this kind of task. So I again used BERT to encode the text and then trained a random forest on the resulting features. Unfortunately, the results are not as good as those of the neural network. I have nevertheless included my code and thought process below.

一些文献表明，随机森林在这类任务上可能会取得不错的结果。因此，我再次使用 BERT 对文本进行编码，然后用得到的特征训练随机森林模型。不幸的是，我们可以看到结果不如神经网络。尽管如此，我仍在下面附上了我的代码和思路。

In [None]:
import logging
import pandas as pd
import numpy as np  # Ensure numpy is imported
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertModel, BertTokenizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, f1_score, classification_report,
                             confusion_matrix, precision_score, recall_score)
from sklearn.model_selection import train_test_split


In [None]:
# Configure root logging for this section: INFO level, each record rendered as
# "<timestamp> - <LEVEL> - <message>".
# NOTE(review): the same call appears earlier in the notebook; presumably it is
# repeated so this section can run on its own — confirm.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class COVIDTweetClassifier(nn.Module):
    """BERT encoder fused with three handcrafted features for 5-way sentiment.

    NOTE(review): a class with this name is also defined earlier in the
    notebook; this later definition replaces it once the cell runs.
    """

    def __init__(self, num_classes=5, dropout=0.3):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        # 768 BERT dimensions + 3 handcrafted features -> 512 hidden units
        hidden_block = [
            nn.Linear(768 + 3, 512),
            nn.ReLU(),
            nn.Dropout(dropout),
        ]
        self.feature_layer = nn.Sequential(*hidden_block)
        self.classifier = nn.Linear(512, num_classes)
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def forward(self, input_ids, attention_mask, features):
        """Return the classifier output for a batch of tokenized tweets."""
        encoded = self.bert(input_ids, attention_mask)
        # Hidden state at sequence position 0 of every batch element.
        first_token_state = encoded.last_hidden_state[:, 0, :]
        fused = torch.cat([first_token_state, features], dim=1)
        hidden = self.feature_layer(fused)
        return self.classifier(hidden)

    @staticmethod
    def extract_features(text):
        """Build the 3-element float32 handcrafted feature vector for ``text``."""
        normalized_word_count = len(text.split()) / 100
        punctuation_intensity = text.count('!') + text.count('?')
        url_present = 1 if 'http' in text else 0
        return torch.tensor(
            [normalized_word_count, punctuation_intensity, url_present],
            dtype=torch.float32,
        )


In [None]:
class TweetDataset(Dataset):
    """Map-style dataset yielding tokenized tweets with features and labels.

    Args:
        texts: Indexable collection of tweet strings.
        features: Indexable collection of per-tweet handcrafted features
            (``features[idx]`` is returned unchanged).
        labels: Indexable collection of integer class ids.
        tokenizer: Object exposing ``encode_plus`` (a Hugging Face tokenizer).
        max_len: Length every sequence is padded/truncated to.
    """

    def __init__(self, texts, features, labels, tokenizer, max_len=128):
        self.texts = texts
        self.features = features
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, idx):
        """Return a dict with 'input_ids', 'attention_mask', 'features', 'labels'."""
        encoding = self.tokenizer.encode_plus(
            self.texts[idx],
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            # Remove only the leading batch axis; a bare squeeze() would also
            # collapse the sequence axis when max_len == 1.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'features': self.features[idx],
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

    def __len__(self):
        return len(self.texts)


In [None]:
def load_data(data_path):
    """Read a tweet CSV and return texts, handcrafted features and labels.

    The CSV is read as latin1 and must contain 'OriginalTweet' and 'Sentiment'
    columns.

    Args:
        data_path: Path to the CSV file.

    Returns:
        Tuple ``(texts, features, labels)``: a list of stripped tweet strings,
        a float32 tensor with one row of handcrafted features per tweet, and a
        list of class ids (0 = 'Extremely Negative' ... 4 = 'Extremely Positive').

    Raises:
        ValueError: If a 'Sentiment' value is not one of the five known labels.
    """
    label_map = {
        'Extremely Negative': 0, 'Negative': 1,
        'Neutral': 2, 'Positive': 3, 'Extremely Positive': 4
    }
    df = pd.read_csv(data_path, encoding='latin1')
    df['text'] = df['OriginalTweet'].str.strip()
    df['label'] = df['Sentiment'].map(label_map)

    # An unknown sentiment maps to NaN and would otherwise only fail later,
    # far from the cause; report it here.
    unmapped = df['label'].isna()
    if unmapped.any():
        unknown = sorted(df.loc[unmapped, 'Sentiment'].astype(str).unique())
        raise ValueError(f"Unrecognised Sentiment value(s) in {data_path}: {unknown}")

    # Stack the per-tweet tensors directly instead of converting a list of
    # numpy arrays, which is the slow path PyTorch warns about.
    feature_rows = [COVIDTweetClassifier.extract_features(text) for text in df['text']]
    if feature_rows:
        features = torch.stack(feature_rows)
    else:
        features = torch.empty((0, 3), dtype=torch.float32)
    return df['text'].tolist(), features, df['label'].tolist()

def train_random_forest(data_path='Corona_NLP_train.csv'):
    """Train and evaluate a random forest on BERT embeddings plus handcrafted features.

    Each tweet is represented by the pretrained (not fine-tuned)
    'bert-base-uncased' hidden state at sequence position 0, concatenated with
    the handcrafted features from ``load_data``. The rows are split 80/20 into
    train/validation sets, a 100-tree random forest is fitted, and validation
    metrics are printed.

    Args:
        data_path: CSV to load tweets from.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    class_names = ['Extremely Negative', 'Negative', 'Neutral', 'Positive', 'Extremely Positive']
    class_ids = list(range(len(class_names)))

    texts, features, labels = load_data(data_path)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    dataset = TweetDataset(texts, features, labels, tokenizer)
    # shuffle=False keeps the feature rows aligned with ``labels``.
    data_loader = DataLoader(dataset, batch_size=32, shuffle=False)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # Only the encoder is used here, so load it directly rather than building
    # a full COVIDTweetClassifier (untrained head plus a second tokenizer).
    bert = BertModel.from_pretrained('bert-base-uncased').to(device)
    bert.eval()

    # Extract BERT features
    all_features = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            handcrafted_features = batch['features'].to(device)
            bert_output = bert(input_ids, attention_mask).last_hidden_state[:, 0, :]
            combined_features = torch.cat([bert_output, handcrafted_features], dim=1)
            all_features.append(combined_features.cpu().numpy())

    all_features = np.vstack(all_features)
    labels = np.array(labels)

    # Split data for training and validation
    X_train, X_val, y_train, y_val = train_test_split(all_features, labels, test_size=0.2, random_state=42)

    # Train Random Forest
    rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_classifier.fit(X_train, y_train)

    # Evaluate Random Forest
    y_pred = rf_classifier.predict(X_val)
    acc = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred, average='weighted')
    recall = recall_score(y_val, y_pred, average='weighted')
    f1 = f1_score(y_val, y_pred, average='weighted')
    # Explicit labels keep target_names aligned (and the matrix 5x5) even if a
    # class is missing from the validation split.
    cls_report = classification_report(
        y_val, y_pred,
        labels=class_ids,
        target_names=class_names
    )
    conf_matrix = confusion_matrix(y_val, y_pred, labels=class_ids)

    # Print out the metrics
    print(f"Random Forest Accuracy: {acc:.3f}")
    print(f"Random Forest Precision: {precision:.3f}")
    print(f"Random Forest Recall: {recall:.3f}")
    print(f"Random Forest F1 Score: {f1:.3f}")
    print("\nClassification Report:\n" + cls_report)
    print("\nConfusion Matrix:\n" + str(conf_matrix))

    return rf_classifier

if __name__ == '__main__':
    train_random_forest()
