<a href="https://colab.research.google.com/github/wangyeye66/projects/blob/main/Copy_of_Twitter_Sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Loading Data

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

Your runtime has 54.8 gigabytes of available RAM

You are using a high-RAM runtime!


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset

# Load dataset
data_columns = ["target", "ids", "date", "flag", "user", "text"]
data_encoding = "ISO-8859-1"
file_path = '/content/drive/MyDrive/colab data/twitter_sentiment.csv'

df_total = pd.DataFrame()

# Read the file in chunks
chunksize = 10000  # Adjust based on your needs and memory constraints
try:
    for chunk in pd.read_csv(file_path, encoding=data_encoding, names=data_columns, header=None, chunksize=chunksize, on_bad_lines='skip'):
        df_total = pd.concat([df_total, chunk], ignore_index=True)
except pd.errors.ParserError as e:
    print(f"Skipping a problematic chunk: {e}")

# Check the DataFrame
df_total.head()

Skipping a problematic chunk: Error tokenizing data. C error: EOF inside string starting at row 2042802


Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [5]:
df=df_total.copy()

# drop unnecessary columns
df = df.drop(['ids', 'date', 'flag', 'user'], axis=1)

In [6]:
# a valid 'target' value should have a length of 1
df['target'] = df['target'].astype(str)

is_valid = df['target'].apply(lambda x: len(x)==1)
df = df[is_valid]

In [7]:
df.head()

Unnamed: 0,target,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [8]:
# handle missing value
print(f'Before dropna: {df.shape}')
df.dropna(subset=['text'], inplace=True)
print(f'After dropna: {df.shape}')

print(df.isnull().any())

Before dropna: (2039996, 2)
After dropna: (2039892, 2)
target    False
text      False
dtype: bool


In [28]:
# randomly select 20000 pieces of data
df = df.sample(n=20000)


The 'target' is binary, so we change 4 to 1
- 0 -> Negative
- 1 -> Positive

In [29]:
df['target'] = df['target'].astype(int).replace(4, 1)

In [30]:
df['target'].value_counts()

0    14069
1     5931
Name: target, dtype: int64

In [31]:
# Split the data
train_txt, test_txt, train_label, test_label = train_test_split(df['text'], df['target'], test_size=.2)

# Initialize tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [32]:
# Convert texts to list after ensuring there are no NaN values
train_txt_list = train_txt.tolist()
test_txt_list = test_txt.tolist()

# Ensure lists are being passed to the tokenizer
train_encodings = tokenizer(train_txt_list, truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(test_txt_list, truncation=True, padding=True, max_length=128)


### Tokenization

### Dataset Class Setup

In [33]:
class TwitterDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = TwitterDataset(train_encodings, train_label.tolist())
val_dataset = TwitterDataset(test_encodings, test_label.tolist())

### Model and Training Setup

Bert

In [43]:
import torch
if torch.cuda.is_available():
    print("CUDA (GPU) is available and enabled!")
    device = torch.device("cuda")
else:
    print("CUDA (GPU) is not available. Training will be slow using CPU.")
    device = torch.device("cpu")

# Ensure CUDA is available and set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

CUDA (GPU) is available and enabled!
Using device: cuda


In [35]:
# !pip install transformers[torch] -U

In [46]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import roc_curve, auc, roc_auc_score
import numpy as np

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

training_args = TrainingArguments(
    output_dir='./results',          # Directory where the model predictions and checkpoints will be written.
    num_train_epochs=3,              # Total number of training epochs to perform.
    per_device_train_batch_size=16,  # Batch size per device during training.
    per_device_eval_batch_size=64,   # Batch size for evaluation.
    warmup_steps=500,                # Number of warmup steps for learning rate scheduler.
    weight_decay=0.01,               # Weight decay if we apply some.
    logging_dir='./logs',            # Directory for storing logs.
    logging_steps=10,                # Log every X updates steps.
    evaluation_strategy="epoch",     # Evaluation is done at the end of each epoch.
    load_best_model_at_end=True,      # Load the best model when finished training (default metric is loss)
    save_strategy="epoch",            # Save strategy to adopt during training
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')
    acc = accuracy_score(labels, predictions)
    probs = torch.softmax(torch.tensor(logits), dim=-1).numpy()
    probs = probs[:, 1]   # Probability of positive class
    fpr, tpr, thresholds = roc_curve(labels, probs, pos_label=1)
    roc_auc = auc(fpr, tpr)

    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'roc_auc':roc_auc
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [47]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Roc Auc
1,0.3206,0.362681,0.8435,0.838834,0.839295,0.8435,0.899859
2,0.1986,0.420687,0.8465,0.84355,0.842913,0.8465,0.903897
3,0.101,0.653252,0.85175,0.851116,0.850614,0.85175,0.901627


Checkpoint destination directory ./results/checkpoint-1000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-2000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-3000 already exists and is non-empty. Saving will proceed but saved results may be invalid.


TrainOutput(global_step=3000, training_loss=0.26735584640006227, metrics={'train_runtime': 380.0991, 'train_samples_per_second': 126.283, 'train_steps_per_second': 7.893, 'total_flos': 3083332680000000.0, 'train_loss': 0.26735584640006227, 'epoch': 3.0})

In [48]:
evaluation_results = trainer.evaluate()

In [50]:
res_bert = evaluation_results.copy()

Roberta

In [59]:
from transformers import RobertaForSequenceClassification, RobertaTokenizer, Trainer, TrainingArguments
from transformers import RobertaTokenizer

model_name = 'roberta-base'
# tokenization
tokenizer = RobertaTokenizer.from_pretrained(model_name)
train_encodings = tokenizer(train_txt_list, truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(test_txt_list, truncation=True, padding=True, max_length=128)
train_dataset = TwitterDataset(train_encodings, train_label.tolist())
val_dataset = TwitterDataset(test_encodings, test_label.tolist())

In [60]:
model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=2)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

trainer.train()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Roc Auc
1,0.3565,0.355982,0.84975,0.849462,0.849202,0.84975,0.903818
2,0.2412,0.33235,0.86275,0.86209,0.861599,0.86275,0.920424
3,0.1771,0.504075,0.8665,0.866088,0.865747,0.8665,0.922134


Checkpoint destination directory ./results/checkpoint-1000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-2000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-3000 already exists and is non-empty. Saving will proceed but saved results may be invalid.


TrainOutput(global_step=3000, training_loss=0.30979972146948176, metrics={'train_runtime': 403.5773, 'train_samples_per_second': 118.936, 'train_steps_per_second': 7.434, 'total_flos': 3157332664320000.0, 'train_loss': 0.30979972146948176, 'epoch': 3.0})

In [61]:
evaluation_results = trainer.evaluate()
res_roberta = evaluation_results.copy()

In [78]:
torch.cuda.empty_cache()

Bart

In [79]:
from transformers import BartForSequenceClassification, BartTokenizer, Trainer, TrainingArguments

model_name = "facebook/bart-large"
# tokenization
tokenizer = BartTokenizer.from_pretrained(model_name)
train_encodings = tokenizer(train_txt_list, truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(test_txt_list, truncation=True, padding=True, max_length=128)
train_dataset = TwitterDataset(train_encodings, train_label.tolist())
val_dataset = TwitterDataset(test_encodings, test_label.tolist())


In [80]:
model = BartForSequenceClassification.from_pretrained(model_name, num_labels=2)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

trainer.train()

Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


OutOfMemoryError: CUDA out of memory. Tried to allocate 198.00 MiB. GPU 0 has a total capacty of 15.77 GiB of which 14.38 MiB is free. Process 138071 has 15.76 GiB memory in use. Of the allocated memory 14.58 GiB is allocated by PyTorch, and 823.57 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
evaluation_results = trainer.evaluate()
res_bart = evaluation_results.copy()

In [81]:
pd.DataFrame(res_bert)

ValueError: If using all scalar values, you must pass an index

In [83]:
res_bert,res_roberta

({'eval_loss': 0.3626812994480133,
  'eval_accuracy': 0.8435,
  'eval_f1': 0.8388342710730207,
  'eval_precision': 0.8392949968478417,
  'eval_recall': 0.8435,
  'eval_roc_auc': 0.8998590791416261,
  'eval_runtime': 7.1436,
  'eval_samples_per_second': 559.94,
  'eval_steps_per_second': 8.819,
  'epoch': 3.0},
 {'eval_loss': 0.3323499858379364,
  'eval_accuracy': 0.86275,
  'eval_f1': 0.8620898169673088,
  'eval_precision': 0.8615987627578551,
  'eval_recall': 0.86275,
  'eval_roc_auc': 0.9204240269004881,
  'eval_runtime': 7.9556,
  'eval_samples_per_second': 502.792,
  'eval_steps_per_second': 7.919,
  'epoch': 3.0})