In [1]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset, random_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup

In [2]:
Paris_1 = pd.read_csv('Coding_Paris_Week1.csv')
Paris_2 = pd.read_csv('Coding_Paris_Week2.csv')
Paris_3 = pd.read_csv('Coding_Paris_Week3.csv')
Paris_4 = pd.read_csv('Coding_Paris_Week4.csv')
Paris_5 = pd.read_csv('Coding_Paris_Week5.csv')
Shawn_1 = pd.read_csv('Coding_Shawn_Week1.csv')
Shawn_2 = pd.read_csv('Coding_Shawn_Week2.csv')
Shawn_3 = pd.read_csv('Coding_Shawn_Week3.csv')
Shawn_4 = pd.read_csv('Coding_Shawn_Week4.csv')
Shawn_5 = pd.read_csv('Coding_Shawn_Week5.csv')
Tianli_1 = pd.read_csv('Coding_Tianli_Week1.csv')
Tianli_2 = pd.read_csv('Coding_Tianli_Week2.csv')
Tianli_3 = pd.read_csv('Coding_Tianli_Week3.csv')
Tianli_4 = pd.read_csv('Coding_Tianli_Week4.csv')
Tianli_5 = pd.read_csv('Coding_Tianli_Week5.csv')

In [3]:
# Combine all the data frames into one
combined_df = pd.concat([
    Paris_1, Paris_2, Paris_3, Paris_4, Paris_5,
    Shawn_1, Shawn_2, Shawn_3, Shawn_4, Shawn_5,
    Tianli_1, Tianli_2, Tianli_3, Tianli_4, Tianli_5
], ignore_index=True)

combined_df['Sentiment'] = combined_df['Sentiment'].astype(int)



# Check the combined data frame
combined_df

Unnamed: 0,Tweet,Sentiment
0,RT @biancale_monash ATTN: Aus women interested...,3
1,The future will be full of lab grown meat: htt...,1
2,The Future Of Meat: 45 In Vitro Meat Recipes Y...,4
3,Some makers of lab-grown meat have adopted a c...,3
4,Lab grown meat doesn’t sit well with me,2
...,...,...
22970,I've unexpectedly ended up with David Lewis on...,3
22971,cheap cultivated meat https://t.co/hsXLscDaVS,3
22972,"@BobsBlog I mean to be clear, it depends exact...",3
22973,The market for cultured meat is no joke (prese...,3


In [4]:
train_df = combined_df.sample(frac=0.8,random_state=2024)

validation_df = combined_df.drop(train_df.index)

In [6]:
train_df

Unnamed: 0,Tweet,Sentiment
16487,'Cultured meat' could spell end of traditional...,1
14558,Lab-grown meat is here – but will vegetarians ...,3
1756,RT @GoodFoodScience Did you miss last week's S...,3
7209,Home grown hamburgers? Ew!! - Is 'in vitro mea...,2
317,"50 years from now, real meat will be a luxury ...",1
...,...,...
5067,@Yea3601 @ItsMeChase1 @ShotgunWillard @unusual...,2
878,"RT @NinesCatudio So, would you eat cultivated ...",3
9590,Finally a dream comes true: in-vitro-meat aka ...,1
12755,Lab grown meat !!! Oh my its making my tummy t...,1


In [7]:
validation_df

Unnamed: 0,Tweet,Sentiment
10,@csimpsyo @Tbogin @jonlovett Cultured meat,3
13,RT @ndonyourtable What's the difference betwee...,3
14,#Technology #Tech Lab-Grown Meat Is Coming htt...,3
21,This year is the first time cultivated meat ha...,3
34,"RT @NewHarvestOrg 🍗and @UmaValeti, who co-foun...",3
...,...,...
22951,@Joseph_Plant What goes into lab grown meat? I...,2
22955,Google Funding Lab Grown Meat… No Animals Kill...,1
22958,RT @Orbyne #LSEForum cultured meat avoids the ...,1
22969,@MusadADroid @AuthorGusPegel The answer would ...,4


In [9]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Adjust labels from 1-4 to 0-3
train_tweets = train_df['Tweet'].tolist()
train_labels = [label - 1 for label in train_df['Sentiment'].tolist()]

val_tweets = validation_df['Tweet'].tolist()
val_labels = [label - 1 for label in validation_df['Sentiment'].tolist()]

# 2. Define a custom dataset
class SentimentDataset(Dataset):
    def __init__(self, tweets, labels, tokenizer, max_len):
        self.tweets = tweets
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.tweets)

    def __getitem__(self, item):
        tweet = str(self.tweets[item])
        label = self.labels[item]
        encoding = self.tokenizer.encode_plus(
            tweet,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'tweet_text': tweet,
            'input_ids': encoding['input_ids'][0],  # Avoid flatten() for correct shape
            'attention_mask': encoding['attention_mask'][0],
            'labels': torch.tensor(label, dtype=torch.long)
        }

# 3. Prepare the datasets and dataloaders
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
MAX_LEN = 128
BATCH_SIZE = 16
EPOCHS = 3

train_dataset = SentimentDataset(
    tweets=train_tweets, labels=train_labels, tokenizer=tokenizer, max_len=MAX_LEN)
val_dataset = SentimentDataset(
    tweets=val_tweets, labels=val_labels, tokenizer=tokenizer, max_len=MAX_LEN)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

# 4. Load the BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# 5. Define optimizer, scheduler, and loss function
optimizer = AdamW(model.parameters(), lr=2e-5)
total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=total_steps
)
loss_fn = torch.nn.CrossEntropyLoss().to(device)

# 6. Training and validation function
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler):
    model.train()
    for batch in data_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()

def eval_model(model, data_loader, loss_fn, device):
    model.eval()
    predictions = []
    true_labels = []
    losses = []
    
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits
            _, preds = torch.max(logits, dim=1)
            
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
            losses.append(loss.item())
    
    accuracy = accuracy_score(true_labels, predictions)
    return accuracy, sum(losses) / len(losses)

# 7. Train the model
for epoch in range(EPOCHS):
    print(f"Epoch {epoch + 1}/{EPOCHS}")
    train_epoch(model, train_loader, loss_fn, optimizer, device, scheduler)
    
    # Evaluate on validation set after each epoch
    val_accuracy, val_loss = eval_model(model, val_loader, loss_fn, device)
    print(f"Validation loss: {val_loss:.4f}, Validation accuracy: {val_accuracy:.4f}")

# 8. Save the model
import os
save_path = "./sentiment_model"
os.makedirs(save_path, exist_ok=True)
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch 1/3
Validation loss: 0.5631, Validation accuracy: 0.7758
Epoch 2/3
Validation loss: 0.5313, Validation accuracy: 0.7909
Epoch 3/3
Validation loss: 0.5727, Validation accuracy: 0.7832


('./sentiment_model\\tokenizer_config.json',
 './sentiment_model\\special_tokens_map.json',
 './sentiment_model\\vocab.txt',
 './sentiment_model\\added_tokens.json')

In [10]:
# 1. Import necessary libraries
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# 2. Define the evaluation function
def evaluate_model(model, data_loader):
    model.eval()
    predictions = []
    true_labels = []
    input_texts = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            tweets = batch["tweet_text"]  # Assuming 'text' is the key containing the tweet text

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            _, preds = torch.max(logits, dim=1)

            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
            input_texts.extend(tweets)  # Add the input text

    return input_texts, true_labels, predictions

# 3. Evaluate the model on the validation set
val_input_texts, val_true_labels, val_predictions = evaluate_model(model, val_loader)

# 4. Calculate evaluation metrics
accuracy = accuracy_score(val_true_labels, val_predictions)
weighted_precision = precision_score(val_true_labels, val_predictions, average='weighted')
weighted_recall = recall_score(val_true_labels, val_predictions, average='weighted')
weighted_f1 = f1_score(val_true_labels, val_predictions, average='weighted')

print(f'Accuracy: {accuracy:.4f}')
print(f'Weighted Precision: {weighted_precision:.4f}')
print(f'Weighted Recall: {weighted_recall:.4f}')
print(f'Weighted F1 Score: {weighted_f1:.4f}')

# 5. Display the classification report
print('\nClassification Report:')
print(classification_report(val_true_labels, val_predictions, digits=4))

# 6. Create and save the DataFrame
results_df = pd.DataFrame({
    'Tweet': val_input_texts,
    'True Label': val_true_labels,
    'Predicted Label': val_predictions
})

# Save as a CSV file
results_df.to_csv('bert_validation_predictions_with_tweets_1113.csv', index=False)
print("Predictions have been saved to 'bert_validation_predictions_with_tweets.csv'")

Accuracy: 0.7832
Weighted Precision: 0.7892
Weighted Recall: 0.7832
Weighted F1 Score: 0.7855

Classification Report:
              precision    recall  f1-score   support

           0     0.6568    0.7343    0.6934      1084
           1     0.6056    0.6085    0.6071       424
           2     0.8806    0.8432    0.8615      2984
           3     0.2900    0.2816    0.2857       103

    accuracy                         0.7832      4595
   macro avg     0.6083    0.6169    0.6119      4595
weighted avg     0.7892    0.7832    0.7855      4595

Predictions have been saved to 'bert_validation_predictions_with_tweets.csv'


In [11]:
results_df

Unnamed: 0,Tweet,True Label,Predicted Label
0,@csimpsyo @Tbogin @jonlovett Cultured meat,2,2
1,RT @ndonyourtable What's the difference betwee...,2,2
2,#Technology #Tech Lab-Grown Meat Is Coming htt...,2,2
3,This year is the first time cultivated meat ha...,2,2
4,"RT @NewHarvestOrg 🍗and @UmaValeti, who co-foun...",2,2
...,...,...,...
4590,@Joseph_Plant What goes into lab grown meat? I...,1,1
4591,Google Funding Lab Grown Meat… No Animals Kill...,0,0
4592,RT @Orbyne #LSEForum cultured meat avoids the ...,0,0
4593,@MusadADroid @AuthorGusPegel The answer would ...,3,0
