In [1]:
!pip install torch
!pip install transformers
!pip install pandas

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2ForSequenceClassification, GPT2Tokenizer, AdamW

Collecting transformers
  Downloading transformers-4.33.1-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.17.1-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.8/294.8 kB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m26.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m68.7 MB/s[0m eta [36m0:00:0

In [2]:
# Step 1: Load and preprocess the data
from google.colab import files
uploaded = files.upload()
import io
import pandas as pd

# Load the dataset into a pandas dataframe.
df = pd.read_csv(io.BytesIO(uploaded['train-processed2.csv']),delimiter=',', header=None, names=['sentence_source', 'label', 'texts'])


# Perform any necessary data cleaning and preprocessing here

Saving train-processed2.csv to train-processed2.csv


In [3]:
df.sample(5)

Unnamed: 0,sentence_source,label,texts
3458,3459,0,tostring method added
7447,7448,2,sorry this bug is fixed
724,725,0,suggestion your account or email address does ...
8163,8164,0,when i deploy my application on jboss the ejbs...
1157,1158,0,pixel bender files dont run under ios see othe...


In [4]:
missing_values = df.isnull().sum()
print(missing_values)

sentence_source    0
label              0
texts              0
dtype: int64


In [5]:
print(df.dtypes)
print(df.shape)

sentence_source     int64
label               int64
texts              object
dtype: object
(9500, 3)


In [6]:
# Step 2: Split the data
train_df = df.sample(frac=0.7, random_state=42)
val_df = df.drop(train_df.index).sample(frac=0.5, random_state=42)
test_df = df.drop(train_df.index).drop(val_df.index)


In [7]:
# Step 3: Fine-tuning
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        input_ids = encoding['input_ids'].squeeze()
        attention_mask = encoding['attention_mask'].squeeze()

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'label': torch.tensor(label)
        }

In [8]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the text sequences
tokens = tokenizer.batch_encode_plus(
    df['texts'].tolist(),
    truncation=True,
    padding=True,
    max_length=512,  # Adjust the maximum sequence length as needed
    return_tensors='pt'
)

# Create the input tensors
input_ids = tokens['input_ids']
attention_mask = tokens['attention_mask']
labels = torch.tensor(df['label'].tolist())

# Print the shape of the tensors
print("Input IDs shape:", input_ids.shape)
print("Attention Mask shape:", attention_mask.shape)
print("Labels shape:", labels.shape)

# Rest of the code for model inference or training


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Input IDs shape: torch.Size([9500, 512])
Attention Mask shape: torch.Size([9500, 512])
Labels shape: torch.Size([9500])


In [9]:
max_length = 128  # Adjust as needed

train_dataset = SentimentDataset(train_df['texts'].tolist(), train_df['label'].tolist(), tokenizer, max_length)
val_dataset = SentimentDataset(val_df['texts'].tolist(), val_df['label'].tolist(), tokenizer, max_length)
test_dataset = SentimentDataset(test_df['texts'].tolist(), test_df['label'].tolist(), tokenizer, max_length)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = GPT2ForSequenceClassification.from_pretrained('gpt2', num_labels=3)  # Adjust num_labels based on your task
model.config.pad_token_id = model.config.eos_token_id
model.to(device)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)
test_loader = DataLoader(test_dataset, batch_size=16)

optimizer = AdamW(model.parameters(), lr=1e-5)

num_epochs = 5  # Adjust as needed

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    average_loss = total_loss / len(train_loader)

    model.eval()
    val_accuracy = 0
    val_total = 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            logits = outputs.logits
            _, predicted_labels = torch.max(logits, dim=1)

            val_total += labels.size(0)
            val_accuracy += (predicted_labels == labels).sum().item()

    val_accuracy = val_accuracy / val_total

    print(f'Epoch {epoch+1}/{num_epochs} - Loss: {average_loss:.4f} - Val Accuracy: {val_accuracy:.4f}')



Downloading model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5 - Loss: 0.7691 - Val Accuracy: 0.8133
Epoch 2/5 - Loss: 0.4776 - Val Accuracy: 0.8379
Epoch 3/5 - Loss: 0.4050 - Val Accuracy: 0.8442
Epoch 4/5 - Loss: 0.3580 - Val Accuracy: 0.8463
Epoch 5/5 - Loss: 0.3132 - Val Accuracy: 0.8456


In [10]:
# Step 5: Evaluation
model.eval()
test_accuracy = 0
test_total = 0

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        logits = outputs.logits
        _, predicted_labels = torch.max(logits, dim=1)

        test_total += labels.size(0)
        test_accuracy += (predicted_labels == labels).sum().item()

test_accuracy = test_accuracy / test_total

print(f'Test Accuracy: {test_accuracy:.4f}')

# Step 6: Inference
# You can now use the fine-tuned model for sentiment analysis on new, unseen data.
# Pass tokenized input through the model and interpret the output predictions.


Test Accuracy: 0.8477


In [11]:
from google.colab import drive
drive.mount('/content/gdrive')


Mounted at /content/gdrive


In [12]:
# Step 6: Save the model on Google Drive
save_path = '/content/gdrive/MyDrive/Models/GPT2Model10E/'

model.save_pretrained(save_path)

# The model will be saved as 'pytorch_model.bin' in the specified path
tokenizer.save_pretrained(save_path)


('/content/gdrive/MyDrive/Models/GPT2Model10E/tokenizer_config.json',
 '/content/gdrive/MyDrive/Models/GPT2Model10E/special_tokens_map.json',
 '/content/gdrive/MyDrive/Models/GPT2Model10E/vocab.json',
 '/content/gdrive/MyDrive/Models/GPT2Model10E/merges.txt',
 '/content/gdrive/MyDrive/Models/GPT2Model10E/added_tokens.json')

In [13]:
from transformers import GPTForSequenceClassification, GPTTokenizer

# Load the saved model and tokenizer from Google Drive
save_path = '/content/gdrive/MyDrive/Models/fine_tuned_model/'

model = GPTForSequenceClassification.from_pretrained(save_path)
tokenizer = GPTTokenizer.from_pretrained(save_path)


ImportError: ignored

In [43]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load test data for prediction
from google.colab import files
import io

# Step 1: Load and preprocess the data
uploaded = files.upload()
test_data = pd.read_csv(io.BytesIO(uploaded['ebay-sa1.csv']),delimiter=',', header=None, names=['sentence_source', 'label', 'texts'])
# Preprocess the test data as needed


Saving ebay-sa1.csv to ebay-sa1.csv


In [44]:
# Create a dataset for prediction
prediction_dataset = SentimentDataset(test_data['texts'].tolist(), test_data['label'].tolist(), tokenizer, max_length)
prediction_loader = DataLoader(prediction_dataset, batch_size=16)

# Activate evaluation mode
model.eval()

predictions = []
true_labels = []

with torch.no_grad():
    for batch in prediction_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].tolist()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        _, predicted_labels = torch.max(logits, dim=1)

        predictions.extend(predicted_labels.tolist())
        true_labels.extend(labels)

# Calculate evaluation metrics
#accuracy = accuracy_score(true_labels, predictions)
#precision = precision_score(true_labels, predictions, average='weighted')
#recall = recall_score(true_labels, predictions, average='weighted')
#f1 = f1_score(true_labels, predictions, average='weighted')

# Display evaluation metrics
#print('Accuracy:', accuracy)
#print('Precision:', precision)
#print('Recall:', recall)
#print('F1-Score:', f1)

# Save predictions to a CSV file on Google Drive
results_df = pd.DataFrame({'Text': test_data['texts'], 'True Label': true_labels, 'Predicted Label': predictions})


In [45]:
from google.colab import drive
drive.mount('/content/gdrive')
results_df.to_csv('/content/gdrive/MyDrive/ebay-sa1.csv', index=False)


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
