In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/emotion-dataset/validation.csv
/kaggle/input/emotion-dataset/training.csv
/kaggle/input/emotion-dataset/test.csv


In [3]:
import warnings
warnings.filterwarnings("ignore")

**BERT EMBEDDINGS**

In [1]:
import torch
from transformers import BertTokenizer, BertModel
import pandas as pd

# Load pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)


df = pd.read_csv('/kaggle/input/emotion-dataset/training.csv')

# Tokenize and generate embeddings for each text in the dataset
embeddings = []
for text in df['text']:
    # Tokenize text
    encoded_input = tokenizer(text, return_tensors='pt')
    
    # Generate embeddings
    with torch.no_grad():
        output = model(**encoded_input)
    
    # Extract embeddings from BERT's output
    last_hidden_states = output.last_hidden_state
    sentence_embedding = torch.mean(last_hidden_states, dim=1).squeeze().numpy()
    embeddings.append(sentence_embedding)

# Add embeddings to the DataFrame
df['embeddings'] = embeddings

# Now df contains the original text, labels, and corresponding BERT embeddings
print(df)


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

                                                    text  label  \
0                                i didnt feel humiliated      0   
1      i can go from feeling so hopeless to so damned...      0   
2       im grabbing a minute to post i feel greedy wrong      3   
3      i am ever feeling nostalgic about the fireplac...      2   
4                                   i am feeling grouchy      3   
...                                                  ...    ...   
15995  i just had a very brief time in the beanbag an...      0   
15996  i am now turning and i feel pathetic that i am...      0   
15997                     i feel strong and good overall      1   
15998  i feel like this was such a rude comment and i...      3   
15999  i know a lot but i feel so stupid because i ca...      0   

                                              embeddings  
0      [-0.029084358, 0.28403538, -0.15271895, 0.2054...  
1      [0.06209593, 0.35217133, 0.1729311, -0.0792969...  
2      [0.44962013

**SVC on  BERT**

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Assuming df contains the embeddings and labels

# Split data into features (embeddings) and labels
X = df['embeddings'].to_list()  # Features (embeddings)
y = df['label']  # Labels

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize SVM classifier
svm = SVC(kernel='linear', C=1.0, random_state=42)

# Train the SVM classifier
svm.fit(X_train, y_train)

# Predict on the test set
y_pred = svm.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.62
Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.73      0.67       946
           1       0.68      0.76      0.72      1021
           2       0.54      0.34      0.42       296
           3       0.58      0.48      0.53       427
           4       0.59      0.49      0.53       397
           5       0.44      0.27      0.34       113

    accuracy                           0.62      3200
   macro avg       0.57      0.51      0.53      3200
weighted avg       0.62      0.62      0.61      3200



**RFC on BERT**

In [6]:
#from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
#from sklearn.metrics import accuracy_score, classification_report

# Assuming df contains the embeddings and labels

# Split data into features (embeddings) and labels
X = df['embeddings'].to_list()  # Features (embeddings)
y = df['label']  # Labels

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize Random Forest classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the Random Forest classifier
rf.fit(X_train, y_train)

# Predict on the test set
y_pred = rf.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.52
Classification Report:
              precision    recall  f1-score   support

           0       0.49      0.75      0.59       946
           1       0.54      0.87      0.67      1021
           2       0.00      0.00      0.00       296
           3       0.67      0.11      0.19       427
           4       0.50      0.05      0.09       397
           5       0.00      0.00      0.00       113

    accuracy                           0.52      3200
   macro avg       0.37      0.30      0.26      3200
weighted avg       0.47      0.52      0.42      3200



**roberta embeddings**

In [None]:
#import torch
from transformers import RobertaModel, RobertaTokenizer
#import pandas as pd

# Load pre-trained RoBERTa model and tokenizer
model_name = 'roberta-base'  # You can use different variations of RoBERTa if needed
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaModel.from_pretrained(model_name)


# Tokenize and obtain RoBERTa embeddings
def get_roberta_embeddings(text):
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)  # Mean pooling of token embeddings
    return embeddings

# Apply the function to your dataset
embeddings_list = []
for row in df['text']:
    embedding = get_roberta_embeddings(row)
    embeddings_list.append(embedding)

# Concatenate the embeddings and add them as new columns in your dataset
embeddings_tensor = torch.cat(embeddings_list)
embeddings_df = pd.DataFrame(embeddings_tensor.numpy())

# Merge the original dataframe with the embeddings dataframe
result_df = pd.concat([df, embeddings_df], axis=1)
print(result_df)


**roberta emd**

In [2]:
import torch
from transformers import RobertaModel, RobertaTokenizer
import pandas as pd

# Load pre-trained RoBERTa model and tokenizer
model_name = 'roberta-base'  # You can use different variations of RoBERTa if needed
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaModel.from_pretrained(model_name)

# Your existing DataFrame with 'text' column
# Assuming df contains the text data
# ...

# Tokenize and obtain RoBERTa embeddings
def get_roberta_embeddings(text):
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)  # Mean pooling of token embeddings
    return embeddings

# Apply the function to your dataset and store embeddings in a new column 'roberta_emb'
df['roberta_emb'] = df['text'].apply(lambda x: get_roberta_embeddings(x)[0].numpy())

# Display the DataFrame with the new 'roberta_emb' column containing embeddings
print(df)


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


                                                    text  label  \
0                                i didnt feel humiliated      0   
1      i can go from feeling so hopeless to so damned...      0   
2       im grabbing a minute to post i feel greedy wrong      3   
3      i am ever feeling nostalgic about the fireplac...      2   
4                                   i am feeling grouchy      3   
...                                                  ...    ...   
15995  i just had a very brief time in the beanbag an...      0   
15996  i am now turning and i feel pathetic that i am...      0   
15997                     i feel strong and good overall      1   
15998  i feel like this was such a rude comment and i...      3   
15999  i know a lot but i feel so stupid because i ca...      0   

                                              embeddings  \
0      [-0.029084232, 0.28403535, -0.15271899, 0.2054...   
1      [0.062095962, 0.35217127, 0.1729311, -0.079297...   
2      [0.44961

**SVC**

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Split the data into features (embeddings) and labels
X = df['roberta_emb'].tolist()  # Features (embeddings)
y = df['label']  # Labels

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize SVM classifier
svm = SVC(kernel='linear', C=1.0, random_state=42)

# Train the SVM classifier
svm.fit(X_train, y_train)

# Predict on the test set
y_pred = svm.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.66
Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.75      0.70       946
           1       0.70      0.81      0.75      1021
           2       0.62      0.34      0.44       296
           3       0.62      0.53      0.57       427
           4       0.64      0.52      0.57       397
           5       0.53      0.35      0.42       113

    accuracy                           0.66      3200
   macro avg       0.63      0.55      0.58      3200
weighted avg       0.65      0.66      0.65      3200



**RFC**

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Split the data into features (embeddings) and labels
X = df['roberta_emb'].tolist()  # Features (embeddings)
y = df['label']  # Labels

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize Random Forest classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the Random Forest classifier
rf.fit(X_train, y_train)

# Predict on the test set
y_pred = rf.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.51
Classification Report:
              precision    recall  f1-score   support

           0       0.48      0.74      0.58       946
           1       0.52      0.86      0.65      1021
           2       0.00      0.00      0.00       296
           3       0.68      0.07      0.12       427
           4       0.85      0.06      0.10       397
           5       0.00      0.00      0.00       113

    accuracy                           0.51      3200
   macro avg       0.42      0.29      0.24      3200
weighted avg       0.50      0.51      0.41      3200



**ANN on roberta**

In [11]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

# Convert labels to numerical values using LabelEncoder
label_encoder = LabelEncoder()
df['encoded_labels'] = label_encoder.fit_transform(df['label'])

# Split the data into features (embeddings) and encoded labels
X = df['roberta_emb'].tolist()  # Features (embeddings)
y = df['encoded_labels']  # Encoded Labels

# Convert to PyTorch tensors
X = torch.tensor(X)
y = torch.tensor(y)

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the neural network architecture
class ANN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(ANN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return out

# Set input, hidden, and output sizes
input_size = len(X[0])
hidden_size = 128  # Adjust the hidden layer size as needed
output_size = len(label_encoder.classes_)

# Initialize the neural network model
model = ANN(input_size, hidden_size, output_size)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)  # Adjust learning rate as needed

# Training the model
num_epochs = 300  # Adjust the number of epochs as needed
for epoch in range(num_epochs):
    optimizer.zero_grad()
    outputs = model(X_train.float())
    loss = criterion(outputs, y_train)
    loss.backward()
    optimizer.step()
    if (epoch+1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Evaluation on test data
with torch.no_grad():
    model.eval()
    outputs = model(X_test.float())
    _, predicted = torch.max(outputs, 1)
    accuracy = accuracy_score(y_test, predicted)
    print(f'Accuracy: {accuracy:.2f}')
    print('Classification Report:')
    print(classification_report(y_test, predicted))


Epoch [10/300], Loss: 1.5469
Epoch [20/300], Loss: 1.4947
Epoch [30/300], Loss: 1.4298
Epoch [40/300], Loss: 1.3619
Epoch [50/300], Loss: 1.2945
Epoch [60/300], Loss: 1.2341
Epoch [70/300], Loss: 1.1827
Epoch [80/300], Loss: 1.1385
Epoch [90/300], Loss: 1.1000
Epoch [100/300], Loss: 1.0663
Epoch [110/300], Loss: 1.0368
Epoch [120/300], Loss: 1.0108
Epoch [130/300], Loss: 0.9879
Epoch [140/300], Loss: 0.9675
Epoch [150/300], Loss: 0.9492
Epoch [160/300], Loss: 0.9328
Epoch [170/300], Loss: 0.9180
Epoch [180/300], Loss: 0.9048
Epoch [190/300], Loss: 0.8928
Epoch [200/300], Loss: 0.8820
Epoch [210/300], Loss: 0.8721
Epoch [220/300], Loss: 0.8629
Epoch [230/300], Loss: 0.8544
Epoch [240/300], Loss: 0.8464
Epoch [250/300], Loss: 0.8389
Epoch [260/300], Loss: 0.8317
Epoch [270/300], Loss: 0.8249
Epoch [280/300], Loss: 0.8184
Epoch [290/300], Loss: 0.8121
Epoch [300/300], Loss: 0.8060
Accuracy: 0.65
Classification Report:
              precision    recall  f1-score   support

           0    

**ANN on bert**

In [12]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

# Convert labels to numerical values using LabelEncoder
label_encoder = LabelEncoder()
df['encoded_labels'] = label_encoder.fit_transform(df['label'])

# Split the data into features (embeddings) and encoded labels
X = df['embeddings'].tolist()  # Features (embeddings)
y = df['encoded_labels']  # Encoded Labels

# Convert to PyTorch tensors
X = torch.tensor(X)
y = torch.tensor(y)

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the neural network architecture
class ANN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(ANN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return out

# Set input, hidden, and output sizes
input_size = len(X[0])
hidden_size = 128  # Adjust the hidden layer size as needed
output_size = len(label_encoder.classes_)

# Initialize the neural network model
model = ANN(input_size, hidden_size, output_size)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)  # Adjust learning rate as needed

# Training the model
num_epochs = 300  # Adjust the number of epochs as needed
for epoch in range(num_epochs):
    optimizer.zero_grad()
    outputs = model(X_train.float())
    loss = criterion(outputs, y_train)
    loss.backward()
    optimizer.step()
    if (epoch+1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Evaluation on test data
with torch.no_grad():
    model.eval()
    outputs = model(X_test.float())
    _, predicted = torch.max(outputs, 1)
    accuracy = accuracy_score(y_test, predicted)
    print(f'Accuracy: {accuracy:.2f}')
    print('Classification Report:')
    print(classification_report(y_test, predicted))

Epoch [10/300], Loss: 1.4214
Epoch [20/300], Loss: 1.2776
Epoch [30/300], Loss: 1.1884
Epoch [40/300], Loss: 1.1268
Epoch [50/300], Loss: 1.0794
Epoch [60/300], Loss: 1.0416
Epoch [70/300], Loss: 1.0099
Epoch [80/300], Loss: 0.9825
Epoch [90/300], Loss: 0.9588
Epoch [100/300], Loss: 0.9383
Epoch [110/300], Loss: 0.9201
Epoch [120/300], Loss: 0.9038
Epoch [130/300], Loss: 0.8887
Epoch [140/300], Loss: 0.8746
Epoch [150/300], Loss: 0.8613
Epoch [160/300], Loss: 0.8486
Epoch [170/300], Loss: 0.8366
Epoch [180/300], Loss: 0.8250
Epoch [190/300], Loss: 0.8139
Epoch [200/300], Loss: 0.8030
Epoch [210/300], Loss: 0.7926
Epoch [220/300], Loss: 0.7826
Epoch [230/300], Loss: 0.7727
Epoch [240/300], Loss: 0.7631
Epoch [250/300], Loss: 0.7534
Epoch [260/300], Loss: 0.7436
Epoch [270/300], Loss: 0.7340
Epoch [280/300], Loss: 0.7248
Epoch [290/300], Loss: 0.7156
Epoch [300/300], Loss: 0.7065
Accuracy: 0.64
Classification Report:
              precision    recall  f1-score   support

           0    

**classification using bert**

In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

df=pd.read_csv('/kaggle/input/emotion-dataset/training.csv')

# Load pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=6)  # num_classes is the number of unique classes in your dataset

# Assuming df contains the BERT embeddings and labels
X = df['text'].tolist()  # Text data
y = df['label']  # Target


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tokenize text after splitting the data
tokenized_train = tokenizer(X_train, padding=True, truncation=True, return_tensors='pt')
tokenized_test = tokenizer(X_test, padding=True, truncation=True, return_tensors='pt')

# Convert y_train to tensor
y_train = torch.tensor(y_train.values.astype(np.int64))  # Convert y_train to a tensor


# Create data loaders
train_data = torch.utils.data.TensorDataset(tokenized_train['input_ids'], tokenized_train['attention_mask'], torch.tensor(y_train))
train_loader = torch.utils.data.DataLoader(train_data, batch_size=20)

# Define training parameters
#batch_size = 16
epochs = 3
learning_rate = 2e-5



# Set optimizer and loss function
optimizer = AdamW(model.parameters(), lr=learning_rate)
loss_fn = torch.nn.CrossEntropyLoss()

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids, attention_mask, labels = batch
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    
    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{epochs} - Average Training Loss: {avg_train_loss:.4f}")

# Evaluate the model
model.eval()
test_input_ids = X_test['input_ids']
test_attention_mask = X_test['attention_mask']
with torch.no_grad():
    outputs = model(test_input_ids, attention_mask=test_attention_mask)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=1)

# Calculate accuracy
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy on test set: {accuracy * 100:.2f}%")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  train_data = torch.utils.data.TensorDataset(tokenized_train['input_ids'], tokenized_train['attention_mask'], torch.tensor(y_train))


Epoch 1/3 - Average Training Loss: 0.5725


**distilbert**

In [2]:
import pandas as pd

# Load dataset
df = pd.read_csv('/kaggle/input/emotion-dataset/training.csv')  

# Display the first few rows to check the data

from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizerFast

# Split the data into training and validation sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'], df['label'], test_size=0.2, random_state=42
)

# Initialize DistilBERT tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Tokenize the text data
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True)
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True)
from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform labels for training data
train_labels_encoded = label_encoder.fit_transform(train_labels)

# Transform labels for validation data
test_labels_encoded = label_encoder.transform(test_labels)

import torch
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Create PyTorch Datasets
train_dataset = CustomDataset(train_encodings, train_labels_encoded)
test_dataset = CustomDataset(test_encodings, test_labels_encoded)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

from transformers import DistilBertForSequenceClassification, AdamW
from tqdm import tqdm

# Initialize the model
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=6)

# Define optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training loop
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.train()

for epoch in range(3):  # Set your desired number of epochs
    for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}"):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Evaluation
model.eval()
test_preds = []
test_true = []

for batch in tqdm(test_loader):
    with torch.no_grad():
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1)
        test_preds.extend(preds.cpu().detach().numpy())
        test_true.extend(labels.cpu().detach().numpy())

# Calculate accuracy
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(test_true, test_preds)
print(f"test Accuracy: {accuracy}")


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 800/800 [01:38<00:00,  8.15it/s]
Epoch 2: 100%|██████████| 800/800 [01:37<00:00,  8.18it/s]
Epoch 3: 100%|██████████| 800/800 [01:37<00:00,  8.20it/s]
100%|██████████| 200/200 [00:06<00:00, 29.55it/s]

test Accuracy: 0.9290625





In [4]:
import pickle

In [10]:
pickle.dump(model,open('/kaggle/working/modelsaved','wb'))

In [6]:
import joblib

In [8]:
joblib.dump(model,'savedmodel')

['savedmodel']