In [None]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import numpy as np

# Load the dataset from JSON file
with open('/content/primate_dataset.json', 'r') as f:
    data = json.load(f)

# Convert JSON data to DataFrame
df = pd.DataFrame(data)

# Split the data into features (X) and target (y)
X = df['post_text']
y = df['annotations'].apply(lambda x: x[0][1])  # Extracting the label from annotations

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize the text data
vectorizer = TfidfVectorizer(max_features=1000)
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

# Initialize and train the model
model = LogisticRegression(max_iter=1000)
model.fit(X_train_vect, y_train)

# Make predictions
y_pred = model.predict(X_test_vect)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
y_pred_proba = model.predict_proba(X_test_vect)[:, 1]
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Print evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("AUC-ROC Score:", roc_auc)

# Save results to result.txt file
results = []
for index, text in enumerate(X_test):
    results.append([text, y_pred[index]])

np.savetxt('result.txt', results, fmt='%s')


Accuracy: 0.8428927680798005
Precision: 0.7104682184812284
Recall: 0.8428927680798005
F1 Score: 0.7710358744545942
AUC-ROC Score: 0.7007607776838546


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import numpy as np

# Load the dataset from JSON file
with open('/content/primate_dataset.json', 'r') as f:
    data = json.load(f)

# Convert JSON data to DataFrame
df = pd.DataFrame(data)

# Split the data into features (X) and target (y)
X = df['post_text']
y = df['annotations'].apply(lambda x: x[0][1])  # Extracting the label from annotations

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize the text data
vectorizer = TfidfVectorizer(max_features=1000)
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

# Initialize and train the model (Random Forest Classifier)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_vect, y_train)

# Make predictions
y_pred = model.predict(X_test_vect)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
y_pred_proba = model.predict_proba(X_test_vect)[:, 1]
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Print evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("AUC-ROC Score:", roc_auc)

# Save results to result.txt file
results = []
for index, text in enumerate(X_test):
    results.append([text, y_pred[index]])

np.savetxt('result.txt', results, fmt='%s')


Accuracy: 0.8428927680798005
Precision: 0.7104682184812284
Recall: 0.8428927680798005
F1 Score: 0.7710358744545942
AUC-ROC Score: 0.6052878745186437


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import numpy as np

# Load the dataset from JSON file
with open('/content/primate_dataset.json', 'r') as f:
    data = json.load(f)

# Convert JSON data to DataFrame
df = pd.DataFrame(data)

# Split the data into features (X) and target (y)
X = df['post_text']
y = df['annotations'].apply(lambda x: x[0][1])  # Extracting the label from annotations

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize the text data
vectorizer = TfidfVectorizer(max_features=1000)
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

# Initialize and train the model (Support Vector Machine Classifier)
model = SVC(kernel='linear', probability=True, random_state=42)
model.fit(X_train_vect, y_train)

# Make predictions
y_pred = model.predict(X_test_vect)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
y_pred_proba = model.predict_proba(X_test_vect)[:, 1]
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Print evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("AUC-ROC Score:", roc_auc)

# Save results to result.txt file
results = []
for index, text in enumerate(X_test):
    results.append([text, y_pred[index]])

np.savetxt('result.txt', results, fmt='%s')


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.8428927680798005
Precision: 0.7104682184812284
Recall: 0.8428927680798005
F1 Score: 0.7710358744545942
AUC-ROC Score: 0.6238846623462009


In [None]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import numpy as np

# Load the dataset from JSON file
with open('/content/primate_dataset.json', 'r') as f:
    data = json.load(f)

# Convert JSON data to DataFrame
df = pd.DataFrame(data)

# Split the data into features (X) and target (y)
X = df['post_text']
y = df['annotations'].apply(lambda x: x[0][1])  # Extracting the label from annotations

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize the text data
vectorizer = TfidfVectorizer(max_features=1000)
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

# Initialize and train the model (Multinomial Naive Bayes Classifier)
model = MultinomialNB()
model.fit(X_train_vect, y_train)

# Make predictions
y_pred = model.predict(X_test_vect)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Print evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

# Save results to result.txt file
results = []
for index, text in enumerate(X_test):
    results.append([text, y_pred[index]])

np.savetxt('result.txt', results, fmt='%s')


Accuracy: 0.8428927680798005
Precision: 0.7104682184812284
Recall: 0.8428927680798005
F1 Score: 0.7710358744545942


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import numpy as np

# Load the dataset from JSON file
with open('/content/primate_dataset.json', 'r') as f:
    data = json.load(f)

# Convert JSON data to DataFrame
df = pd.DataFrame(data)

# Split the data into features (X) and target (y)
X = df['post_text']
y = df['annotations'].apply(lambda x: x[0][1])  # Extracting the label from annotations

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize the text data
vectorizer = TfidfVectorizer(max_features=1000)
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

# Initialize and train the model (Gradient Boosting Classifier)
model = GradientBoostingClassifier(n_estimators=100, random_state=42)
model.fit(X_train_vect, y_train)

# Make predictions
y_pred = model.predict(X_test_vect)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
y_pred_proba = model.predict_proba(X_test_vect)[:, 1]
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Print evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("AUC-ROC Score:", roc_auc)

# Save results to result.txt file
results = []
for index, text in enumerate(X_test):
    results.append([text, y_pred[index]])

np.savetxt('result.txt', results, fmt='%s')


Accuracy: 0.8379052369077307
Precision: 0.7650935951261087
Recall: 0.8379052369077307
F1 Score: 0.7772555088345333
AUC-ROC Score: 0.6270310885695501


In [None]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import numpy as np

# Load the dataset from JSON file
with open('/content/primate_dataset.json', 'r') as f:
    data = json.load(f)

# Convert JSON data to DataFrame
df = pd.DataFrame(data)

# Split the data into features (X) and target (y)
X = df['post_text']
y = df['annotations'].apply(lambda x: x[0][1])  # Extracting the label from annotations

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize the text data
vectorizer = TfidfVectorizer(max_features=1000)
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

# Initialize and train the model (K-Nearest Neighbors Classifier)
model = KNeighborsClassifier(n_neighbors=5)
model.fit(X_train_vect, y_train)

# Make predictions
y_pred = model.predict(X_test_vect)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Print evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

# Save results to result.txt file
results = []
for index, text in enumerate(X_test):
    results.append([text, y_pred[index]])

np.savetxt('result.txt', results, fmt='%s')


Accuracy: 0.8428927680798005
Precision: 0.7919338932266311
Recall: 0.8428927680798005
F1 Score: 0.7800243527641448


In [None]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding, SpatialDropout1D
from keras.callbacks import EarlyStopping
import numpy as np

# Load the dataset from JSON file
with open('/content/primate_dataset.json', 'r') as f:
    data = json.load(f)

# Convert JSON data to DataFrame
df = pd.DataFrame(data)

# Split the data into features (X) and target (y)
X = df['post_text']
y = df['annotations'].apply(lambda x: x[0][1])  # Extracting the label from annotations

# Encode the target labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tokenize the text data
max_words = 1000
tokenizer = Tokenizer(num_words=max_words, lower=True)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad sequences
max_len = 200
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

# Initialize the model (Recurrent Neural Network with LSTM)
model = Sequential()
model.add(Embedding(max_words, 100, input_length=max_len))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Define early stopping to prevent overfitting
early_stop = EarlyStopping(monitor='val_loss', patience=3, verbose=1, restore_best_weights=True)

# Train the model
batch_size = 32
epochs = 10
model.fit(X_train_pad, y_train, validation_data=(X_test_pad, y_test), epochs=epochs, batch_size=batch_size, callbacks=[early_stop])

# Make predictions
y_pred_proba = model.predict(X_test_pad)
y_pred = (y_pred_proba > 0.5).astype(int)

# Convert predictions back to original labels
y_pred_labels = label_encoder.inverse_transform(y_pred.flatten())

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Print evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("AUC-ROC Score:", roc_auc)

# Save results to result.txt file
results = []
for index, text in enumerate(X_test):
    results.append([text, y_pred_labels[index]])

np.savetxt('result.txt', results, fmt='%s')


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 7: early stopping
Accuracy: 0.8428927680798005
Precision: 0.7104682184812284
Recall: 0.8428927680798005
F1 Score: 0.7710358744545942
AUC-ROC Score: 0.6446416831032216


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import numpy as np

# Load the dataset from JSON file
with open('/content/primate_dataset.json', 'r') as f:
    data = json.load(f)

# Convert JSON data to DataFrame
df = pd.DataFrame(data)

# Split the data into features (X) and target (y)
X = df['post_text']
y = df['annotations'].apply(lambda x: x[0][1])  # Extracting the label from annotations

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize the text data
vectorizer = TfidfVectorizer(max_features=1000)
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

# Initialize the model
rf_model = RandomForestClassifier(random_state=42)

# Set hyperparameters for tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Hyperparameter tuning using GridSearchCV
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train_vect, y_train)

# Get the best model from GridSearchCV
best_rf_model = grid_search.best_estimator_

# Make predictions
y_pred = best_rf_model.predict(X_test_vect)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
y_pred_proba = best_rf_model.predict_proba(X_test_vect)[:, 1]
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Print evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("AUC-ROC Score:", roc_auc)

# Save results to result.txt file
results = []
for index, text in enumerate(X_test):
    results.append([text, y_pred[index]])

np.savetxt('result.txt', results, fmt='%s')


Fitting 3 folds for each of 108 candidates, totalling 324 fits
Accuracy: 0.8428927680798005
Precision: 0.7104682184812284
Recall: 0.8428927680798005
F1 Score: 0.7710358744545942
AUC-ROC Score: 0.6052878745186437


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# Load the dataset from JSON file
with open('/content/primate_dataset.json', 'r') as f:
    data = json.load(f)

# Convert JSON data to DataFrame
df = pd.DataFrame(data)

# Split the data into features (X) and target (y)
X = df['post_text']
y = df['annotations'].apply(lambda x: x[0][1])  # Extracting the label from annotations

# Convert labels to numerical values
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tokenize the text data
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad sequences to ensure uniform length for input to the neural network
max_len = 200  # Max length of the sequence
X_train_padded = pad_sequences(X_train_seq, maxlen=max_len, padding='post', truncating='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_len, padding='post', truncating='post')

# Build the LSTM model
model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=128, input_length=max_len))
model.add(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(units=1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Define early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model
model.fit(X_train_padded, y_train, validation_data=(X_test_padded, y_test), epochs=10, batch_size=64, callbacks=[early_stopping])

# Make predictions
y_pred_prob = model.predict(X_test_padded)
y_pred = (y_pred_prob > 0.5).astype(int)

# Convert predictions back to original labels
y_pred_labels = label_encoder.inverse_transform(y_pred.flatten())

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
roc_auc = roc_auc_score(y_test, y_pred_prob)

# Print evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC Score:", roc_auc)

# Save results to result.txt file
results = []
for index, text in enumerate(X_test):
    results.append([text, y_pred_labels[index]])

np.savetxt('result.txt', results, fmt='%s')


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Accuracy: 0.8428927680798005
Precision: 0.7104682184812284
Recall: 0.8428927680798005
F1 Score: 0.7710358744545942
ROC AUC Score: 0.632079459002536


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
pip install transformers



In [None]:
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tqdm import tqdm

# Load the dataset from JSON file
with open('/content/primate_dataset.json', 'r') as f:
    data = json.load(f)

# Convert JSON data to DataFrame
df = pd.DataFrame(data)

# Split the data into features (X) and target (y)
X = df['post_text']
y = df['annotations'].apply(lambda x: x[0][1])  # Extracting the label from annotations

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Load the pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the text data and convert to input IDs
X_train_encoded = tokenizer(X_train.tolist(), padding=True, truncation=True, return_tensors='pt')
X_test_encoded = tokenizer(X_test.tolist(), padding=True, truncation=True, return_tensors='pt')

# Encode labels
label_dict = {'yes': 1, 'no': 0}
y_train_encoded = torch.tensor([label_dict[label] for label in y_train])
y_test_encoded = torch.tensor([label_dict[label] for label in y_test])

# Create DataLoader for training and testing data
train_data = TensorDataset(X_train_encoded['input_ids'], X_train_encoded['attention_mask'], y_train_encoded)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=8)

test_data = TensorDataset(X_test_encoded['input_ids'], X_test_encoded['attention_mask'], y_test_encoded)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=8)

# Load pre-trained BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Set optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=2e-5)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)

# Train the model
model.train()
for epoch in range(3):  # Number of epochs
    total_loss = 0
    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch+1}", unit="batch"):
        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    scheduler.step()
    print(f"Average training loss: {total_loss/len(train_dataloader)}")

# Evaluate the model
model.eval()
y_pred = []
y_true = []
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Evaluating", unit="batch"):
        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)
        y_pred.extend(preds.cpu().numpy())
        y_true.extend(labels.cpu().numpy())

# Convert predictions to original labels
id2label = {v: k for k, v in label_dict.items()}
y_pred_labels = [id2label[pred] for pred in y_pred]

# Calculate evaluation metrics
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average='weighted')
recall = recall_score(y_true, y_pred, average='weighted')
f1 = f1_score(y_true, y_pred, average='weighted')
roc_auc = roc_auc_score(y_true, y_pred)

# Print evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC Score:", roc_auc)

# Save results to result.txt file
results = []
for index, text in enumerate(X_test):
    results.append([text, y_pred_labels[index]])

np.savetxt('result.txt', results, fmt='%s')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 201/201 [02:24<00:00,  1.39batch/s]


Average training loss: 0.45209690813549713


Epoch 2: 100%|██████████| 201/201 [02:30<00:00,  1.34batch/s]


Average training loss: 0.42566787744339424


Epoch 3: 100%|██████████| 201/201 [02:30<00:00,  1.33batch/s]


Average training loss: 0.4108085306500321


Evaluating: 100%|██████████| 51/51 [00:13<00:00,  3.84batch/s]

Accuracy: 0.8428927680798005
Precision: 0.7104682184812284
Recall: 0.8428927680798005
F1 Score: 0.7710358744545942
ROC AUC Score: 0.5



  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tqdm import tqdm
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load the dataset from JSON file
with open('/content/primate_dataset.json', 'r') as f:
    data = json.load(f)

# Convert JSON data to DataFrame
df = pd.DataFrame(data)

# Split the data into features (X) and target (y)
X = df['post_text']
y = df['annotations'].apply(lambda x: x[0][1])  # Extracting the label from annotations

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Load the pre-trained RoBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Tokenize the text data and convert to input IDs
X_train_encoded = tokenizer(X_train.tolist(), padding=True, truncation=True, return_tensors='pt')
X_test_encoded = tokenizer(X_test.tolist(), padding=True, truncation=True, return_tensors='pt')

# Encode labels
label_dict = {'yes': 1, 'no': 0}
y_train_encoded = torch.tensor([label_dict[label] for label in y_train])
y_test_encoded = torch.tensor([label_dict[label] for label in y_test])

# Create DataLoader for training and testing data
train_data = TensorDataset(X_train_encoded['input_ids'], X_train_encoded['attention_mask'], y_train_encoded)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=8)

test_data = TensorDataset(X_test_encoded['input_ids'], X_test_encoded['attention_mask'], y_test_encoded)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=8)

# Load pre-trained RoBERTa model for sequence classification
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Set optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=2e-5)

# Training parameters
num_epochs = 3
best_val_loss = float('inf')
patience = 0

# Training loop
for epoch in range(num_epochs):
    logger.info(f"Epoch {epoch + 1}/{num_epochs}")
    model.train()
    total_train_loss = 0

    for batch in tqdm(train_dataloader, desc="Training"):
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = outputs.loss
        total_train_loss += loss.item()
        loss.backward()
        optimizer.step()

    avg_train_loss = total_train_loss / len(train_dataloader)
    logger.info(f"Average training loss: {avg_train_loss}")

    # Evaluation
    model.eval()
    total_eval_loss = 0
    all_preds = []
    all_labels = []

    for batch in tqdm(test_dataloader, desc="Evaluating"):
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
        with torch.no_grad():
            outputs = model(**inputs)
        loss = outputs.loss
        logits = outputs.logits
        total_eval_loss += loss.item()
        preds = torch.argmax(logits, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(inputs['labels'].cpu().numpy())

    avg_eval_loss = total_eval_loss / len(test_dataloader)
    logger.info(f"Average evaluation loss: {avg_eval_loss}")

    # Calculate evaluation metrics
    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, average='weighted')
    recall = recall_score(all_labels, all_preds, average='weighted')
    f1 = f1_score(all_labels, all_preds, average='weighted')
    roc_auc = roc_auc_score(all_labels, all_preds)

    logger.info(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1 Score: {f1}, ROC AUC: {roc_auc}")

    # Early stopping based on validation loss
    if avg_eval_loss < best_val_loss:
        best_val_loss = avg_eval_loss
        torch.save(model.state_dict(), 'best_model.pt')
        patience = 0
    else:
        patience += 1
        if patience >= 3:
            logger.info("Early stopping...")
            break

# Load the best model
model.load_state_dict(torch.load('best_model.pt'))

# Final evaluation on test set
model.eval()
all_preds = []
all_labels = []

for batch in tqdm(test_dataloader, desc="Testing"):
    batch = tuple(t.to(device) for t in batch)
    inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    preds = torch.argmax(logits, dim=1)
    all_preds.extend(preds.cpu().numpy())
    all_labels.extend(inputs['labels'].cpu().numpy())

# Calculate evaluation metrics
accuracy = accuracy_score(all_labels, all_preds)
precision = precision_score(all_labels, all_preds, average='weighted')
recall = recall_score(all_labels, all_preds, average='weighted')
f1 = f1_score(all_labels, all_preds, average='weighted')
roc_auc = roc_auc_score(all_labels, all_preds)

logger.info("Final Evaluation:")
logger.info(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1 Score: {f1}, ROC AUC: {roc_auc}")

# Classification report
logger.info("Classification Report:")
logger.info(classification_report(all_labels, all_preds, target_names=label_dict.keys()))


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training: 100%|██████████| 201/201 [02:26<00:00,  1.37it/s]
Evaluating: 100%|██████████| 51/51 [00:12<00:00,  4.07it/s]
  _warn_prf(average, modifier, msg_start, len(result))
Training: 100%|██████████| 201/201 [02:31<00:00,  1.32it/s]
Evaluating: 100%|██████████| 51/51 [00:12<00:00,  4.11it/s]
  _warn_prf(average, modifier, msg_start, len(result))
Training: 100%|██████████| 201/201 [02:32<00:00,  1.32it/s]
Evaluating: 100%|██████████| 51/51 [00:12<00:00,  4.12it/s]
  _warn_prf(average, modifier, msg_start, len(result))
Testing: 100%|██████████| 51/51 [00:12<00:00,  4.14it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  

In [None]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tqdm import tqdm
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load the dataset from JSON file
with open('/content/primate_dataset.json', 'r') as f:
    data = json.load(f)

# Convert JSON data to DataFrame
df = pd.DataFrame(data)

# Split the data into features (X) and target (y)
X = df['post_text']
y = df['annotations'].apply(lambda x: x[0][1])  # Extracting the label from annotations

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Load the pre-trained RoBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Tokenize the text data and convert to input IDs
X_train_encoded = tokenizer(X_train.tolist(), padding=True, truncation=True, return_tensors='pt')
X_test_encoded = tokenizer(X_test.tolist(), padding=True, truncation=True, return_tensors='pt')

# Encode labels
label_dict = {'yes': 1, 'no': 0}
y_train_encoded = torch.tensor([label_dict[label] for label in y_train])
y_test_encoded = torch.tensor([label_dict[label] for label in y_test])

# Create DataLoader for training and testing data
train_data = TensorDataset(X_train_encoded['input_ids'], X_train_encoded['attention_mask'], y_train_encoded)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=8)

test_data = TensorDataset(X_test_encoded['input_ids'], X_test_encoded['attention_mask'], y_test_encoded)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=8)

# Load pre-trained RoBERTa model for sequence classification
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Set optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=2e-5)

# Training parameters
num_epochs = 3
best_val_loss = float('inf')
patience = 0

# Training loop
for epoch in range(num_epochs):
    logger.info(f"Epoch {epoch + 1}/{num_epochs}")
    model.train()
    total_train_loss = 0

    for batch in tqdm(train_dataloader, desc="Training"):
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = outputs.loss
        total_train_loss += loss.item()
        loss.backward()
        optimizer.step()

    avg_train_loss = total_train_loss / len(train_dataloader)
    logger.info(f"Average training loss: {avg_train_loss}")

    # Evaluation
    model.eval()
    total_eval_loss = 0
    all_preds = []
    all_labels = []

    for batch in tqdm(test_dataloader, desc="Evaluating"):
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
        with torch.no_grad():
            outputs = model(**inputs)
        loss = outputs.loss
        logits = outputs.logits
        total_eval_loss += loss.item()
        preds = torch.argmax(logits, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(inputs['labels'].cpu().numpy())

    avg_eval_loss = total_eval_loss / len(test_dataloader)
    logger.info(f"Average evaluation loss: {avg_eval_loss}")

    # Calculate evaluation metrics
    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, average='weighted')
    recall = recall_score(all_labels, all_preds, average='weighted')
    f1 = f1_score(all_labels, all_preds, average='weighted')
    roc_auc = roc_auc_score(all_labels, all_preds)

    logger.info(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1 Score: {f1}, ROC AUC: {roc_auc}")

    # Early stopping based on validation loss
    if avg_eval_loss < best_val_loss:
        best_val_loss = avg_eval_loss
        torch.save(model.state_dict(), 'best_model.pt')
        patience = 0
    else:
        patience += 1
        if patience >= 3:
            logger.info("Early stopping...")
            break

# Load the best model
model.load_state_dict(torch.load('best_model.pt'))

# Final evaluation on test set
model.eval()
all_preds = []
all_labels = []

for batch in tqdm(test_dataloader, desc="Testing"):
    batch = tuple(t.to(device) for t in batch)
    inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    preds = torch.argmax(logits, dim=1)
    all_preds.extend(preds.cpu().numpy())
    all_labels.extend(inputs['labels'].cpu().numpy())

# Calculate evaluation metrics
accuracy = accuracy_score(all_labels, all_preds)
precision = precision_score(all_labels, all_preds, average='weighted')
recall = recall_score(all_labels, all_preds, average='weighted')
f1 = f1_score(all_labels, all_preds, average='weighted')
roc_auc = roc_auc_score(all_labels, all_preds)

logger.info("Final Evaluation:")
logger.info(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1 Score: {f1}, ROC AUC: {roc_auc}")

# Classification report
logger.info("Classification Report:")
logger.info(classification_report(all_labels, all_preds, target_names=label_dict.keys()))


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training: 100%|██████████| 201/201 [02:25<00:00,  1.38it/s]
Evaluating: 100%|██████████| 51/51 [00:12<00:00,  4.12it/s]
  _warn_prf(average, modifier, msg_start, len(result))
Training: 100%|██████████| 201/201 [02:31<00:00,  1.33it/s]
Evaluating: 100%|██████████| 51/51 [00:12<00:00,  4.12it/s]
  _warn_prf(average, modifier, msg_start, len(result))
Training: 100%|██████████| 201/201 [02:31<00:00,  1.33it/s]
Evaluating: 100%|██████████| 51/51 [00:12<00:00,  4.11it/s]
Testing: 100%|██████████| 51/51 [00:12<00:00,  4.12it/s]


In [None]:
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, AdamW
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# Define a custom dataset class
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        encoding = self.tokenizer(text, truncation=True, max_length=self.max_length, padding="max_length", return_tensors='pt')
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Load the dataset from JSON file
with open('/content/primate_dataset.json', 'r') as f:
    data = json.load(f)

# Convert JSON data to DataFrame
df = pd.DataFrame(data)

# Split the data into features (X) and target (y)
X = df['post_text']
y = df['annotations'].apply(lambda x: 1 if x[0][1] == 'yes' else 0)  # Convert labels to binary

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Load the pre-trained GPT2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2ForSequenceClassification.from_pretrained('gpt2', num_labels=2)

# Define dataset and dataloader for training and testing
train_dataset = CustomDataset(X_train.values, y_train.values, tokenizer, max_length=512)
test_dataset = CustomDataset(X_test.values, y_test.values, tokenizer, max_length=512)

train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=4, shuffle=False)

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Set optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Train the model
model.train()
for epoch in range(3):  # Number of epochs
    total_loss = 0
    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch+1}", unit="batch"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    print(f"Average training loss: {total_loss/len(train_dataloader)}")

# Evaluate the model
model.eval()
y_true = []
y_pred = []
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Evaluating", unit="batch"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels']
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1).cpu().numpy()
        y_pred.extend(preds)
        y_true.extend(labels.cpu().numpy())

# Calculate evaluation metrics
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
roc_auc = roc_auc_score(y_true, y_pred)

# Print evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC Score:", roc_auc)


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1:   0%|          | 0/401 [00:00<?, ?batch/s]


ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.

In [None]:
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, AdamW
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# Define a custom dataset class
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        encoding = self.tokenizer(text, truncation=True, max_length=self.max_length, padding="max_length", return_tensors='pt')
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Load the dataset from JSON file
with open('/content/primate_dataset.json', 'r') as f:
    data = json.load(f)

# Convert JSON data to DataFrame
df = pd.DataFrame(data)

# Split the data into features (X) and target (y)
X = df['post_text']
y = df['annotations'].apply(lambda x: 1 if x[0][1] == 'yes' else 0)  # Convert labels to binary

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Load the pre-trained GPT2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model = GPT2ForSequenceClassification.from_pretrained('gpt2', num_labels=2)

# Define dataset and dataloader for training and testing
train_dataset = CustomDataset(X_train.values, y_train.values, tokenizer, max_length=512)
test_dataset = CustomDataset(X_test.values, y_test.values, tokenizer, max_length=512)

train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=4, shuffle=False)

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Set optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Train the model
model.train()
for epoch in range(3):  # Number of epochs
    total_loss = 0
    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch+1}", unit="batch"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    print(f"Average training loss: {total_loss/len(train_dataloader)}")

# Evaluate the model
model.eval()
y_true = []
y_pred = []
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Evaluating", unit="batch"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels']
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1).cpu().numpy()
        y_pred.extend(preds)
        y_true.extend(labels.cpu().numpy())

# Calculate evaluation metrics
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
roc_auc = roc_auc_score(y_true, y_pred)

# Print evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC Score:", roc_auc)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1:   0%|          | 0/401 [00:00<?, ?batch/s]


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, AdamW
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# Define a custom dataset class
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        encoding = self.tokenizer(text, truncation=True, max_length=self.max_length, padding="max_length", return_tensors='pt')
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Load the dataset from JSON file
with open('/content/primate_dataset.json', 'r') as f:
    data = json.load(f)

# Convert JSON data to DataFrame
df = pd.DataFrame(data)

# Split the data into features (X) and target (y)
X = df['post_text']
y = df['annotations'].apply(lambda x: 1 if x[0][1] == 'yes' else 0)  # Convert labels to binary

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Load the pre-trained GPT2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model = GPT2ForSequenceClassification.from_pretrained('gpt2', num_labels=2)

# Define dataset and dataloader for training and testing
train_dataset = CustomDataset(X_train.values, y_train.values, tokenizer, max_length=512)
test_dataset = CustomDataset(X_test.values, y_test.values, tokenizer, max_length=512)

train_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True)  # Reduced batch size
test_dataloader = DataLoader(test_dataset, batch_size=2, shuffle=False)

# Set device
device = torch.device("cpu")  # Run on CPU

# Set optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Train the model
model.train()
for epoch in range(3):  # Number of epochs
    total_loss = 0
    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch+1}", unit="batch"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    print(f"Average training loss: {total_loss/len(train_dataloader)}")

# Evaluate the model
model.eval()
y_true = []
y_pred = []
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Evaluating", unit="batch"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels']
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1).cpu().numpy()
        y_pred.extend(preds)
        y_true.extend(labels.cpu().numpy())

# Calculate evaluation metrics
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
roc_auc = roc_auc_score(y_true, y_pred)

# Print evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC Score:", roc_auc)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1:   0%|          | 0/801 [00:07<?, ?batch/s]


AssertionError: Cannot handle batch sizes > 1 if no padding token is defined.

In [None]:
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, AdamW
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# Define a custom dataset class
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        encoding = self.tokenizer(text, truncation=True, max_length=self.max_length, padding="max_length", return_tensors='pt')
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Load the dataset from JSON file
with open('/content/primate_dataset.json', 'r') as f:
    data = json.load(f)

# Convert JSON data to DataFrame
df = pd.DataFrame(data)

# Split the data into features (X) and target (y)
X = df['post_text']
y = df['annotations'].apply(lambda x: 1 if x[0][1] == 'yes' else 0)  # Convert labels to binary

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Load the pre-trained GPT2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model = GPT2ForSequenceClassification.from_pretrained('gpt2', num_labels=2)

# Define dataset and dataloader for training and testing
train_dataset = CustomDataset(X_train.values, y_train.values, tokenizer, max_length=512)
test_dataset = CustomDataset(X_test.values, y_test.values, tokenizer, max_length=512)

train_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True)  # Reduced batch size
test_dataloader = DataLoader(test_dataset, batch_size=2, shuffle=False)

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Set optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Train the model
model.train()
for epoch in range(3):  # Number of epochs
    total_loss = 0
    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch+1}", unit="batch"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    print(f"Average training loss: {total_loss/len(train_dataloader)}")

# Evaluate the model
model.eval()
y_true = []
y_pred = []
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Evaluating", unit="batch"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels']
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1).cpu().numpy()
        y_pred.extend(preds)
        y_true.extend(labels.cpu().numpy())

# Calculate evaluation metrics
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
roc_auc = roc_auc_score(y_true, y_pred)

# Print evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC Score:", roc_auc)


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1:   0%|          | 0/801 [00:00<?, ?batch/s]


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
pip install transformers pandas scikit-learn



In [None]:
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch
import os


# Load the dataset from JSON file
with open('/content/primate_dataset.json', 'r') as f:
    data = json.load(f)

# Convert JSON data to DataFrame
df = pd.DataFrame(data)

# Split the data into features (X) and target (y)
X = df['post_text']
y = df['annotations'].apply(lambda x: x[0][1])  # Extracting the label from annotations

# Convert labels to numerical values
label_map = {'yes': 1, 'no': 0}
y = y.map(label_map)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Tokenize the text data and convert to input IDs and attention masks
max_length = 256  # Max length of the input sequence
X_train_encoding = tokenizer(X_train.tolist(), truncation=True, padding='max_length', max_length=max_length, return_tensors='pt')
X_test_encoding = tokenizer(X_test.tolist(), truncation=True, padding='max_length', max_length=max_length, return_tensors='pt')

# Create PyTorch datasets
train_dataset = TensorDataset(X_train_encoding['input_ids'], X_train_encoding['attention_mask'], torch.tensor(y_train.values))
test_dataset = TensorDataset(X_test_encoding['input_ids'], X_test_encoding['attention_mask'], torch.tensor(y_test.values))

# Define batch size and create data loaders
batch_size = 32
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=batch_size)

# Load pre-trained BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Set device to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

model.to(device)

# Define optimizer and learning rate
optimizer = AdamW(model.parameters(), lr=2e-5)

# Define training parameters
epochs = 3

# Train the model
model.train()
for epoch in range(epochs):
    for batch in train_dataloader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Evaluate the model
model.eval()
y_pred = []
y_true = []
with torch.no_grad():
    for batch in test_dataloader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        logits = logits.detach().cpu().numpy()
        predictions = np.argmax(logits, axis=1)

        y_pred.extend(predictions)
        y_true.extend(labels.cpu().numpy())

# Calculate evaluation metrics
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
roc_auc = roc_auc_score(y_true, y_pred)

# Print evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC Score:", roc_auc)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
