In [None]:
XLNET

In [8]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import XLNetTokenizer, XLNetForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')

# Text preprocessing function
def preprocess_text(text):
    # Lowercasing
    text = text.lower()
    # Removing punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Tokenizing
    words = word_tokenize(text)
    # Removing stopwords
    words = [word for word in words if word not in stopwords.words('english')]
    # Joining words back to text
    text = ' '.join(words)
    return text

# Load the dataset
df = pd.read_csv('threshold1.csv')

# Preprocess the labels
df['target'] = df['target'].apply(lambda x: 1 if x.lower() == 'genuine' else 0)

# Apply text preprocessing
df['text'] = df['text'].apply(preprocess_text)

# Use a smaller subset of the dataset for faster prototyping
df = df.sample(min(500, len(df)))  # Adjust this number based on your dataset size

# Split the data into train and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'], df['target'], test_size=0.2, random_state=42
)

class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        label = self.labels.iloc[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Initialize the tokenizer and model
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels=2)

# Parameters
BATCH_SIZE = 8  # Reduced batch size to fit within memory constraints
MAX_LENGTH = 128

# Create the dataset objects
train_dataset = TextDataset(train_texts, train_labels, tokenizer, MAX_LENGTH)
test_dataset = TextDataset(test_texts, test_labels, tokenizer, MAX_LENGTH)

# Create the dataloaders
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Training function
def train_epoch(model, dataloader, optimizer, device):
    model.train()
    total_loss = 0

    for batch in tqdm(dataloader, desc="Training"):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    return total_loss / len(dataloader)

# Evaluation function
def eval_model(model, dataloader, device):
    model.eval()
    total_loss = 0
    correct_predictions = 0

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            correct_predictions += torch.sum(preds == labels)

    accuracy = correct_predictions.double() / len(dataloader.dataset)
    avg_loss = total_loss / len(dataloader)
    return accuracy, avg_loss

# Train and evaluate the model
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = model.to(device)

EPOCHS = 3  # Set to 1 epoch for now

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    train_loss = train_epoch(model, train_dataloader, optimizer, device)
    val_acc, val_loss = eval_model(model, test_dataloader, device)

    print(f'Train loss: {train_loss}, Val loss: {val_loss}, Val accuracy: {val_acc}')

# # Save the model
# model.save_pretrained('xlnet_model')
# tokenizer.save_pretrained('xlnet_model')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Shiva\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Shiva\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


Training: 100%|██████████| 50/50 [08:14<00:00,  9.88s/it]
Evaluating: 100%|██████████| 13/13 [00:40<00:00,  3.11s/it]


Train loss: 0.610278702378273, Val loss: 0.5797981573985174, Val accuracy: 0.73
Epoch 2/3


Training: 100%|██████████| 50/50 [07:27<00:00,  8.94s/it]
Evaluating: 100%|██████████| 13/13 [00:41<00:00,  3.23s/it]


Train loss: 0.5691194725036621, Val loss: 0.6036588962261493, Val accuracy: 0.73
Epoch 3/3


Training: 100%|██████████| 50/50 [07:53<00:00,  9.48s/it]
Evaluating: 100%|██████████| 13/13 [00:40<00:00,  3.15s/it]

Train loss: 0.5718985909223556, Val loss: 0.5575610078298129, Val accuracy: 0.73





BERT MODEL

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import torch
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup

# Check if CUDA is available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Load the dataset
df = pd.read_csv('threshold1.csv')

# Convert 'genuine' and 'fake' labels to numerical labels
df['target'] = df['target'].map({'genuine': 1, 'fake': 0})

# Split the dataset into train and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(df['text'], df['target'], test_size=0.2, random_state=42)

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the text
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(test_texts.tolist(), truncation=True, padding=True, max_length=128)

# Convert to torch tensors
train_inputs = torch.tensor(train_encodings['input_ids'])
train_masks = torch.tensor(train_encodings['attention_mask'])
train_labels = torch.tensor(train_labels.values)

test_inputs = torch.tensor(test_encodings['input_ids'])
test_masks = torch.tensor(test_encodings['attention_mask'])
test_labels = torch.tensor(test_labels.values)

# Create DataLoader objects with optimization
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=32, num_workers=4)

test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=32, num_workers=4)

# Load the BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model = model.to(device)

# Define the optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5)
total_steps = len(train_dataloader) * 1  # 1 epoch
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training function
def train_model():
    model.train()
    total_loss = 0
    for step, batch in enumerate(train_dataloader):
        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
        model.zero_grad()
        outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()
    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Average training loss: {avg_train_loss}")

# Evaluation function
def evaluate_model():
    model.eval()
    preds = []
    true_labels = []
    with torch.no_grad():
        for batch in test_dataloader:
            b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
            outputs = model(b_input_ids, attention_mask=b_input_mask)
            logits = outputs.logits
            preds.extend(torch.argmax(logits, axis=1).tolist())
            true_labels.extend(b_labels.tolist())
    print("Accuracy:", accuracy_score(true_labels, preds))
    print("Classification Report:")
    print(classification_report(true_labels, preds))

# Train and evaluate the model
for epoch in range(3): 
    print(f"Epoch {epoch + 1}")
    train_model()
    evaluate_model()

# # Save the model
# model.save_pretrained('bert_model')
# tokenizer.save_pretrained('bert_model')


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1
Average training loss: 0.5814712208050948
Accuracy: 0.7254901960784313
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        28
           1       0.73      1.00      0.84        74

    accuracy                           0.73       102
   macro avg       0.36      0.50      0.42       102
weighted avg       0.53      0.73      0.61       102

Epoch 2


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Average training loss: 0.5714523012821491
Accuracy: 0.7254901960784313
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        28
           1       0.73      1.00      0.84        74

    accuracy                           0.73       102
   macro avg       0.36      0.50      0.42       102
weighted avg       0.53      0.73      0.61       102

Epoch 3


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Average training loss: 0.5597430857328268
Accuracy: 0.7254901960784313
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        28
           1       0.73      1.00      0.84        74

    accuracy                           0.73       102
   macro avg       0.36      0.50      0.42       102
weighted avg       0.53      0.73      0.61       102



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


HUGG

In [None]:
pip install pandas torch transformers scikit-learn nltk


Collecting torch
  Downloading torch-2.3.1-cp312-none-macosx_11_0_arm64.whl.metadata (26 kB)
Collecting transformers
  Downloading transformers-4.42.3-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scikit-learn
  Downloading scikit_learn-1.5.1-cp312-cp312-macosx_12_0_arm64.whl.metadata (12 kB)
Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl.metadata (2.8 kB)
Collecting filelock (from torch)
  Downloading filelock-3.15.4-py3-none-any.whl.metadata (2.9 kB)
Collecting typing-extensions>=4.8.0 (from torch)
  Downloading typing_extensions-4.12.2-py3-none-any.whl.metadata (3.0 kB)
Collecting sympy (from torch)
  Downloading sympy-1.12.1-py3-none-any.whl.metadata (12 kB)
Collecting networkx (from torch)
  Downloading networkx-3.3-py3-none-any.whl.metadata (5.1 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2024.6.1-py3-none-any.whl.metadata (11 kB)
Col

In [4]:
pip install --upgrade numpy


Collecting numpy
  Using cached numpy-2.0.0-cp312-cp312-macosx_14_0_arm64.whl.metadata (60 kB)
Using cached numpy-2.0.0-cp312-cp312-macosx_14_0_arm64.whl (5.0 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
transformers 4.42.3 requires numpy<2.0,>=1.17, but you have numpy 2.0.0 which is incompatible.[0m[31m
[0mSuccessfully installed numpy-2.0.0
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Text preprocessing function
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    words = word_tokenize(text)
    words = [word for word in words if word not in stopwords.words('english')]
    return ' '.join(words)

# Load the dataset
df = pd.read_csv('threshold.csv', nrows=10000)
df['target'] = df['target'].map({'genuine': 1, 'fake': 0})
df['text'] = df['text'].apply(preprocess_text)

# Split the dataset into train and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(df['text'], df['target'], test_size=0.2, random_state=42)

# Load the tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize the text
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(test_texts.tolist(), truncation=True, padding=True, max_length=128)

# Convert to torch tensors
train_inputs = torch.tensor(train_encodings['input_ids'])
train_masks = torch.tensor(train_encodings['attention_mask'])
train_labels = torch.tensor(train_labels.values)

test_inputs = torch.tensor(test_encodings['input_ids'])
test_masks = torch.tensor(test_encodings['attention_mask'])
test_labels = torch.tensor(test_labels.values)

# Create DataLoader objects
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=32, num_workers=4)

test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=32, num_workers=4)

# Load the model
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = model.to(device)

# Define the optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Training function
def train_model():
    model.train()
    total_loss = 0
    for step, batch in enumerate(train_dataloader):
        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
        model.zero_grad()
        outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Average training loss: {avg_train_loss}")

# Evaluation function
def evaluate_model():
    model.eval()
    preds = []
    true_labels = []
    with torch.no_grad():
        for batch in test_dataloader:
            b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
            outputs = model(b_input_ids, attention_mask=b_input_mask)
            logits = outputs.logits
            preds.extend(torch.argmax(logits, axis=1).tolist())
            true_labels.extend(b_labels.tolist())
    print("Accuracy:", accuracy_score(true_labels, preds))
    print("Classification Report:")
    print(classification_report(true_labels, preds))

# Train and evaluate the model
for epoch in range(3):
    print(f"Epoch {epoch + 1}")
    train_model()
    evaluate_model()

# # Save the model
# model.save_pretrained('distilbert_model')
# tokenizer.save_pretrained('distilbert_model')


[nltk_data] Downloading package punkt to /Users/ravi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/ravi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1




Average training loss: 0.46977042019367216
Accuracy: 0.7995
Classification Report:
              precision    recall  f1-score   support

           0       0.55      0.44      0.49       438
           1       0.85      0.90      0.88      1562

    accuracy                           0.80      2000
   macro avg       0.70      0.67      0.68      2000
weighted avg       0.79      0.80      0.79      2000

Epoch 2
Average training loss: 0.37010245403647424
Accuracy: 0.8145
Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.40      0.49       438
           1       0.85      0.93      0.89      1562

    accuracy                           0.81      2000
   macro avg       0.73      0.67      0.69      2000
weighted avg       0.80      0.81      0.80      2000

Epoch 3
Average training loss: 0.2901136727929115
Accuracy: 0.8185
Classification Report:
              precision    recall  f1-score   support

           0       0.70    