In [4]:
import pandas as pd

# Load the dataset
df = pd.read_csv('movie.csv')

# Display the first few rows of the dataset
print(df.head())

# Basic statistics of the dataset
print(df.describe())

# Check for missing values
print(df.isnull().sum())

# Distribution of the label column
print(df['label'].value_counts())


                                                text  label
0  I grew up (b. 1965) watching and loving the Th...      0
1  When I put this movie in my DVD player, and sa...      0
2  Why do people who do not know what a particula...      0
3  Even though I have great interest in Biblical ...      0
4  Im a die hard Dads Army fan and nothing will e...      1
              label
count  40000.000000
mean       0.499525
std        0.500006
min        0.000000
25%        0.000000
50%        0.000000
75%        1.000000
max        1.000000
text     0
label    0
dtype: int64
label
0    20019
1    19981
Name: count, dtype: int64


In [5]:
# Data cleaning
df.dropna(subset=["text"], inplace=True)
df.drop_duplicates(subset=["text"], inplace=True)
text = df["text"]
labels = df["label"]

In [6]:
print(df['label'].value_counts())

label
1    19908
0    19815
Name: count, dtype: int64


In [7]:
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

# Initialize the tokenizer and model for sentiment analysis
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)


  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# Truncate the text to the first 512 tokens
def truncate_text(text, tokenizer, max_length=512):
    tokenized = tokenizer(text, truncation=True, max_length=max_length)
    return tokenizer.decode(tokenized['input_ids'])

# Truncate a sample text
sample_text = df['text'][0]
truncated_text = truncate_text(sample_text, tokenizer)
print(f"Original text length: {len(sample_text.split())}")
print(f"Truncated text length: {len(truncated_text.split())}")

# Get predictions on truncated texts
truncated_predictions = sentiment_pipeline(truncated_text)
print(truncated_predictions)


Original text length: 151
Truncated text length: 157
[{'label': 'NEGATIVE', 'score': 0.9996956586837769}]


In [22]:
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Convert the dataframe to HuggingFace dataset
dataset = Dataset.from_pandas(df)

# Train-test split
train_test_split = dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']
train_dataset = train_dataset.sample(n=1000, random_state=42)
test_dataset = test_dataset.sample(n=200, random_state=42)

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased-finetuned-sst-2-english")

# Tokenization function
def tokenize_function(example, tokenizer, max_length=512):
    # Tokenize the text
    encoded_dict = tokenizer(example['text'], truncation=True, padding='max_length', max_length=max_length, return_tensors='pt')
    return encoded_dict

# Tokenize the datasets
tokenized_train_dataset = train_dataset.map(lambda x: tokenize_function(x, tokenizer), batched=True)
tokenized_test_dataset = test_dataset.map(lambda x: tokenize_function(x, tokenizer), batched=True)

# Load pre-trained model
model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased-finetuned-sst-2-english", num_labels=2)


AttributeError: 'Dataset' object has no attribute 'sample'

In [12]:
# Tokenization function
def tokenize_function(batch):
    return tokenizer(batch['text'], truncation=True, padding='max_length', max_length=512)

# Tokenize the datasets with batch processing
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True, batch_size=16)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True, batch_size=16)


Map: 100%|██████████| 31778/31778 [00:14<00:00, 2263.00 examples/s]
Map: 100%|██████████| 7945/7945 [00:03<00:00, 2284.86 examples/s]


In [13]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }


In [23]:
from transformers import Trainer, TrainingArguments
import torch

# Define training arguments with increased batch size and gradient accumulation
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,  # Increase batch size
    per_device_eval_batch_size=32,  # Increase batch size
    gradient_accumulation_steps=4,  # Accumulate gradients over 4 steps
    num_train_epochs=1,
    weight_decay=0.01,
    
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.21.0`: Please run `pip install transformers[torch]` or `pip install accelerate -U`

In [20]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
)

trainer.train()


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.21.0`: Please run `pip install transformers[torch]` or `pip install accelerate -U`

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

# Define a compute_metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')
    acc = accuracy_score(labels, predictions)
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}

# Evaluate the model
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")


Video which helped me:
https://youtu.be/QEaBAZQCtwE?si=4A7h0TPhLBHjtHn-

The model I have used:
https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english/tree/main