In [1]:
# Install necessary libraries
!pip install pandas scikit-learn transformers torch datasets

import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score , precision_score, recall_score

# Step 1: Load and Preprocess Dataset
def load_data(file_path):
    """Load dataset from a CSV file and check for required columns."""
    data = pd.read_excel(file_path)
#     data = data.sample(frac=0.04, random_state=42)  # Random 40%

    if 'tweets_english' not in data.columns:
        raise KeyError("The dataset must contain a 'text' column with tweet data.")
    if 'labels' not in data.columns:
        print("Warning: No 'label' column found. Assuming this is the unlabeled data.")
        data['labels'] = None  # Add a label column with NaN values if missing
    return data


def preprocess_text(df):
    """Preprocess text data."""
    df['tweets_english'] = df['tweets_english'].str.lower()
    df['tweets_english'] = df['tweets_english'].str.replace(r"http\S+|www\S+|https\S+", '', regex=True)  # remove URLs
    df['tweets_english'] = df['tweets_english'].str.replace(r'\@\w+|\#', '', regex=True)  # remove mentions and hashtags
    df['tweets_english'] = df['tweets_english'].str.replace(r'[^A-Za-z\s]', '', regex=True)  # remove non-alphanumeric chars
    return df

# Step 2: Define Labeled and Unlabeled Dataset
from sklearn.model_selection import train_test_split

def split_data(data, label_column='labels', labeled_size=0.2, unlabeled_size=0.6, test_size=0.2):
    """Split dataset into labeled, unlabeled, and test data."""
    # Ensure the sum of labeled_size, unlabeled_size, and test_size equals 1
    assert labeled_size + unlabeled_size + test_size == 1, "The sizes must sum to 1."
    
    # Treat all data as unlabeled, including the rows with labels
    
    
    # Split the data into train (80%) and test (20%) sets
    train_data, test_data = train_test_split(data, test_size=test_size, random_state=42)
    
    # Now, from the train_data (80%), we select 20% as labeled data
    labeled_data, unlabeled_data = train_test_split(train_data, test_size=0.75, random_state=42)
    
    return labeled_data,  test_data , unlabeled_data




# Step 3: Initialize Model
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

def tokenize(batch):
    # Convert each item in the 'tweets' list to string using list comprehension
    batch['tweets_english'] = [str(item) for item in batch['tweets_english']]  
    return tokenizer(batch['tweets_english'], padding='max_length', truncation=True, max_length=128)
# Step 4: Training Arguments
def setup_training_args():
    training_args = TrainingArguments(
        output_dir="./results",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        weight_decay=0.01,
        evaluation_strategy="epoch"
    )
    return training_args

# Step 5: Initialize Trainer and Model Training
# ... (your existing code) ...

# Step 5: Initialize Trainer and Model Trainin
from datasets import Dataset  
def train_model(train_data, eval_data, model, training_args):
    # Convert DataFrames to Hugging Face Datasets
    train_dataset = Dataset.from_pandas(train_data.reset_index(drop=True))
    eval_dataset = Dataset.from_pandas(eval_data.reset_index(drop=True))
    
    # Tokenize datasets
    train_dataset = train_dataset.map(tokenize, batched=True)
    eval_dataset = eval_dataset.map(tokenize, batched=True)
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,  # Use Hugging Face Datasets
        eval_dataset=eval_dataset,    # Use Hugging Face Datasets
        compute_metrics=compute_metrics
    )
    trainer.train()
    return trainer

# ... (rest of your code) ...
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Step 6: Iterative Data Harvesting - Semi-Supervised Learning
# Step 6: Iterative Data Harvesting - Semi-Supervised Learning
def predict_with_confidence(trainer, data, threshold=0.9, batch_size=16):  # Added batch_size parameter
    """Predicts labels for unlabeled data with high confidence."""
    # Check if 'data' has any rows
    if data.empty:
        return pd.DataFrame(columns=data.columns)  # Return empty DataFrame

    # Convert each item in the 'tweets' list to string using list comprehension
    # and filter out any empty strings
    valid_tweets = [str(item) for item in data['tweets_english'].tolist() if str(item).strip()]
    
    # Check if there are any valid tweets
    if not valid_tweets:
        return pd.DataFrame(columns=data.columns)  # Return empty DataFrame
    
    # Process data in batches to reduce memory usage
    all_high_confidence_indices = []
    for i in range(0, len(valid_tweets), batch_size):
        batch_tweets = valid_tweets[i : i + batch_size]
        inputs = tokenizer(batch_tweets, return_tensors="pt", truncation=True, padding=True, max_length=128)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        outputs = model(**inputs)
        probs = torch.softmax(outputs.logits, dim=1)
        high_confidence_indices = (probs.max(dim=1).values > threshold).nonzero(as_tuple=True)[0]
        all_high_confidence_indices.extend(high_confidence_indices.cpu().numpy() + i)  # Adjust indices

    # If no high confidence predictions, return empty DataFrame
    if not all_high_confidence_indices:
        return pd.DataFrame(columns=data.columns)

    return data.iloc[all_high_confidence_indices]

def iterative_harvesting(trainer, labeled_data, unlabeled_data, iterations=1, threshold=0.9):
    for iteration in range(iterations):
        high_conf_data = predict_with_confidence(trainer, unlabeled_data, threshold)
        print(len(labeled_data))
        labeled_data = pd.concat([labeled_data, high_conf_data])
        print(len(labeled_data))
        unlabeled_data = unlabeled_data.drop(high_conf_data.index)
        trainer.train()  # Retrain on expanded labeled data
    return labeled_data, trainer

# Step 7: Zero-Shot Learning Integration (optional)
from transformers import pipeline
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
candidate_labels = ["depression", "no depression"]



# Step 8: Evaluation Metrics
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    preds = predictions.argmax(axis=1)  # Get the predicted class labels
    acc = accuracy_score(labels, preds)
    precision = precision_score(labels, preds, average='binary')
    recall = recall_score(labels, preds, average='binary')
    f1 = f1_score(labels, preds, average='binary')
    
    return {'accuracy': acc, 'precision': precision, 'recall': recall, 'f1': f1}



# Main Pipeline Function
def evaluate_model(trainer, eval_data):
    # Re-tokenize eval_data to ensure consistency
#     print("a")
    eval_dataset = Dataset.from_pandas(eval_data.reset_index(drop=True))
#     print("b")
    eval_dataset = eval_dataset.map(tokenize, batched=True)
#     print("c")
    # Run evaluation
    final_results = trainer.evaluate(eval_dataset=eval_dataset)
#     print("s")
    return final_results

def run_pipeline(file_path):
    # Load and preprocess data
    data = load_data(file_path)
    data = preprocess_text(data)

    # Define labeled and unlabeled datasets
    train_data, eval_data, unlabeled_data = split_data(data)
    
    print(train_data)
    print(eval_data)
    print(unlabeled_data)
    # Initialize training arguments and trainer
    training_args = setup_training_args()
    print("training started.")
    trainer = train_model(train_data, eval_data, model, training_args)
    print("training ended.")
    # Iterative Data Harvesting
    print("Iterative Data Harvesting started.")
    final_labeled_data, final_trainer = iterative_harvesting(trainer, train_data, unlabeled_data)
    print("Iterative Data Harvesting ended.")

    # Final Evaluation
    print("Final Evaluation started.")
    
    final_results = evaluate_model(final_trainer, eval_data)
    print("Final model results:")
    print(f"Accuracy: {final_results['eval_accuracy']:.4f}")
    print(f"Precision: {final_results['eval_precision']:.4f}")
    print(f"Recall: {final_results['eval_recall']:.4f}")
    print(f"F1 Score: {final_results['eval_f1']:.4f}")
    return final_labeled_data, final_results

# Run the pipeline with your dataset file path
file_path = 'Bangla2_translated.xlsx'
run_pipeline(file_path)

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
2024-11-12 17:08:33.096794: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-12 17:08:33.314189: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-12 17:08:33.375428: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-12 17:08:34.44612

                                                 tweets  labels  \
2740  আমার রাত জাগা টা সার্থক অভিনন্দন বাংলাদেশ ক্রি...       0   
3615  দ্বীনদার স্বামী একজন নেককার স্ত্রীর জন্য পৃথিব...       0   
2064  আ লীগ আবার ক্ষমতায় এলে ৫ জি সেবা চালু হবে : জ...       0   
1693  আর্কিটেক্ট অব ডিজিটাল বাংলাদেশ তরুণ প্রজন্মের ...       0   
3240  আত্ম সম্মানবোধহারা , চরিত্রহারা মানুষের সংগ্রা...       0   
...                                                 ...     ...   
3632  ভদ্রলোক নিকোটিন ছাড়ার আশায় নিকোটিন গাম খাওয়...       1   
1847  রাসূলুল্লাহ সা ইরশাদ করেন , প্রত্যেক রাতের যখন...       0   
300   সিলেট সিটি কর্পোরেশন নির্বাচনে মেয়র পদে বিএনপ...       0   
3448  তালাক এখন অনেক মুসলিম নারীর কাছে এককালীন টাকা ...       0   
1475  দ্বিতীয় মৃত্যুবার্ষিকীতে হাজার চুরাশির মা স্ব...       0   

                                         tweets_english  
2740  waking up my night is worthwhile congratulatio...  
3615  devout husband is the best in the world for a ...  
2064  g service will 



Map:   0%|          | 0/782 [00:00<?, ? examples/s]

Map:   0%|          | 0/783 [00:00<?, ? examples/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mw

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.434244,0.787995,0.628866,0.319372,0.423611
2,No log,0.453124,0.813538,0.634731,0.554974,0.592179
3,No log,0.501,0.808429,0.617143,0.565445,0.590164


training ended.
Iterative Data Harvesting started.
782
2396


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.706262,0.793103,0.579235,0.554974,0.566845
2,No log,0.937431,0.809706,0.626506,0.544503,0.582633
3,No log,0.960984,0.803321,0.605714,0.554974,0.579235


2396
3010


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.964388,0.807152,0.613636,0.565445,0.588556
2,No log,1.149329,0.799489,0.611842,0.486911,0.542274
3,No log,1.164517,0.793103,0.586826,0.513089,0.547486


3010
3079


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,1.261432,0.798212,0.654206,0.366492,0.469799
2,No log,1.373985,0.795658,0.63964,0.371728,0.470199
3,No log,1.30207,0.798212,0.606452,0.492147,0.543353


Iterative Data Harvesting ended.
Final Evaluation started.


Map:   0%|          | 0/783 [00:00<?, ? examples/s]

Final model results:
Accuracy: 0.7982
Precision: 0.6065
Recall: 0.4921
F1 Score: 0.5434


(                                                 tweets  labels  \
 2740  আমার রাত জাগা টা সার্থক অভিনন্দন বাংলাদেশ ক্রি...       0   
 3615  দ্বীনদার স্বামী একজন নেককার স্ত্রীর জন্য পৃথিব...       0   
 2064  আ লীগ আবার ক্ষমতায় এলে ৫ জি সেবা চালু হবে : জ...       0   
 1693  আর্কিটেক্ট অব ডিজিটাল বাংলাদেশ তরুণ প্রজন্মের ...       0   
 3240  আত্ম সম্মানবোধহারা , চরিত্রহারা মানুষের সংগ্রা...       0   
 ...                                                 ...     ...   
 2027  স্বপ্নগুলো ধূলিসাৎ হয়ে যায় আর কষ্টগুলি মিথ্য...       0   
 2982  বাস্তব তুমি যাদের জন্য জীবন দিতে প্রস্তুত ? তা...       0   
 2941  বেশ ্যা যদি নষ্ট হয় পুরুষ কেন তার কাছে যায় প...       1   
 545   আমার না খুব হাসি পায় টুইটে আসলে , যখন দেখি কি...       1   
 3277  ১৯৯৯ সালে ১৮৫ ধারায় প্রস্তাবে বুদ্ধদেব ভট্টাচ...       0   
 
                                          tweets_english  
 2740  waking up my night is worthwhile congratulatio...  
 3615  devout husband is the best in the world for a ...  
 2064