In [5]:
# Install necessary libraries
!pip install pandas scikit-learn transformers torch datasets

import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score , precision_score, recall_score

# Step 1: Load and Preprocess Dataset
def load_data(file_path):
    """Load dataset from a CSV file and check for required columns."""
    data = pd.read_excel(file_path)
#     data = data.sample(frac=0.04, random_state=42)  # Random 40%

    if 'Tweets_english' not in data.columns:
        raise KeyError("The dataset must contain a 'text' column with tweet data.")
    if 'Labels' not in data.columns:
        print("Warning: No 'label' column found. Assuming this is the unlabeled data.")
        data['Labels'] = None  # Add a label column with NaN values if missing
    return data


def preprocess_text(df):
    """Preprocess text data."""
    df['Tweets_english'] = df['Tweets_english'].str.lower()
    df['Tweets_english'] = df['Tweets_english'].str.replace(r"http\S+|www\S+|https\S+", '', regex=True)  # remove URLs
    df['Tweets_english'] = df['Tweets_english'].str.replace(r'\@\w+|\#', '', regex=True)  # remove mentions and hashtags
    df['Tweets_english'] = df['Tweets_english'].str.replace(r'[^A-Za-z\s]', '', regex=True)  # remove non-alphanumeric chars
    return df

# Step 2: Define Labeled and Unlabeled Dataset
from sklearn.model_selection import train_test_split

def split_data(data, label_column='Labels', labeled_size=0.2, unlabeled_size=0.6, test_size=0.2):
    """Split dataset into labeled, unlabeled, and test data."""
    # Ensure the sum of labeled_size, unlabeled_size, and test_size equals 1
    assert labeled_size + unlabeled_size + test_size == 1, "The sizes must sum to 1."
    
    # Treat all data as unlabeled, including the rows with labels
    
    
    # Split the data into train (80%) and test (20%) sets
    train_data, test_data = train_test_split(data, test_size=test_size, random_state=42)
    
    # Now, from the train_data (80%), we select 20% as labeled data
    labeled_data, unlabeled_data = train_test_split(train_data, test_size=0.75, random_state=42)
    
    return labeled_data,  test_data , unlabeled_data




# Step 3: Initialize Model
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

def tokenize(batch):
    # Convert each item in the 'tweets' list to string using list comprehension
    batch['Tweets_english'] = [str(item) for item in batch['Tweets_english']]  
    return tokenizer(batch['Tweets_english'], padding='max_length', truncation=True, max_length=128)
# Step 4: Training Arguments
def setup_training_args():
    training_args = TrainingArguments(
        output_dir="./results",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        weight_decay=0.01,
        evaluation_strategy="epoch"
    )
    return training_args

# Step 5: Initialize Trainer and Model Training
# ... (your existing code) ...

# Step 5: Initialize Trainer and Model Trainin
from datasets import Dataset  
def train_model(train_data, eval_data, model, training_args):
    train_data = train_data.rename(columns={'Labels': 'label'})
    eval_data = eval_data.rename(columns={'Labels': 'label'})
    train_data['label'] = train_data['label'].astype(int)
    eval_data['label'] = eval_data['label'].astype(int)
    # Convert DataFrames to Hugging Face Datasets
    train_dataset = Dataset.from_pandas(train_data.reset_index(drop=True))
    eval_dataset = Dataset.from_pandas(eval_data.reset_index(drop=True))
    
    # Tokenize datasets
    train_dataset = train_dataset.map(tokenize, batched=True)
    eval_dataset = eval_dataset.map(tokenize, batched=True)
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,  # Use Hugging Face Datasets
        eval_dataset=eval_dataset,    # Use Hugging Face Datasets
        compute_metrics=compute_metrics
    )
    trainer.train()
    return trainer

# ... (rest of your code) ...
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Step 6: Iterative Data Harvesting - Semi-Supervised Learning
# Step 6: Iterative Data Harvesting - Semi-Supervised Learning
def predict_with_confidence(trainer, data, threshold=0.9, batch_size=16):  # Added batch_size parameter
    """Predicts labels for unlabeled data with high confidence."""
    # Check if 'data' has any rows
    if data.empty:
        return pd.DataFrame(columns=data.columns)  # Return empty DataFrame

    # Convert each item in the 'tweets' list to string using list comprehension
    # and filter out any empty strings
    valid_tweets = [str(item) for item in data['Tweets_english'].tolist() if str(item).strip()]
    
    # Check if there are any valid tweets
    if not valid_tweets:
        return pd.DataFrame(columns=data.columns)  # Return empty DataFrame
    
    # Process data in batches to reduce memory usage
    all_high_confidence_indices = []
    for i in range(0, len(valid_tweets), batch_size):
        batch_tweets = valid_tweets[i : i + batch_size]
        inputs = tokenizer(batch_tweets, return_tensors="pt", truncation=True, padding=True, max_length=128)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        outputs = model(**inputs)
        probs = torch.softmax(outputs.logits, dim=1)
        high_confidence_indices = (probs.max(dim=1).values > threshold).nonzero(as_tuple=True)[0]
        all_high_confidence_indices.extend(high_confidence_indices.cpu().numpy() + i)  # Adjust indices

    # If no high confidence predictions, return empty DataFrame
    if not all_high_confidence_indices:
        return pd.DataFrame(columns=data.columns)

    return data.iloc[all_high_confidence_indices]

def iterative_harvesting(trainer, labeled_data, unlabeled_data, iterations=1, threshold=0.9):
    for iteration in range(iterations):
        high_conf_data = predict_with_confidence(trainer, unlabeled_data, threshold)
        print(len(labeled_data))
        labeled_data = pd.concat([labeled_data, high_conf_data])
        print(len(labeled_data))
        unlabeled_data = unlabeled_data.drop(high_conf_data.index)
        trainer.train()  # Retrain on expanded labeled data
    return labeled_data, trainer

# Step 7: Zero-Shot Learning Integration (optional)
from transformers import pipeline
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")


# Step 8: Evaluation Metrics
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    preds = predictions.argmax(axis=1)  # Get the predicted class labels
    acc = accuracy_score(labels, preds)
    precision = precision_score(labels, preds, average='binary')
    recall = recall_score(labels, preds, average='binary')
    f1 = f1_score(labels, preds, average='binary')
    
    return {'accuracy': acc, 'precision': precision, 'recall': recall, 'f1': f1}



# Main Pipeline Function
def evaluate_model(trainer, eval_data):
    # Re-tokenize eval_data to ensure consistency
#     print("a")
    eval_dataset = Dataset.from_pandas(eval_data.reset_index(drop=True))
#     print("b")
    eval_dataset = eval_dataset.map(tokenize, batched=True)
#     print("c")
    # Run evaluation
    final_results = trainer.evaluate(eval_dataset=eval_dataset)
#     print("s")
    return final_results

def run_pipeline(file_path):
    # Load and preprocess data
    data = load_data(file_path)
    data = preprocess_text(data)

    # Define labeled and unlabeled datasets
    train_data, eval_data, unlabeled_data = split_data(data)
    
    print(train_data)
    print(eval_data)
    print(unlabeled_data)
    # Initialize training arguments and trainer
    training_args = setup_training_args()
    print("training started.")
    trainer = train_model(train_data, eval_data, model, training_args)
    print("training ended.")
    # Iterative Data Harvesting
    print("Iterative Data Harvesting started.")
    final_labeled_data, final_trainer = iterative_harvesting(trainer, train_data, unlabeled_data)
    print("Iterative Data Harvesting ended.")

    # Final Evaluation
    print("Final Evaluation started.")
    
    final_results = evaluate_model(final_trainer, eval_data)
    print("Final model results:")
    print(f"Accuracy: {final_results['eval_accuracy']:.4f}")
    print(f"Precision: {final_results['eval_precision']:.4f}")
    print(f"Recall: {final_results['eval_recall']:.4f}")
    print(f"F1 Score: {final_results['eval_f1']:.4f}")
    return final_labeled_data, final_results

# Run the pipeline with your dataset file path
file_path = 'spanish_translated.xlsx'
run_pipeline(file_path)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


      Unnamed: 0                                             Tweets  Labels  \
733          NaN                           El dolor me ha cambiado.       1   
1031         NaN  razones por las cuales los anti vaxxers son bu...       0   
499          NaN               me gustarÃ­a desaparecer por un rato       1   
793          NaN  Los antidepresivos son como la religiÃ³n, solo...       1   
766          NaN  Todo me agobia y sofoca, mejor exiliarme de la...       1   
...          ...                                                ...     ...   
850          NaN   Al parecer no valgo la pena ser amigo de alguien       1   
119          NaN  es como que a veces yo sola me dejo caer\n\nes...       1   
1609         NaN  Nombres de los 10 principales 1. Bob 2. Wayne ...       0   
2092         NaN  Día 6 de mi pérdida de peso El día 6 está aquí...       0   
884          NaN  Esos momentos en los que tengo ganas de enterr...       1   

                                         Tweets_eng



Map:   0%|          | 0/437 [00:00<?, ? examples/s]

Map:   0%|          | 0/438 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.1726,0.940639,0.941463,0.932367,0.936893
2,No log,0.161118,0.952055,0.918919,0.985507,0.951049
3,No log,0.155722,0.952055,0.926606,0.975845,0.950588


training ended.
Iterative Data Harvesting started.
437
1701


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.181986,0.952055,0.951456,0.94686,0.949153
2,No log,0.202351,0.958904,0.931507,0.985507,0.957746
3,No log,0.193717,0.96347,0.948357,0.975845,0.961905


1701
1740


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.26928,0.947489,0.918182,0.975845,0.946136


KeyboardInterrupt: 