In [1]:
import pandas as pd
import time
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from huggingface_hub import login
from tqdm import tqdm
from datasets import Dataset

In [2]:
# Authenticate with Hugging Face Hub
login(token="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")  # Replace with your Hugging Face token


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [3]:
# Load the dataset
file_path = "/kaggle/input/power-tr-train-2/power-tr-train.tsv"  
data = pd.read_csv(file_path, sep="\t")

In [4]:
from sklearn.model_selection import train_test_split

# Stratified sampling to select 20% of the data based on the label distribution
data_subset, _ = train_test_split(data, test_size=0.8, stratify=data["label"], random_state=42)

# Now data_subset is a 20% stratified sample of the original data
true_labels = data_subset["label"].tolist()



In [5]:
data_subset["modified_text"] = data_subset["text"].apply(lambda x: f"Konuşmacının partisinin iktidarda(etiket 0) olup olmadığını ya da muhalefette(etiket 1) mi olduğunu sınıflandırın: {x}")
modified_text_list = data_subset["modified_text"].tolist() 


In [6]:
# Load the Llama model and tokenizer for sequence classification
model_name = "meta-llama/Llama-3.2-1B"  # Replace with the actual model path or name if locally hosted
tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=True)
model = AutoModelForSequenceClassification.from_pretrained(model_name, use_auth_token=True, num_labels=2)

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Initialize the pipeline for binary classification
classifier = pipeline(
    "text-classification", model=model, tokenizer=tokenizer, device=0  # Set device to GPU
)

In [8]:
# Perform batch inference with progress bar and estimated time
predictions = []
start_time = time.time()
last_print_time = start_time
for i in tqdm(range(0, len(modified_text_list), 8), desc="Running inference"):
    batch = modified_text_list[i:i + 8]
    batch_predictions = classifier(batch)
    predictions.extend(batch_predictions)
    elapsed_time = time.time() - start_time
    processed_samples = i + len(batch)
    estimated_total_time = (elapsed_time / processed_samples) * len(modified_text_list)
    remaining_time = estimated_total_time - elapsed_time

    # Print estimated time every 1 minute
    current_time = time.time()
    if current_time - last_print_time >= 60:
        print(f"\rEstimated time remaining: {remaining_time / 60:.2f} minutes")
        last_print_time = current_time

print() 

Running inference:   2%|▏         | 10/435 [00:25<14:57,  2.11s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Running inference:   6%|▌         | 26/435 [01:01<14:30,  2.13s/it]

Estimated time remaining: 16.08 minutes


Running inference:  12%|█▏        | 51/435 [02:04<18:40,  2.92s/it]

Estimated time remaining: 15.58 minutes


Running inference:  18%|█▊        | 77/435 [03:06<15:28,  2.59s/it]

Estimated time remaining: 14.45 minutes


Running inference:  24%|██▍       | 104/435 [04:07<13:33,  2.46s/it]

Estimated time remaining: 13.11 minutes


Running inference:  30%|███       | 131/435 [05:08<14:27,  2.85s/it]

Estimated time remaining: 11.90 minutes


Running inference:  35%|███▌      | 154/435 [06:08<12:22,  2.64s/it]

Estimated time remaining: 11.18 minutes


Running inference:  41%|████      | 179/435 [07:09<12:10,  2.85s/it]

Estimated time remaining: 10.21 minutes


Running inference:  47%|████▋     | 204/435 [08:10<09:28,  2.46s/it]

Estimated time remaining: 9.24 minutes


Running inference:  53%|█████▎    | 230/435 [09:11<07:32,  2.21s/it]

Estimated time remaining: 8.17 minutes


Running inference:  58%|█████▊    | 252/435 [10:12<09:10,  3.01s/it]

Estimated time remaining: 7.40 minutes


Running inference:  64%|██████▍   | 278/435 [11:14<06:05,  2.33s/it]

Estimated time remaining: 6.32 minutes


Running inference:  70%|██████▉   | 303/435 [12:14<04:44,  2.16s/it]

Estimated time remaining: 5.31 minutes


Running inference:  75%|███████▌  | 327/435 [13:14<05:14,  2.91s/it]

Estimated time remaining: 4.35 minutes


Running inference:  81%|████████▏ | 354/435 [14:15<03:42,  2.74s/it]

Estimated time remaining: 3.24 minutes


Running inference:  87%|████████▋ | 378/435 [15:17<02:24,  2.54s/it]

Estimated time remaining: 2.28 minutes


Running inference:  93%|█████████▎| 403/435 [16:17<01:23,  2.61s/it]

Estimated time remaining: 1.27 minutes


Running inference:  99%|█████████▉| 430/435 [17:18<00:09,  1.98s/it]

Estimated time remaining: 0.18 minutes


Running inference: 100%|██████████| 435/435 [17:30<00:00,  2.42s/it]







In [9]:
# Extract predictions
predicted_labels = [int(pred["label"].split("_")[-1]) for pred in predictions]  # Extract numeric label

In [10]:
# Calculate metrics
accuracy = accuracy_score(true_labels, predicted_labels)
precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predicted_labels, average="binary")

# Print metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")


Accuracy: 0.5207
Precision: 0.5181
Recall: 0.9619
F1 Score: 0.6735


In [None]:
# Save the metrics and predictions to a file
results = pd.DataFrame({"text": texts, "true_label": true_labels, "predicted_label": predicted_labels})
results.to_csv("inference_results_with_metrics.csv", index=False)

print("Inference completed. Results saved to 'inference_results_with_metrics.csv'.")