In [None]:
import pandas as pd
from sklearn.utils import shuffle


# Raw labelled sources. test.csv is pooled with the other two below and
# re-split later, so it gets its own name instead of `test_df`, which the
# split step reuses for the held-out partition.
twitter_df = pd.read_csv("/content/drive/MyDrive/Dataset/Twitter_Data_Updated.csv")
reddit_df = pd.read_csv("/content/drive/MyDrive/Dataset/Redit_Data_Updated.csv")
test_raw_df = pd.read_csv("/content/drive/MyDrive/Dataset/test.csv")

# Standardise the sentiment column name on "category". rename() skips keys
# that are absent, so frames without a "label" column pass through unchanged.
twitter_df, reddit_df, test_raw_df = (
    frame.rename(columns={"label": "category"})
    for frame in (twitter_df, reddit_df, test_raw_df)
)

label2id = {"Negative": 0, "Neutral": 1, "Positive": 2}
# Spellings of the neutral class that are folded into "Neutral".
notr_aliases = {"Notr": "Neutral", "notr": "Neutral", "NOTR": "Neutral"}

# Pool every source and keep only rows that have both a text and a category.
pooled = pd.concat([twitter_df, reddit_df, test_raw_df], ignore_index=True)
pooled = pooled.dropna(subset=["text", "category"])

category = pooled["category"].replace(notr_aliases)
df = pooled.assign(category=category, label=category.map(label2id))

# Categories missing from label2id map to NaN, which would turn the whole
# "label" column into floats (and write "0.0"-style ids to CSV). Report and
# drop those rows, then store the ids as real integers.
unmapped = df["label"].isna()
if unmapped.any():
    unknown = df.loc[unmapped, "category"].astype(str).unique()
    print(
        f"Dropping {int(unmapped.sum())} rows with unrecognised categories, "
        f"e.g. {sorted(unknown)[:10]}"
    )
df = df.loc[~unmapped].astype({"label": "int64"})

# Size of the rarest class; every class is down-sampled to this many rows.
min_count = df["label"].value_counts().min()
print("SÄ±nÄ±f baÅŸÄ±na kullanÄ±labilecek maksimum Ã¶rnek sayÄ±sÄ±:", min_count)

# Draw the same number of rows from each class id (0, 1, 2), in that order.
per_class_samples = [
    df[df["label"] == class_id].sample(min_count, random_state=42)
    for class_id in (0, 1, 2)
]

# Mix the classes together and renumber the rows from 0.
df_balanced = shuffle(pd.concat(per_class_samples), random_state=42).reset_index(drop=True)

# 60 % train, 20 % validation, remainder test (sizes are truncated to ints).
train_size = int(0.6 * len(df_balanced))
val_size = int(0.2 * len(df_balanced))
val_end = train_size + val_size

train_df = df_balanced.iloc[:train_size]
val_df = df_balanced.iloc[train_size:val_end]
test_df = df_balanced.iloc[val_end:]

# Write each split next to the source data on Drive.
split_targets = (
    ("/content/drive/MyDrive/Dataset/train_balanced_max.csv", train_df),
    ("/content/drive/MyDrive/Dataset/val_balanced_max.csv", val_df),
    ("/content/drive/MyDrive/Dataset/test_balanced_max.csv", test_df),
)
for split_path, split_frame in split_targets:
    split_frame.to_csv(split_path, index=False)

# Per-split class counts.
print("âœ… Her sette etiket daÄŸÄ±lÄ±mÄ± (maksimum veri ile dengeli)")
for heading, split_frame in (
    ("\nTrain:\n", train_df),
    ("\nValidation:\n", val_df),
    ("\nTest:\n", test_df),
):
    print(heading, split_frame["category"].value_counts())


SÄ±nÄ±f baÅŸÄ±na kullanÄ±labilecek maksimum Ã¶rnek sayÄ±sÄ±: 5656
âœ… Her sette etiket daÄŸÄ±lÄ±mÄ± (maksimum veri ile dengeli)

Train:
 category
Negative    3424
Neutral     3400
Positive    3356
Name: count, dtype: int64

Validation:
 category
Positive    1157
Negative    1137
Neutral     1099
Name: count, dtype: int64

Test:
 category
Neutral     1157
Positive    1143
Negative    1095
Name: count, dtype: int64


In [None]:

# Write the three splits to Drive (the same files the previous cell writes)
# and list the destinations.
saved_splits = (
    ("/content/drive/MyDrive/Dataset/train_balanced_max.csv", train_df),
    ("/content/drive/MyDrive/Dataset/val_balanced_max.csv", val_df),
    ("/content/drive/MyDrive/Dataset/test_balanced_max.csv", test_df),
)
for csv_path, split_df in saved_splits:
    split_df.to_csv(csv_path, index=False)

print("âœ… Veriler Drive'a kaydedildi:")
for csv_path, _ in saved_splits:
    print(csv_path)


âœ… Veriler Drive'a kaydedildi:
/content/drive/MyDrive/Dataset/train_balanced_max.csv
/content/drive/MyDrive/Dataset/val_balanced_max.csv
/content/drive/MyDrive/Dataset/test_balanced_max.csv


In [None]:
!pip install transformers datasets torch --quiet

import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.optim import AdamW

from sklearn.metrics import classification_report, confusion_matrix

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Device:", device)

# Reload the three balanced splits from Drive.
train_df, val_df, test_df = map(
    pd.read_csv,
    (
        "/content/drive/MyDrive/Dataset/train_balanced_max.csv",
        "/content/drive/MyDrive/Dataset/val_balanced_max.csv",
        "/content/drive/MyDrive/Dataset/test_balanced_max.csv",
    ),
)

class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for classification.

    Args:
        texts: Iterable of input texts. Every item is coerced with ``str()``
            so that non-string cells (e.g. the NaN floats ``pd.read_csv``
            yields for cells such as "NA" or "null") do not crash the
            tokenizer.
        labels: Iterable of class ids aligned with ``texts``. Every item is
            coerced with ``int()`` so ids read back from CSV as floats
            (``1.0``) are still valid targets.
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=True, padding='max_length',
            max_length=max_len, return_tensors='pt')``; its result must expose
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Length every sequence is truncated/padded to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length, or a label
            cannot be converted to ``int`` (e.g. NaN).
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        # Coerce once up front so a bad row fails here, not mid-epoch.
        self.texts = [str(text) for text in texts]
        self.labels = [int(label) for label in labels]
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of (text, label) pairs."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize item ``idx`` and return its tensors plus the label.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (the tokenizer's
            tensors with the leading batch dimension of 1 removed) and
            ``'labels'`` (a scalar ``torch.long`` tensor).
        """
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )
        return {
            # return_tensors='pt' adds a batch dimension of 1; drop it so the
            # DataLoader can stack items into (batch, max_len).
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Pretrained checkpoint to fine-tune; the classification head gets 3 outputs.
MODEL_NAME = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3).to(device)

# One dataset per split, all sharing the same tokenizer.
train_dataset, val_dataset, test_dataset = (
    SentimentDataset(split["text"], split["label"], tokenizer)
    for split in (train_df, val_df, test_df)
)

# Only the training batches are shuffled; validation/test keep file order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
val_loader, test_loader = (
    DataLoader(held_out, batch_size=16) for held_out in (val_dataset, test_dataset)
)


import numpy as np
# Per-class weight = total / (number_of_classes * class_count), computed from
# the label counts of the training split.
classes, counts = np.unique(train_df["label"], return_counts=True)
total = len(train_df)
n_classes = len(classes)
weights = []
for class_count in counts:
    weights.append(total / (n_classes * class_count))
class_weights = torch.tensor(weights, dtype=torch.float).to(device)

# Weighted cross-entropy on the logits, optimised with AdamW.
loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)
optimizer = AdamW(model.parameters(), lr=5e-5)


# Fine-tune for a fixed number of epochs. After every epoch the model is also
# scored on the validation loader, so overfitting is visible before the test
# set is touched.
epochs = 3
for epoch in range(epochs):
    # --- training pass ---
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # The loss is computed outside the model so the class-weighted
        # loss_fn is used.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    print(f"Epoch {epoch+1}/{epochs} | Loss: {total_loss/len(train_loader):.4f}")

    # --- validation pass (no gradient tracking, dropout disabled) ---
    model.eval()
    val_loss = 0.0
    val_correct = 0
    val_seen = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            val_logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
            val_loss += loss_fn(val_logits, labels).item()
            val_correct += (val_logits.argmax(dim=1) == labels).sum().item()
            val_seen += labels.size(0)
    # max(..., 1) keeps the report well-defined for an empty validation set.
    print(
        f"Epoch {epoch+1}/{epochs} | Val loss: {val_loss/max(len(val_loader), 1):.4f}"
        f" | Val acc: {val_correct/max(val_seen, 1):.4f}"
    )


# Score the fine-tuned model on the held-out test loader.
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for batch in test_loader:
        labels = batch["labels"].to(device)
        outputs = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
        )
        # Predicted class = index of the largest logit in each row.
        preds = outputs.logits.argmax(dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Display names for class ids 0, 1, 2.
class_names = ["Negative", "Neutral", "Positive"]

print("\nðŸ§¾ Classification Report:")
print(classification_report(all_labels, all_preds, target_names=class_names))

print("\nðŸ“Š Confusion Matrix:")
print(confusion_matrix(all_labels, all_preds))

# Store the fine-tuned weights and the tokenizer files in one directory so
# they can be reloaded together with from_pretrained().
model_save_path = "/content/drive/MyDrive/Dataset/xlm_roberta_sentiment_model"
for artifact in (model, tokenizer):
    artifact.save_pretrained(model_save_path)
print(f"\nâœ… Model baÅŸarÄ±yla kaydedildi: {model_save_path}")


Device: cuda


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3 | Loss: 0.4064
Epoch 2/3 | Loss: 0.2423
Epoch 3/3 | Loss: 0.2089

ðŸ§¾ Classification Report:
              precision    recall  f1-score   support

    Negative       0.89      0.85      0.87      1095
     Neutral       0.97      1.00      0.99      1157
    Positive       0.88      0.88      0.88      1143

    accuracy                           0.91      3395
   macro avg       0.91      0.91      0.91      3395
weighted avg       0.91      0.91      0.91      3395


ðŸ“Š Confusion Matrix:
[[ 936   18  141]
 [   0 1156    1]
 [ 118   14 1011]]

âœ… Model baÅŸarÄ±yla kaydedildi: /content/drive/MyDrive/Dataset/xlm_roberta_sentiment_model


In [None]:
!pip install gradio transformers --quiet
import gradio as gr
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer


# Load from the directory the training cell saves to
# (.../MyDrive/Dataset/xlm_roberta_sentiment_model). The previous path omitted
# the "Dataset" folder, so a fresh run could not find the model just trained.
model_path = "/content/drive/MyDrive/Dataset/xlm_roberta_sentiment_model"
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Serve from the GPU when available; eval() disables dropout for inference.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.eval()


# Class id -> display name; the inverse of the label2id mapping used to build
# the training labels.
id2label = {0:"Negative", 1:"Neutral", 2:"Positive"}


def predict_sentiment(text):
    """Classify one piece of text and return its sentiment name.

    Args:
        text: The string to classify.

    Returns:
        The ``id2label`` entry ("Negative", "Neutral" or "Positive") for the
        class with the largest logit.
    """
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        # Same token limit SentimentDataset applies (max_len=128), so inputs
        # are preprocessed the way the model was trained and evaluated.
        max_length=128,
        padding=True,
    ).to(device)
    # Inference only: without no_grad() every request would build an autograd
    # graph and hold on to activations for no reason.
    with torch.no_grad():
        outputs = model(**inputs)
    pred = torch.argmax(outputs.logits, dim=1).item()
    return id2label[pred]

# Two-line text box feeding predict_sentiment; the returned label is shown as
# plain text.
text_input = gr.Textbox(lines=2, placeholder="Bir metin girin...")

demo = gr.Interface(
    fn=predict_sentiment,
    inputs=text_input,
    outputs="text",
    title="Sentiment Analysis (Roberta)",
    description="Bu model Roberta kullanarak metni Pozitif, Negatif veya NÃ¶tr olarak sÄ±nÄ±flandÄ±rÄ±r.",
)

# share=True requests a public gradio.live URL in addition to the local one.
demo.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://28742b524698b4439c.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


