In [20]:
import pandas as pd

# D1.txt dosyasından spam mesajlarını oku
with open('spam/D1.txt', 'r', encoding='utf-8') as file:
    d1_data = file.readlines()

# D2.txt dosyasından etiket isimlerini oku
with open('spam/D2.txt', 'r', encoding='utf-8') as file:
    d2_data = file.readlines()

# Etiket isimlerini 0 (ham) veya 1 (spam) olarak dönüştür
labels = [0 if label.strip() == 'spam' else 1 for label in d2_data]

# Yeni DataFrame'i oluştur
df = pd.DataFrame({'Mesaj': d1_data, 'Etiket': labels})

# Oluşturulan DataFrame'i spam.xlsx dosyasına kaydet
df.to_excel('spam.xlsx', index=False)


In [21]:
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer

# DistilBERT modelini yükle
model_name = "distilbert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_name,from_tf=True)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Metin sınıflandırma pipeline'ini oluştur
classifier_pipeline = pipeline('text-classification', model=model, tokenizer=tokenizer)


import pandas as pd

# spam.xlsx dosyasından veriyi oku
df = pd.read_excel('spam.xlsx')


# Verileri sınıflandır ve sonuçları DataFrame'e ekle
results = classifier_pipeline(df['Mesaj'].tolist())
df['Spam Tahmini'] = [int(result['label'].split('_')[-1]) for result in results]  
df['Spam Skoru'] = [result['score'] for result in results]

# Başarı oranını hesapla
correct_predictions = df[df['Etiket'] == df['Spam Tahmini']].shape[0]
total_samples = df.shape[0]
accuracy = correct_predictions / total_samples * 100

# DataFrame'i ekrana yazdır
print(df[['Mesaj', 'Etiket', 'Spam Tahmini', 'Spam Skoru']])
print(f"\nSpam Sınıflandırma Başarı Oranı: {accuracy:.2f}%")


All TF 2.0 model weights were used when initializing DistilBertForSequenceClassification.

All the weights of DistilBertForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use DistilBertForSequenceClassification for predictions without further training.


                                                  Mesaj  Etiket  Spam Tahmini  \
0     Urgent!call09061749602fromLandlineYourcomplime...       0             1   
1     449071512431URGENT!Thisisthe2ndattempttocontac...       0             0   
2     FREEfor1stweek!No1Nokiatone4urmobeveryweekjust...       0             1   
3     Urgent!call09066612661fromlandlineYourcompleme...       0             1   
4     WINNER!!Asavaluednetworkcustomeryouhavebeensel...       0             0   
...                                                 ...     ...           ...   
1319  GreatNews!CallFREEFONE08006344447toclaimyourgu...       0             1   
1320  YouhaveWONaguaranteed�1000cashora�2000prizeToc...       0             0   
1321                 08714712388between10am7pmCost10p\n       0             1   
1322  YES!Theonlyplaceintowntomeetexcitingadultsingl...       0             1   
1323  CongratulationsThankstoagoodfriendUhaveWONthe�...       0             0   

      Spam Skoru  
0       

In [2]:
import torch
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, AdamW
from torch.utils.data import DataLoader, TensorDataset, random_split
import pandas as pd
from sklearn.model_selection import train_test_split

# Veri kümesini yükleyin
df = pd.read_excel('spam.xlsx')

# Veri kümesini eğitim ve test olarak bölün
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)

# Tokenizer ve modeli yükleyin
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)  # 2 sınıf var: spam ve ham

# Verileri tokenleştirin ve DataLoader oluşturun
def tokenize_data(data, tokenizer, max_length=128):
    tokenized_data = tokenizer(data['Mesaj'].tolist(), truncation=True, padding=True, max_length=max_length, return_tensors='pt')
    labels = torch.tensor(data['Etiket'].tolist())
    return TensorDataset(tokenized_data['input_ids'], tokenized_data['attention_mask'], labels)

train_dataset = tokenize_data(train_df, tokenizer)
test_dataset = tokenize_data(test_df, tokenizer)

# DataLoader'ları oluşturun
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Modeli eğitin
optimizer = AdamW(model.parameters(), lr=5e-5)
epochs = 3
print("Eğitim Bölümü")
for epoch in range(epochs):
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        inputs, attention_mask, labels = batch
        outputs = model(inputs, attention_mask=attention_mask)[0]
        loss = torch.nn.functional.cross_entropy(outputs, labels)
        loss.backward()
        optimizer.step()

# Modelin performansını test edin
model.eval()
correct = 0
total = 0
print("Test Bölümü")
with torch.no_grad():
    for batch in test_loader:
        inputs, attention_mask, labels = batch
        outputs = model(inputs, attention_mask=attention_mask)[0]
        predicted = torch.argmax(outputs, dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f'Test Accuracy: {accuracy * 100:.2f}%')


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Eğitim Bölümü
Test Bölümü
Test Accuracy: 94.47%
