In [1]:
import pandas as pd
import numpy as np
import nltk
import re 
import string

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

from nltk.util import pr
from nltk.corpus import stopwords

In [2]:
#Veri setimizi cekelim
data = pd.read_csv("/kaggle/input/hate-speech-and-offensive-language-dataset/labeled_data.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [3]:
#Bizim icin kelime koklerini bulacak olan fonksiyon
stemmer = nltk.SnowballStemmer("english")

In [4]:
#Labellari sayisal verilere cevirelim
data["labels"] = data["class"].map({0:"Hate Speech", 1:"Offensive Language", 2: "Normal"})
data.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet,labels
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...,Normal
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,Offensive Language
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,Offensive Language
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,Offensive Language
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,Offensive Language


In [5]:
#Veri setinin projemizle ilgili olmayan kisimlarini temizleyelim
data=data[["tweet","labels"]]
data.head()

Unnamed: 0,tweet,labels
0,!!! RT @mayasolovely: As a woman you shouldn't...,Normal
1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,Offensive Language
2,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,Offensive Language
3,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,Offensive Language
4,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,Offensive Language


In [6]:
#Metin on isleme fonksiyonu
def clean(text):
    text=str(text).lower()
    text=re.sub('\[.*?\]','',text)
    text=re.sub('https?://\S+|www\.\S+','',text)
    text=re.sub('<.*?>+','',text)
    text=re.sub('[%s]'%re.escape(string.punctuation),'',text)
    text=re.sub('\n','',text)
    text=re.sub('\w*\d\w*','',text)
    all_stopwords=nltk.corpus.stopwords.words('english')
    text=[word for word in text.split(' ') if word not in all_stopwords]
    text=" ".join(text)
    text=[stemmer.stem(word) for word in text.split(' ')]
    text=" ".join(text)
    return text

In [7]:
#Verimizin tumune on isleme adimlarini uygulayalim
data["tweet"]=data["tweet"].apply(clean)

In [8]:
# x = tweetler, y = labellar
x = np.array(data["tweet"])
y = np.array(data["labels"])

# Text verimizi sayisal degerlere donusturecek fonksiyonumuzu yazalim
cv = CountVectorizer()
X = cv.fit_transform(x)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Classifier olarak Decision Tree tanimlayalim
clf = DecisionTreeClassifier()

In [9]:
# Modelimizi fit edelim
clf.fit(X_train, y_train)

In [10]:
# Agacimizin derinligi
clf.get_depth()

362

In [11]:
# Agacimizin yaprak adedi
clf.get_n_leaves()

1994

In [12]:
# Kendi ornegimizi yapalim
speech = "damn you"
data = cv.transform([speech]).toarray()
clf.predict(data)

array(['Offensive Language'], dtype=object)

In [13]:
speech = "I am happy"
data = cv.transform([speech]).toarray()
clf.predict(data)

array(['Normal'], dtype=object)

In [14]:
speech = "kill yourself"
data = cv.transform([speech]).toarray()
clf.predict(data)

array(['Hate Speech'], dtype=object)

In [15]:
# Modelimizin skorlarini elde edelim

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

y_pred = clf.predict(X_test)
accuracy = clf.score(X, y, sample_weight=None)
print("Accuracy:", accuracy)

precision = precision_score(y_test, y_pred, average=None)
print("Precision:", precision[2])

recall = recall_score(y_test, y_pred, average=None)
print("Recall (Sensitivity):", recall[2])

f1 = f1_score(y_test, y_pred, average=None)
print("F1-Score:", f1[2])

Accuracy: 0.9594883589557358
Precision: 0.9215288611544462
Recall (Sensitivity): 0.9324388318863457
F1-Score: 0.9269517457826599
