<a href="https://colab.research.google.com/github/usugunawan15/hatespeech-detection/blob/main/hate_speech_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Hate Speech Detection**
Usu Gunawan | DS0223

**Import Library**

In [135]:
! pip3 install nltk
! pip3 install Sastrawi
import Sastrawi
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [136]:
import pandas as pd
import seaborn as sns
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import re
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer, accuracy_score, f1_score
from sklearn.metrics import confusion_matrix, roc_auc_score, recall_score, precision_score

In [137]:
import requests
import io

response = requests.get('https://raw.githubusercontent.com/usugunawan15/hatespeech-detection/main/dataset_hatespeech.txt')
data = io.StringIO(response.text)

In [138]:
df = pd.read_csv(data,sep='\t')
df.head()

Unnamed: 0,Label,Tweet
0,Non_HS,RT @spardaxyz: Fadli Zon Minta Mendagri Segera Menonaktifkan Ahok Jadi Gubernur DKI https:\/\/t.co\/KH5vIRwPdO
1,Non_HS,RT @baguscondromowo: Mereka terus melukai aksi dalam rangka memenjarakan Ahok atau Ahok gagal dalam Pilkada.
2,Non_HS,Sylvi: bagaimana gurbernur melakukan kekerasan perempuan? Buktinya banyak ibu2 mau foto bareng #DebatFinalPilkadaJKT
3,Non_HS,"Ahmad Dhani Tak Puas Debat Pilkada, Masalah Jalan di Bekasi Belum Sempat Terungkap https:\/\/t.co\/m3HopPLUID\u2026"
4,Non_HS,RT @lisdaulay28: Waspada KTP palsu.....kawal PILKADA https:\/\/t.co\/OOoERQV4SM


In [139]:
df['Label'] = df['Label'].map({
    "Non_HS" : 0,
    "HS" : 1
})
df.head()

Unnamed: 0,Label,Tweet
0,0,RT @spardaxyz: Fadli Zon Minta Mendagri Segera Menonaktifkan Ahok Jadi Gubernur DKI https:\/\/t.co\/KH5vIRwPdO
1,0,RT @baguscondromowo: Mereka terus melukai aksi dalam rangka memenjarakan Ahok atau Ahok gagal dalam Pilkada.
2,0,Sylvi: bagaimana gurbernur melakukan kekerasan perempuan? Buktinya banyak ibu2 mau foto bareng #DebatFinalPilkadaJKT
3,0,"Ahmad Dhani Tak Puas Debat Pilkada, Masalah Jalan di Bekasi Belum Sempat Terungkap https:\/\/t.co\/m3HopPLUID\u2026"
4,0,RT @lisdaulay28: Waspada KTP palsu.....kawal PILKADA https:\/\/t.co\/OOoERQV4SM


In [140]:
df['Label'].value_counts()

0    453
1    260
Name: Label, dtype: int64

**Text Cleaning**

In [141]:
#Melakukan penghapusan data yang tidak relevan
casefold = []
for i in df['Tweet'] :
  try :
    removed = i.replace("RT", "") # Menghapus retweet
    removed = re.sub(r'@[A-Za-z0-9_]+', '', removed) # Menghapus username twitter
    removed = re.sub(r'https?:\\[^ ]+', '', removed) #Menghapus alamat website
    removed = re.sub(r'[\W+|_]+', ' ', removed).lower() #menghapus karakter spesial kecuali spasi

    casefold.append(removed)
  except :
    continue

In [142]:
clean = pd.DataFrame({'clean_text':casefold})
clean.head()

Unnamed: 0,clean_text
0,fadli zon minta mendagri segera menonaktifkan ahok jadi gubernur dki
1,mereka terus melukai aksi dalam rangka memenjarakan ahok atau ahok gagal dalam pilkada
2,sylvi bagaimana gurbernur melakukan kekerasan perempuan buktinya banyak ibu2 mau foto bareng debatfinalpilkadajkt
3,ahmad dhani tak puas debat pilkada masalah jalan di bekasi belum sempat terungkap
4,waspada ktp palsu kawal pilkada


**Stemming**

In [143]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
# factory = StopWordRemoverFactory()
# stopword = factory.create_stop_word_remover()
factory = StemmerFactory()
stemmer = factory.create_stemmer()
stemm = []
for i in casefold :
  hasil = stemmer.stem(i)
  stemm.append(hasil)

In [144]:
stem =pd.DataFrame({'stem': stemm})
stem.head()

Unnamed: 0,stem
0,fadli zon minta mendagri segera nonaktif ahok jadi gubernur dki
1,mereka terus luka aksi dalam rangka penjara ahok atau ahok gagal dalam pilkada
2,sylvi bagaimana gurbernur laku keras perempuan bukti banyak ibu2 mau foto bareng debatfinalpilkadajkt
3,ahmad dhani tak puas debat pilkada masalah jalan di bekas belum sempat ungkap
4,waspada ktp palsu kawal pilkada


**Tokenization**

In [145]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
clean_text = []
listStopword =  set(stopwords.words('indonesian'))
tokenized = []
for i in stemm :
  tokens = nltk.tokenize.word_tokenize(i)
  tokenized.append(tokens)

In [146]:
token = pd.DataFrame({'tokenized':tokenized})
token.head()

Unnamed: 0,tokenized
0,"[fadli, zon, minta, mendagri, segera, nonaktif, ahok, jadi, gubernur, dki]"
1,"[mereka, terus, luka, aksi, dalam, rangka, penjara, ahok, atau, ahok, gagal, dalam, pilkada]"
2,"[sylvi, bagaimana, gurbernur, laku, keras, perempuan, bukti, banyak, ibu2, mau, foto, bareng, debatfinalpilkadajkt]"
3,"[ahmad, dhani, tak, puas, debat, pilkada, masalah, jalan, di, bekas, belum, sempat, ungkap]"
4,"[waspada, ktp, palsu, kawal, pilkada]"


**Remove Stop Words**

In [147]:
def remove_stopwords(txt_tokenized):
    txt_clear = [w for w in txt_tokenized if not w in listStopword]
    return txt_clear

df['no_sw'] = token['tokenized'].apply(lambda x: remove_stopwords(x))

In [148]:
df

Unnamed: 0,Label,Tweet,no_sw
0,0,RT @spardaxyz: Fadli Zon Minta Mendagri Segera Menonaktifkan Ahok Jadi Gubernur DKI https:\/\/t.co\/KH5vIRwPdO,"[fadli, zon, mendagri, nonaktif, ahok, gubernur, dki]"
1,0,RT @baguscondromowo: Mereka terus melukai aksi dalam rangka memenjarakan Ahok atau Ahok gagal dalam Pilkada.,"[luka, aksi, rangka, penjara, ahok, ahok, gagal, pilkada]"
2,0,Sylvi: bagaimana gurbernur melakukan kekerasan perempuan? Buktinya banyak ibu2 mau foto bareng #DebatFinalPilkadaJKT,"[sylvi, gurbernur, laku, keras, perempuan, bukti, ibu2, foto, bareng, debatfinalpilkadajkt]"
3,0,"Ahmad Dhani Tak Puas Debat Pilkada, Masalah Jalan di Bekasi Belum Sempat Terungkap https:\/\/t.co\/m3HopPLUID\u2026","[ahmad, dhani, puas, debat, pilkada, jalan, bekas]"
4,0,RT @lisdaulay28: Waspada KTP palsu.....kawal PILKADA https:\/\/t.co\/OOoERQV4SM,"[waspada, ktp, palsu, kawal, pilkada]"
...,...,...,...
708,1,Muka Si BABi Ahok Tuh Yg Mirip SERBET Lantai....@basuki_btp...,"[muka, si, babi, ahok, tuh, yg, serbet, lantai]"
709,1,"Betul bang hancurkan merka bang, musnahkan china babi dibumi pertiwi indonesia, berkedok reklamasi itu ahok","[bang, hancur, merka, bang, musnah, china, babi, bumi, pertiwi, indonesia, kedok, reklamasi, ahok]"
710,1,"Sapa Yg bilang Ahok anti korupsi!?, klo grombolannyA NGOMONG gtu wajar, AHOK ITU MAFIA KORUPTOR SANG PENISTA AGAMA YG DILINDUNGI ISTANA PKI","[sapa, yg, bilang, ahok, anti, korupsi, klo, grombolannya, ngomong, gtu, wajar, ahok, mafia, koruptor, sang, nista, agama, yg, lindung, istana, pki]"
711,1,"Gw juga ngimpi SENTILIN BIJI BABI AHOK, pcetar Pcetar, langsung kejang2 die, ngadu ke Jkw, Jkw: makanye sunat dulu, bru NGOMONG Almaidah.","[gw, ngimpi, sentilin, biji, babi, ahok, pcetar, pcetar, langsung, kejang2, die, ngadu, jkw, jkw, makanye, sunat, bru, ngomong, almaidah]"


**Clean Text**

In [150]:
clean_text = []
for kata in df['no_sw']:
  joined = ' '.join(kata)
  clean_text.append(joined)

In [127]:
clean_text = pd.DataFrame({'clean':clean_text})
clean_text.head()

Unnamed: 0,clean
0,fadli zon mendagri nonaktif ahok gubernur dki
1,luka aksi rangka penjara ahok ahok gagal pilkada
2,sylvi gurbernur laku keras perempuan bukti ibu2 foto bareng debatfinalpilkadajkt
3,ahmad dhani puas debat pilkada jalan bekas
4,waspada ktp palsu kawal pilkada


**Split Data**

In [153]:
df_train, df_test = train_test_split(df, test_size=0.2)

**Feature Extraction**

In [154]:
Tfidf_vect = TfidfVectorizer()
Tfidf_vect.fit(df['Tweet'])
Train_X_Tfidf = Tfidf_vect.transform(df_train['Tweet'])
Test_X_Tfidf = Tfidf_vect.transform(df_test['Tweet'])
y_train = df_train['Label'].to_numpy()
y_test = df_test['Label'].to_numpy()

**Klasifikasi**

In [155]:
def klasifikasi(df):
  classifier = SVC(kernel=df, random_state=1)
  classifier.fit(Train_X_Tfidf, y_train)

  y_pred = classifier.predict(Test_X_Tfidf)
 
  print('result = ')
  print('akurasi  =',accuracy_score(y_test, y_pred)*100,'%')

  print('presisi  =',precision_score(y_test, y_pred, average='macro')*100,'%')

  print('recall   =',recall_score(y_test, y_pred, average='macro')*100,'%')

  print('f1 score =',f1_score(y_test, y_pred, average='macro')*100,'%')

In [156]:
klasifikasi('sigmoid')

result = 
akurasi  = 86.01398601398601 %
presisi  = 85.82018173122907 %
recall   = 82.5227963525836 %
f1 score = 83.78684807256236 %


In [44]:
klasifikasi('rbf')

result = 
akurasi  = 78.32167832167832 %
presisi  = 85.32608695652173 %
recall   = 70.6043956043956 %
f1 score = 71.9873617693523 %


In [45]:
klasifikasi('linear')

result = 
akurasi  = 84.61538461538461 %
presisi  = 86.95482866043614 %
recall   = 80.08241758241759 %
f1 score = 81.94444444444444 %


In [157]:
classifier = SVC(kernel="linear", random_state=1)
classifier.fit(Train_X_Tfidf, y_train)
y_pred = classifier.predict(Test_X_Tfidf)

In [159]:
df_pred = df_test.copy()
df_pred['prediksi'] = y_pred
df_pred['true_prediction'] = np.where(df_pred['prediksi'] == df_test['Label'], True, False)
df_pred.to_excel("prediksi.xlsx")

In [92]:
pd.set_option('display.max_colwidth', -1)
df_diff = df_pred.loc[df_pred['true_prediction'] == True]
display(df_diff)

Unnamed: 0,Label,Tweet,no_sw,prediksi,true_prediction
407,0,Pak saya setuju kalo pak jadi gubenur karena kerja pak sangat bagus kerena kerja pak basuki tegas keluarga kami mendukung pak basuki,"[tuju, kalo, gubenur, kerja, bagus, rena, kerja, basuki, keluarga, dukung, basuki]",Non_HS,True
93,0,Gimana ya Tim Ahok kok makin disudutkan(?) Mungkin karena ini final kali y,"[gimana, ya, tim, ahok, sudut, final, kali, y]",Non_HS,True
315,0,"Terimakasih Pak @basukibtp untuk 5 tahunnya, 3 tahun sbg wakil gubernur & 2 tahun sbg gubernur!","[terimakasih, 5, 3, sbg, wakil, gubernur, 2, sbg, gubernur]",Non_HS,True
605,1,"@AHMADDHANIPRAST aku katakan anjing kpd ahmad dani, boleh....aku katakan babi kpd ahmad dhani, boleh...","[anjing, kpd, ahmad, babi, kpd, ahmad, dhani]",HS,True
331,0,Semangat terus. Maju terus di dalam TUHAN. God Bless you more and forever bpk ahok dan family,"[semangat, maju, tuhan, god, bless, you, more, and, forever, bpk, ahok, family]",Non_HS,True
...,...,...,...,...,...
322,0,Terima kasih pak ahok pak djarot kami bangga perna di pimpin sama bapak berdua...tetap semangat...sukses terus pak,"[terima, kasih, ahok, djarot, bangga, perna, pimpin, semangat, sukses]",Non_HS,True
401,0,"Bapak ahok sosok yang saya banggakan, terimakasih pak sudah merubah jakarta.","[ahok, sosok, bangga, terimakasih, rubah, jakarta]",Non_HS,True
255,0,19 April adalah saat yang penting bagi warga Jakarta untuk menentukan masa depan Ibukota,"[19, april, warga, jakarta, ibukota]",Non_HS,True
562,1,@budimandjatmiko ahok si kutil babi kembali ke rumah lembang kenapa jakarta jadi banjir bud?,"[ahok, si, kutil, babi, rumah, lembang, jakarta, banjir, bud]",HS,True


**Analyze**

In [160]:
df_result = df_pred.copy()

In [161]:
df_result['temp_list'] = df_result['Tweet'].apply(lambda x:str(x).split())

In [162]:
HS_sent = df_result[df_result['Label']== 0]
NON_HS_sent = df_result[df_result['Label']== 1]

In [164]:
from collections import Counter

#MosT common hatespeech words
top = Counter([item for sublist in HS_sent['temp_list'] for item in sublist])
temp_positive = pd.DataFrame(top.most_common(10))
temp_positive.columns = ['Common_words','count']
temp_positive.style.background_gradient(cmap='Greens')

Unnamed: 0,Common_words,count
0,dan,22
1,di,19
2,RT,19
3,yg,18
4,Ahok,17
5,Pak,15
6,#DebatFinalPilkadaJKT,14
7,pak,12
8,untuk,11
9,yang,10


In [166]:
#MosT common non HS words
top = Counter([item for sublist in NON_HS_sent['temp_list'] for item in sublist])
temp_negative = pd.DataFrame(top.most_common(10))
temp_negative = temp_negative.iloc[1:,:]
temp_negative.columns = ['Common_words','count']
temp_negative.style.background_gradient(cmap='Reds')

Unnamed: 0,Common_words,count
1,penista,10
2,yg,9
3,itu,8
4,RT,7
5,dan,7
6,gak,6
7,Ahok,6
8,si,6
9,agama,6


In [167]:
raw_text = [word for word_list in df_result['temp_list'] for word in word_list]

In [168]:
def words_unique(Label,numwords,raw_words):

    allother = []
    for item in df_result[df_result.Label != Label]['temp_list']:
        for word in item:
            allother .append(word)
    allother  = list(set(allother ))
    
    specificnonly = [x for x in raw_text if x not in allother]
    
    mycounter = Counter()
    
    for item in df_result[df_result.Label == Label]['temp_list']:
        for word in item:
            mycounter[word] += 1
    keep = list(specificnonly)
    
    for word in list(mycounter):
        if word not in keep:
            del mycounter[word]
    
    Unique_words = pd.DataFrame(mycounter.most_common(numwords), columns = ['words','count'])
    
    return Unique_words

In [171]:
Unique_Positive= words_unique(1, 10, raw_text)
print("The top 20 unique words in Hatespeech Tweets are:")
Unique_Positive.style.background_gradient(cmap='Greens')

The top 20 unique words in Hatespeech Tweets are:


Unnamed: 0,words,count
0,babi,5
1,loh,4
2,kpd,4
3,kafir,4
4,tau,3
5,bisanya,3
6,tuh,3
7,partai,3
8,SANGAT,3
9,DAN,3


In [170]:
Unique_Negative= words_unique(0, 10, raw_text)
print("The top 10 unique words in Non Hatespeech Tweets are:")
Unique_Negative.style.background_gradient(cmap='Reds')

The top 10 unique words in Non Hatespeech Tweets are:


Unnamed: 0,words,count
0,Pak,15
1,pak,12
2,saya,8
3,#MataNajwaDebatJakarta,7
4,DKI,6
5,3,6
6,sudah,6
7,Jakarta,6
8,Jangan,5
9,Pilkada,5
