Import Library

In [13]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
import tensorflow as tf

from tensorflow.keras.layers import Dense, Embedding, Activation, Dropout
from tensorflow.keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split



Read CSV File

In [14]:
df = pd.read_csv('tweet.csv')
df.head()

Unnamed: 0,Abusive,Tweet
0,1,cowok usaha lacak perhati gue lantas remeh per...
1,1,telat tau edan sarap gue gaul cigax jifla cal ...
2,0,41 kadang pikir percaya tuhan jatuh kali kali ...
3,0,ku tau mata sipit lihat
4,1,kaum cebong kafir lihat dongok dungu haha


Drop Missing Rows

In [15]:
# drop missing rows
df.dropna(axis=0, inplace=True)

Print Lenght of Data

In [16]:
text = df["Tweet"].tolist()
print(len(text))

13121


Make it to Categorical

In [17]:
y = df["Abusive"]
y = to_categorical(y)
print(y)
#0 itu negatif, 1 itu positif

[[0. 1.]
 [0. 1.]
 [1. 0.]
 ...
 [1. 0.]
 [1. 0.]
 [0. 1.]]


Count Data Each Categorical

In [7]:
df["Abusive"].value_counts()

0    8088
1    5033
Name: Abusive, dtype: int64

Do Tokenizer

In [19]:
token = Tokenizer()
token.fit_on_texts(text)

In [18]:
# if you want to print every index word
# token.index_word

In [10]:
vocab = len(token.index_word)+1
print(vocab)

13268


Test Text to Tokenize Index

In [11]:
x = ['sinting kau ya']
token.texts_to_sequences(x)

[[558, 1035, 8]]

Encode Every Each Tweet Dataset

In [12]:
encode_text = token.texts_to_sequences(text)
# if you want to print every tokenizer tweet
# print(encode_text)

Do Padding Every Encode Tweet Dataset

In [None]:
max_kata = 100
x=pad_sequences(encode_text,maxlen = max_kata, padding="post")
print(x)

[[ 324  161 3546 ...    0    0    0]
 [1908   49  464 ...    0    0    0]
 [3547  598  101 ...    0    0    0]
 ...
 [  66   66  376 ...    0    0    0]
 [ 111 2819  291 ...    0    0    0]
 [ 569  325    8 ...    0    0    0]]


# **80 20 ratio**
Performing learning for 80% data training and 20% data testing.

Split data test and test test

In [None]:
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=1, test_size = 0.2, stratify=y)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=1)

Change to Data to Array

In [None]:
x_train = np.asarray(x_train)
x_test = np.asarray(x_test)
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

Define Model

In [None]:
# define model
vec_size=300
model = tf.keras.Sequential()
model.add(Embedding(vocab,vec_size,input_length=max_kata))
model.add(Conv1D(64,8,activation="relu"))
model.add(MaxPooling1D(2))
model.add(Dropout(0.5))

model.add(Dense(32,activation="relu"))
model.add(Dropout(0.5))

model.add(Dense(16,activation="relu"))
model.add(GlobalMaxPooling1D())

model.add(Dense(2,activation="softmax"))

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 300)          3980400   
_________________________________________________________________
conv1d (Conv1D)              (None, 93, 64)            153664    
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 46, 64)            0         
_________________________________________________________________
dropout (Dropout)            (None, 46, 64)            0         
_________________________________________________________________
dense (Dense)                (None, 46, 32)            2080      
_________________________________________________________________
dropout_1 (Dropout)          (None, 46, 32)            0         
_________________________________________________________________
dense_1 (Dense)              (None, 46, 16)            5

In [None]:
from keras.metrics import Precision, Recall
model.compile(optimizer="adam",loss="categorical_crossentropy", metrics=['accuracy', Precision(), Recall()])

In [None]:
model.fit(x_train,y_train, epochs=10, validation_data =(x_test,y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f9fb8b34dd0>

Evaluate and print Accuracy

In [None]:
import keras.backend as K

def f1_score(precision, recall):
    ''' Function to calculate f1 score '''
    
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

In [None]:
# Evaluate model on the test set
loss, accuracy, precision, recall = model.evaluate(x_test, y_test, verbose=0)
# Print metrics
print('')
print('Accuracy  : {:.4f}'.format(accuracy))
print('Precision : {:.4f}'.format(precision))
print('Recall    : {:.4f}'.format(recall))
print('F1 Score  : {:.4f}'.format(f1_score(precision, recall)))


Accuracy  : 0.8971
Precision : 0.8971
Recall    : 0.8971
F1 Score  : 0.8971


Get Encode of Predict Data

In [None]:
def get_encode(x):
  x = token.texts_to_sequences(x)
  x = pad_sequences(x,maxlen = max_kata, padding = "post")
  return x

Get Sentiment Classesof Predict Data

In [None]:
def get_sentiment_classes(x):
  x = get_encode(x)
  predict_x=model.predict(x) 
  classes_x=np.argmax(predict_x,axis=1)
  sentiment_classes = ['tidak kasar','kasar']
  print('kata tersebut mengandung konotasi',sentiment_classes[classes_x[0]])

Predict Data 1

In [None]:
# untuk melakukan prediksi kata yang tidak kasar 
get_sentiment_classes(['ibu peri hari ini cantik banget ya'])

kata tersebut mengandung konotasi tidak kasar


Predict Data 2

In [None]:
# untuk melakukan prediksi kata yang kasar
get_sentiment_classes(['woi babi lo anjing'])

kata tersebut mengandung konotasi kasar


# **70 30 ratio**
Performing learning for 70% data training and 30% data testing. 


Split data test and test test

In [None]:
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=1, test_size = 0.3, stratify=y)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.3, random_state=1)

Change to Data to Array

In [None]:
x_train = np.asarray(x_train)
x_test = np.asarray(x_test)
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

Define Model

In [None]:
# define model
vec_size=300
model = tf.keras.Sequential()
model.add(Embedding(vocab,vec_size,input_length=max_kata))
model.add(Conv1D(64,8,activation="relu"))
model.add(MaxPooling1D(2))
model.add(Dropout(0.5))

model.add(Dense(32,activation="relu"))
model.add(Dropout(0.5))
model.add(Dense(16,activation="relu"))
model.add(GlobalMaxPooling1D())
model.add(Dense(2,activation="softmax"))

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 300)          3980400   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 93, 64)            153664    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 46, 64)            0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 46, 64)            0         
_________________________________________________________________
dense_3 (Dense)              (None, 46, 32)            2080      
_________________________________________________________________
dropout_3 (Dropout)          (None, 46, 32)            0         
_________________________________________________________________
dense_4 (Dense)              (None, 46, 16)           

In [None]:
from keras.metrics import Precision, Recall
model.compile(optimizer="adam",loss="categorical_crossentropy", metrics=['accuracy', Precision(), Recall()])

In [None]:
model.fit(x_train,y_train, epochs=10, validation_data =(x_test,y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f9fb89519d0>

Evaluate and print Accuracy

In [None]:
import keras.backend as K

def f1_score(precision, recall):
    ''' Function to calculate f1 score '''
    
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

In [None]:
# Evaluate model on the test set
loss, accuracy, precision, recall = model.evaluate(x_test, y_test, verbose=0)
# Print metrics
print('')
print('Accuracy  : {:.4f}'.format(accuracy))
print('Precision : {:.4f}'.format(precision))
print('Recall    : {:.4f}'.format(recall))
print('F1 Score  : {:.4f}'.format(f1_score(precision, recall)))


Accuracy  : 0.8984
Precision : 0.8984
Recall    : 0.8984
F1 Score  : 0.8984


Get Encode of Predict Data

In [None]:
def get_encode(x):
  x = token.texts_to_sequences(x)
  x = pad_sequences(x,maxlen = max_kata, padding = "post")
  return x

Get Sentiment Classesof Predict Data

In [None]:
def get_sentiment_classes(x):
  x = get_encode(x)
  predict_x=model.predict(x) 
  classes_x=np.argmax(predict_x,axis=1)
  sentiment_classes = ['tidak kasar','kasar']
  print('kata tersebut mengandung konotasi',sentiment_classes[classes_x[0]])

Predict Data 1

In [None]:
# untuk melakukan prediksi kata yang tidak kasar 
get_sentiment_classes(['ibu peri hari ini cantik banget ya'])

kata tersebut mengandung konotasi tidak kasar


Predict Data 2

In [None]:
# untuk melakukan prediksi kata yang kasar
get_sentiment_classes(['woi babi lo anjing'])

kata tersebut mengandung konotasi kasar


# **60 40 ratio**
Performing learning for 60% data training and 40% data testing.

Split data test and test test

In [None]:
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=1, test_size = 0.4, stratify=y)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.4, random_state=1)

Change to Data to Array

In [None]:
x_train = np.asarray(x_train)
x_test = np.asarray(x_test)
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

Define Model

In [None]:
# define model
vec_size=300
model = tf.keras.Sequential()
model.add(Embedding(vocab,vec_size,input_length=max_kata))
model.add(Conv1D(64,8,activation="relu"))
model.add(MaxPooling1D(2))
model.add(Dropout(0.5))

model.add(Dense(32,activation="relu"))
model.add(Dropout(0.5))
model.add(Dense(16,activation="relu"))
model.add(GlobalMaxPooling1D())
model.add(Dense(2,activation="softmax"))

model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 100, 300)          3980400   
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 93, 64)            153664    
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 46, 64)            0         
_________________________________________________________________
dropout_4 (Dropout)          (None, 46, 64)            0         
_________________________________________________________________
dense_6 (Dense)              (None, 46, 32)            2080      
_________________________________________________________________
dropout_5 (Dropout)          (None, 46, 32)            0         
_________________________________________________________________
dense_7 (Dense)              (None, 46, 16)           

In [None]:
from keras.metrics import Precision, Recall
model.compile(optimizer="adam",loss="categorical_crossentropy", metrics=['accuracy', Precision(), Recall()])

In [None]:
model.fit(x_train,y_train, epochs=10, validation_data =(x_test,y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f9fb8497c50>

Evaluate and print Accuracy

In [None]:
import keras.backend as K

def f1_score(precision, recall):
    ''' Function to calculate f1 score '''
    
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

In [None]:
# Evaluate model on the test set
loss, accuracy, precision, recall = model.evaluate(x_test, y_test, verbose=0)
# Print metrics
print('')
print('Accuracy  : {:.4f}'.format(accuracy))
print('Precision : {:.4f}'.format(precision))
print('Recall    : {:.4f}'.format(recall))
print('F1 Score  : {:.4f}'.format(f1_score(precision, recall)))


Accuracy  : 0.8964
Precision : 0.8964
Recall    : 0.8964
F1 Score  : 0.8964


Get Encode of Predict Data

In [None]:
def get_encode(x):
  x = token.texts_to_sequences(x)
  x = pad_sequences(x,maxlen = max_kata, padding = "post")
  return x

Get Sentiment Classesof Predict Data

In [None]:
def get_sentiment_classes(x):
  x = get_encode(x)
  predict_x=model.predict(x) 
  classes_x=np.argmax(predict_x,axis=1)
  sentiment_classes = ['tidak kasar','kasar']
  print('kata tersebut mengandung konotasi',sentiment_classes[classes_x[0]])

Predict Data 1

In [None]:
# untuk melakukan prediksi kata yang tidak kasar 
get_sentiment_classes(['ibu peri hari ini cantik banget ya'])

kata tersebut mengandung konotasi tidak kasar


Predict Data 2

In [None]:
# untuk melakukan prediksi kata yang kasar
get_sentiment_classes(['woi babi lo anjing'])

kata tersebut mengandung konotasi kasar
