Import Library

In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
import tensorflow as tf

from tensorflow.keras.layers import Dense, Embedding, Activation, Dropout
from tensorflow.keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split



Read CSV File

In [2]:
df = pd.read_csv('tweet.csv')
df.head()

Unnamed: 0,Abusive,Tweet
0,1,cowok usaha lacak perhati gue lantas remeh per...
1,1,telat tau edan sarap gue gaul cigax jifla cal ...
2,0,41 kadang pikir percaya tuhan jatuh kali kali ...
3,0,ku tau mata sipit lihat
4,1,kaum cebong kafir lihat dongok dungu haha


Drop Missing Rows

In [3]:
# drop missing rows
df.dropna(axis=0, inplace=True)

Print Lenght of Data

In [4]:
text = df["Tweet"].tolist()
print(len(text))

13121


Make it to Categorical

In [5]:
y = df["Abusive"]
y = to_categorical(y)
print(y)
#0 itu negatif, 1 itu positif

[[0. 1.]
 [0. 1.]
 [1. 0.]
 ...
 [1. 0.]
 [1. 0.]
 [0. 1.]]


Count Data Each Categorical

In [6]:
df["Abusive"].value_counts()

0    8088
1    5033
Name: Abusive, dtype: int64

Do Tokenizer

In [7]:
token = Tokenizer()
token.fit_on_texts(text)

In [8]:
# if you want to print tokenizer word, run code below 
# token.index_word 

Print Lenght of Index of Word

In [9]:
vocab = len(token.index_word)+1
print(vocab)

13268


Test Text to Tokenize Index

In [10]:
x = ['sinting kau ya']
token.texts_to_sequences(x)

[[558, 1035, 8]]

Encode Every Each Tweet Dataset

In [11]:
encode_text = token.texts_to_sequences(text)
# if you want to print every tokenizer tweet
# print(encode_text)

Do Padding Every Encode Tweet Dataset

In [12]:
max_kata = 100
x=pad_sequences(encode_text,maxlen = max_kata, padding="post")
print(x)

[[ 324  161 3546 ...    0    0    0]
 [1908   49  464 ...    0    0    0]
 [3547  598  101 ...    0    0    0]
 ...
 [  66   66  376 ...    0    0    0]
 [ 111 2819  291 ...    0    0    0]
 [ 569  325    8 ...    0    0    0]]


# **80 20 ratio**
Performing learning for 80% data training and 20% data testing.

Split data test and test test

In [13]:
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=1, test_size = 0.2, stratify=y)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=1)

Change to Data to Array

In [14]:
x_train = np.asarray(x_train)
x_test = np.asarray(x_test)
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

Define Model

In [15]:
vec_size = 100

model = tf.keras.Sequential()
model.add(Embedding(vocab,vec_size,input_length=max_kata))
model.add(Conv1D(64,3,activation='relu'))

model.add(GlobalMaxPooling1D())
model.add(Dropout(0.5))

model.add(Dense(2,activation='softmax'))
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 100)          1326800   
                                                                 
 conv1d (Conv1D)             (None, 98, 64)            19264     
                                                                 
 global_max_pooling1d (Globa  (None, 64)               0         
 lMaxPooling1D)                                                  
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense (Dense)               (None, 2)                 130       
                                                                 
Total params: 1,346,194
Trainable params: 1,346,194
Non-trainable params: 0
______________________________________________

In [16]:
from keras.metrics import Precision, Recall
model.compile(optimizer="adam",loss="categorical_crossentropy", metrics=['accuracy', Precision(), Recall()])

In [17]:
model.fit(x_train,y_train, epochs=10, validation_data =(x_test,y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fb856d0acd0>

Evaluate and print Accuracy

In [18]:
import keras.backend as K

def f1_score(precision, recall):
    ''' Function to calculate f1 score '''
    
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

In [19]:
# Evaluate model on the test set
loss, accuracy, precision, recall = model.evaluate(x_test, y_test, verbose=0)
# Print metrics
print('')
print('Accuracy  : {:.4f}'.format(accuracy))
print('Precision : {:.4f}'.format(precision))
print('Recall    : {:.4f}'.format(recall))
print('F1 Score  : {:.4f}'.format(f1_score(precision, recall)))


Accuracy  : 0.9029
Precision : 0.9029
Recall    : 0.9029
F1 Score  : 0.9029


Get Encode of Predict Data

In [20]:
def get_encode(x):
  x = token.texts_to_sequences(x)
  x = pad_sequences(x,maxlen = max_kata, padding = "post")
  return x

Get Sentiment Classesof Predict Data

In [21]:
def get_sentiment_classes(x):
  x = get_encode(x)
  predict_x=model.predict(x) 
  classes_x=np.argmax(predict_x,axis=1)
  sentiment_classes = ['tidak kasar','kasar']
  print('kata tersebut mengandung konotasi',sentiment_classes[classes_x[0]])

Predict Data 1

In [22]:
# untuk melakukan prediksi kata yang tidak kasar 
get_sentiment_classes(['ibu peri hari ini cantik banget ya'])

kata tersebut mengandung konotasi tidak kasar


Predict Data 2

In [23]:
# untuk melakukan prediksi kata yang kasar
get_sentiment_classes(['bangsat cok raimu koyok asu'])

kata tersebut mengandung konotasi kasar


# **70 30 ratio**
Performing learning for 70% data training and 30% data testing. 


Split data test and test test

In [24]:
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=1, test_size = 0.2, stratify=y)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=1)

Change to Data to Array

In [25]:
x_train = np.asarray(x_train)
x_test = np.asarray(x_test)
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

Define Model

In [26]:
vec_size = 100

model = tf.keras.Sequential()
model.add(Embedding(vocab,vec_size,input_length=max_kata))
model.add(Conv1D(64,3,activation='relu'))

model.add(GlobalMaxPooling1D())
model.add(Dropout(0.5))

model.add(Dense(2,activation='softmax'))
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 100, 100)          1326800   
                                                                 
 conv1d_1 (Conv1D)           (None, 98, 64)            19264     
                                                                 
 global_max_pooling1d_1 (Glo  (None, 64)               0         
 balMaxPooling1D)                                                
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 2)                 130       
                                                                 
Total params: 1,346,194
Trainable params: 1,346,194
Non-trainable params: 0
____________________________________________

In [27]:
from keras.metrics import Precision, Recall
model.compile(optimizer="adam",loss="categorical_crossentropy", metrics=['accuracy', Precision(), Recall()])

In [28]:
model.fit(x_train,y_train, epochs=10, validation_data =(x_test,y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fb854125a90>

Evaluate and print Accuracy

In [29]:
import keras.backend as K

def f1_score(precision, recall):
    ''' Function to calculate f1 score '''
    
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

In [30]:
# Evaluate model on the test set
loss, accuracy, precision, recall = model.evaluate(x_test, y_test, verbose=0)
# Print metrics
print('')
print('Accuracy  : {:.4f}'.format(accuracy))
print('Precision : {:.4f}'.format(precision))
print('Recall    : {:.4f}'.format(recall))
print('F1 Score  : {:.4f}'.format(f1_score(precision, recall)))


Accuracy  : 0.9040
Precision : 0.9040
Recall    : 0.9040
F1 Score  : 0.9040


Get Encode of Predict Data

In [31]:
def get_encode(x):
  x = token.texts_to_sequences(x)
  x = pad_sequences(x,maxlen = max_kata, padding = "post")
  return x

Get Sentiment Classesof Predict Data

In [32]:
def get_sentiment_classes(x):
  x = get_encode(x)
  predict_x=model.predict(x) 
  classes_x=np.argmax(predict_x,axis=1)
  sentiment_classes = ['tidak kasar','kasar']
  print('kata tersebut mengandung konotasi',sentiment_classes[classes_x[0]])

Predict Data 1

In [33]:
# untuk melakukan prediksi kata yang tidak kasar 
get_sentiment_classes(['ibu peri hari ini cantik banget ya'])

kata tersebut mengandung konotasi tidak kasar


Predict Data 2

In [34]:
# untuk melakukan prediksi kata yang kasar
get_sentiment_classes(['bangsat cok raimu koyok asu'])

kata tersebut mengandung konotasi kasar


# **60 40 ratio**
Performing learning for 60% data training and 40% data testing.

Split data test and test test

In [35]:
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=1, test_size = 0.4, stratify=y)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.4, random_state=1)

Change to Data to Array

In [36]:
x_train = np.asarray(x_train)
x_test = np.asarray(x_test)
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

Define Model

In [37]:
vec_size = 100

model = tf.keras.Sequential()
model.add(Embedding(vocab,vec_size,input_length=max_kata))
model.add(Conv1D(64,3,activation='relu'))

model.add(GlobalMaxPooling1D())
model.add(Dropout(0.5))

model.add(Dense(2,activation='softmax'))
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 100, 100)          1326800   
                                                                 
 conv1d_2 (Conv1D)           (None, 98, 64)            19264     
                                                                 
 global_max_pooling1d_2 (Glo  (None, 64)               0         
 balMaxPooling1D)                                                
                                                                 
 dropout_2 (Dropout)         (None, 64)                0         
                                                                 
 dense_2 (Dense)             (None, 2)                 130       
                                                                 
Total params: 1,346,194
Trainable params: 1,346,194
Non-trainable params: 0
____________________________________________

In [38]:
from keras.metrics import Precision, Recall
model.compile(optimizer="adam",loss="categorical_crossentropy", metrics=['accuracy', Precision(), Recall()])

In [39]:
model.fit(x_train,y_train, epochs=10, validation_data =(x_test,y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fb853457550>

Evaluate and print Accuracy

In [40]:
import keras.backend as K

def f1_score(precision, recall):
    ''' Function to calculate f1 score '''
    
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

In [41]:
# Evaluate model on the test set
loss, accuracy, precision, recall = model.evaluate(x_test, y_test, verbose=0)
# Print metrics
print('')
print('Accuracy  : {:.4f}'.format(accuracy))
print('Precision : {:.4f}'.format(precision))
print('Recall    : {:.4f}'.format(recall))
print('F1 Score  : {:.4f}'.format(f1_score(precision, recall)))


Accuracy  : 0.8985
Precision : 0.8985
Recall    : 0.8985
F1 Score  : 0.8985


Get Encode of Predict Data

In [42]:
def get_encode(x):
  x = token.texts_to_sequences(x)
  x = pad_sequences(x,maxlen = max_kata, padding = "post")
  return x

Get Sentiment Classesof Predict Data

In [43]:
def get_sentiment_classes(x):
  x = get_encode(x)
  predict_x=model.predict(x) 
  classes_x=np.argmax(predict_x,axis=1)
  sentiment_classes = ['tidak kasar','kasar']
  print('kata tersebut mengandung konotasi',sentiment_classes[classes_x[0]])

Predict Data 1

In [44]:
# untuk melakukan prediksi kata yang tidak kasar 
get_sentiment_classes(['ibu peri hari ini cantik banget ya'])

kata tersebut mengandung konotasi kasar


Predict Data 2

In [45]:
# untuk melakukan prediksi kata yang kasar
get_sentiment_classes(['bangsat cok raimu koyok asu'])

kata tersebut mengandung konotasi kasar
