TIMOTEUS SINDHIKARA  - 2301869014 - UAS NLP

In [1]:
# !pip install --upgrade tensorflow==1.15
# !pip install gensim==3.8.3
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
import nltk
import pandas as pd
import tensorflow as tf
import numpy as np
import re
import string
from nltk.corpus import stopwords
from sklearn.metrics import classification_report
from gensim.models import Word2Vec
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Input data: six short tweets with one emotion label each.
# The two lists are parallel: label[i] belongs to text[i].
dataset = dict(
    text=[
        'I feel like I am drowning. #depression #anxiety #failure #worthless',
        '#panic Panic attack from fear of starting new medication',
        "My bus was in a car crash... I'm still shaking a bit... This week was an absolute horror and this was the icing on the cake...#terrible",
        'Just got back from seeing @GaryDelaney in Burslem.AMAZING!! Face still hurts from laughing so much #hilarious',
        "It's the #FirstDayofFall and I'm so happy. Sipping my #PumpkinSpice flavoured coffee and #smiling! Happy Fall everyone! #amwriting",
        'Morning all! Of course it is sunny on this Monday morning to cheerfully welcome us back to work.:)',
    ],
    label=[
        'fear', 'fear', 'fear',
        'joy', 'joy', 'joy',
    ],
)

In [3]:
# Turn the column-oriented dictionary into a pandas DataFrame (one row per tweet)
dataset = pd.DataFrame(dataset)
dataset.head()

Unnamed: 0,text,label
0,I feel like I am drowning. #depression #anxiet...,fear
1,#panic Panic attack from fear of starting new ...,fear
2,My bus was in a car crash... I'm still shaking...,fear
3,Just got back from seeing @GaryDelaney in Burs...,joy
4,It's the #FirstDayofFall and I'm so happy. Sip...,joy


# Preprocessing

In [4]:
# Clean data: lowercase every word, then remove digits, surrounding whitespace, and punctuation
def clean_dataset(input_str):
  """Return a normalised copy of ``input_str``.

  Steps, in order: lowercase the text, delete every run of digits, trim
  leading/trailing whitespace, and delete each character that is neither a
  word character nor whitespace. Deleted characters are not replaced by
  anything, so words separated only by punctuation end up joined.
  """
  lowered = input_str.lower()
  without_digits = re.sub(r'\d+', '', lowered)
  trimmed = without_digits.strip()
  return re.sub(r'[^\w\s]','',trimmed)
# Run the cleaner over every tweet and preview the result
dataset['text'] = dataset['text'].apply(clean_dataset)
dataset['text'].head()

0    i feel like i am drowning depression anxiety f...
1    panic panic attack from fear of starting new m...
2    my bus was in a car crash im still shaking a b...
3    just got back from seeing garydelaney in bursl...
4    its the firstdayoffall and im so happy sipping...
Name: text, dtype: object

In [5]:
# Remove stop words: drop the filler words that are not useful for
# training the model later on.
# The set is stored under its own name so that the imported `stopwords` corpus
# object is not overwritten; rebinding it makes this cell fail with an
# AttributeError as soon as it is executed a second time.
english_stopwords = set(stopwords.words('english'))
def clean_stopwords(input_str):
  """Return ``input_str`` with every English stop word removed.

  The input is converted with ``str()``, split on whitespace, filtered
  against ``english_stopwords``, and the surviving words are joined back
  together with single spaces.
  """
  return " ".join(word for word in str(input_str).split() if word not in english_stopwords)

dataset['text'] = dataset['text'].apply(clean_stopwords)
dataset['text'].head()

0    feel like drowning depression anxiety failure ...
1      panic panic attack fear starting new medication
2    bus car crash im still shaking bit week absolu...
3    got back seeing garydelaney burslemamazing fac...
4    firstdayoffall im happy sipping pumpkinspice f...
Name: text, dtype: object

In [6]:
# Word tokenizing: split each sentence into a list of its individual words
tokenizer = RegexpTokenizer(r"\w+")
dataset['text'] = dataset['text'].apply(tokenizer.tokenize)
dataset['text'].head()

0    [feel, like, drowning, depression, anxiety, fa...
1    [panic, panic, attack, fear, starting, new, me...
2    [bus, car, crash, im, still, shaking, bit, wee...
3    [got, back, seeing, garydelaney, burslemamazin...
4    [firstdayoffall, im, happy, sipping, pumpkinsp...
Name: text, dtype: object

In [7]:
# Word stemming: reduce every word to its base (stem) form; after this step
# the data is ready for word embedding
stemming = nltk.PorterStemmer()
def stem_text(dataset):
    """Return a list holding the Porter stem of each token in ``dataset``.

    Here ``dataset`` is the token list of a single row; inside the function
    the name shadows the module-level ``dataset`` DataFrame.
    """
    return list(map(stemming.stem, dataset))

dataset['text'] = dataset['text'].apply(stem_text)
dataset['text'].head()

0    [feel, like, drown, depress, anxieti, failur, ...
1      [panic, panic, attack, fear, start, new, medic]
2    [bu, car, crash, im, still, shake, bit, week, ...
3    [got, back, see, garydelaney, burslemamaz, fac...
4    [firstdayoffal, im, happi, sip, pumpkinspic, f...
Name: text, dtype: object

In [8]:
# Print the number of tokens in every document
for tokens in dataset['text']:
    print(len(tokens))

7
7
12
11
12
10


# Word Embedding

In [9]:
# Train a Word2Vec model on the stemmed token lists:
# 10-dimensional vectors, and min_count=1 so that no word is discarded
embedModel = Word2Vec(sentences=dataset['text'], size=10, min_count=1)

In [10]:
# Collect the distinct words known to the Word2Vec model and show them
vocab = [word for word in embedModel.wv.vocab]
print(vocab)

['feel', 'like', 'drown', 'depress', 'anxieti', 'failur', 'worthless', 'panic', 'attack', 'fear', 'start', 'new', 'medic', 'bu', 'car', 'crash', 'im', 'still', 'shake', 'bit', 'week', 'absolut', 'horror', 'ice', 'caketerr', 'got', 'back', 'see', 'garydelaney', 'burslemamaz', 'face', 'hurt', 'laugh', 'much', 'hilari', 'firstdayoffal', 'happi', 'sip', 'pumpkinspic', 'flavour', 'coffe', 'smile', 'fall', 'everyon', 'amwrit', 'morn', 'cours', 'sunni', 'monday', 'cheer', 'welcom', 'us', 'work']


In [11]:
# Peek at a single vocabulary entry (index 1)
vocab[1]

'like'

In [12]:
#View Length of the unique words
len(vocab)

53

In [13]:
# Build a word -> integer index over the corpus (this step does not use the Word2Vec vectors).
# `num_words` is the current name of the vocabulary cap; the legacy `nb_words` spelling is
# only accepted through a deprecation path that emits a warning.
# `split` is left at its default: every document here is already a list of tokens, so the
# separator is never used, and an empty separator would make `str.split` raise ValueError
# if a plain string were ever passed in.
seq = Tokenizer(num_words = 500)
seq.fit_on_texts(dataset['text'])



In [14]:
# Encode each document as a row of word indices, then zero-pad or cut it
# at the end so that every row has exactly 12 entries
newSeq = pad_sequences(
    seq.texts_to_sequences(dataset['text']),
    maxlen=12,
    padding='post',
    truncating='post',
)
print(newSeq)

[[ 7  8  9 10 11 12 13  0  0  0  0  0]
 [ 1  1 14 15 16 17 18  0  0  0  0  0]
 [19 20 21  2  3 22 23 24 25 26 27 28]
 [29  4 30 31 32 33  3 34 35 36 37  0]
 [38  2  5 39 40 41 42 43  5 44 45 46]
 [ 6 47 48 49  6 50 51 52  4 53  0  0]]


In [15]:
# Stack the Word2Vec vector of every vocabulary word into one matrix:
# row i holds the vector of vocab[i].
# The shape is derived from the vocabulary and the model rather than hard-coded.
matrix = np.zeros((len(vocab), embedModel.wv.vector_size))

# Fill the matrix row by row
for i, word in enumerate(vocab):
    # Vectors are read through `.wv`; indexing the model object itself is deprecated in gensim
    matrix[i] = embedModel.wv[word]


In [16]:
# Inspect one row of the embedding matrix (the vector stored for vocab[50])
print(matrix[50])

[-0.0120541  -0.00333323  0.00895239 -0.04198774  0.04922399  0.00624243
  0.04163278  0.01962431 -0.04343016 -0.00306005]


In [17]:
# Cast to float32, the dtype used by the TensorFlow placeholders this matrix is multiplied with
matrix = matrix.astype(np.float32)

# Training

In [18]:
# Data preparation
# X: standardise every column of the padded index matrix.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so the held-out rows influence the scaling statistics.
scaler = StandardScaler()
x_data = scaler.fit_transform(newSeq)

# Y: one-hot encode the string labels into a dense array (one column per class)
encoder = OneHotEncoder()
y_data = encoder.fit_transform(dataset['label'].values.reshape(-1,1)).toarray()

In [19]:
# Feature matrix shape: (number of samples, padded sequence length)
x_data.shape

(6, 12)

In [20]:
# Target matrix shape: (number of samples, number of one-hot label columns)
y_data.shape

(6, 2)

In [21]:
# Network dimensions and trainable parameters (weights, biases)
layer = dict(input=12, hidden=53, output=2, embed=10)

# 'th': input -> hidden, 'em': hidden -> embed (initialised from the Word2Vec matrix),
# 'to': embed -> output
weight = dict(
    th=tf.Variable(tf.random_normal([layer['input'], layer['hidden']])),
    to=tf.Variable(tf.random_normal([layer['embed'], layer['output']])),
    em=tf.Variable(matrix),
)

# One bias vector per layer: 'th' for hidden, 'hth' for embed, 'to' for output
bias = dict(
    th=tf.Variable(tf.random_normal([layer['hidden']])),
    to=tf.Variable(tf.random_normal([layer['output']])),
    hth=tf.Variable(tf.random_normal([layer['embed']])),
)

In [22]:
# Data split: hold out 25% of the samples for testing.
# A fixed random_state makes the shuffle, and therefore the split, repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.25, random_state = 42)

# Graph inputs: a batch of feature rows and the matching one-hot targets
x = tf.placeholder(tf.float32, [None, layer['input']])
target = tf.placeholder(tf.float32, [None, layer['output']])

In [23]:
# Forward pass of the three-layer network
def forward_pass():
    """Build and return the network's output tensor.

    Reads the module-level placeholder ``x`` and the ``weight`` / ``bias``
    dictionaries. Each of the three layers is a matrix multiplication plus
    bias followed by a sigmoid: input -> hidden ('th'), hidden -> embed
    ('em' / 'hth'), embed -> output ('to').
    """
    hidden = tf.nn.sigmoid(tf.matmul(x, weight['th']) + bias['th'])
    embedded = tf.nn.sigmoid(tf.matmul(hidden, weight['em']) + bias['hth'])
    return tf.nn.sigmoid(tf.matmul(embedded, weight['to']) + bias['to'])

In [24]:
# Prediction: build the output tensor of the network by running the forward pass on placeholder x
y = forward_pass()

In [25]:
# Training hyper-parameters: number of full-batch steps and learning rate
epoch, alpha = 100, 0.6
# Loss: mean over all outputs of half the squared difference between target and prediction
error = tf.reduce_mean((target - y)**2 * 0.5)

In [26]:
# Optimizer: plain gradient descent with learning rate `alpha`, minimising the loss tensor
optimizer = tf.train.GradientDescentOptimizer(learning_rate=alpha)
train = optimizer.minimize(loss=error)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [27]:
# Evaluation ops are built once, before the session starts. Creating them inside the
# epoch loop adds new nodes to the graph on every iteration.
true_prediction = tf.equal(tf.argmax(y, axis = 1), tf.argmax(target, axis = 1))
accuracy_op = tf.reduce_mean(tf.cast(true_prediction, tf.float32))

# The same two feeds are reused for every run call
train_feed = {x: x_train, target: y_train}
test_feed = {x: x_test, target: y_test}

with tf.Session() as sess:

    sess.run(tf.global_variables_initializer())

    for i in range(epoch):
        # One full-batch gradient-descent step on the training split
        sess.run(train, feed_dict = train_feed)

        # Loss and accuracy on the training split after this step
        current_error, accuracy = sess.run([error, accuracy_op], feed_dict = train_feed)

        #Training Process
        print(f'EPOCH {i+1} | ACC: {accuracy} | ERROR: {current_error}')

    # Final metrics are evaluated after the loop, so both names are defined
    # even when `epoch` is 0; test accuracy is only needed once, at the end.
    accuracy = sess.run(accuracy_op, feed_dict = train_feed)
    accuracy_test = sess.run(accuracy_op, feed_dict = test_feed)

    print(f'\nTrain Accuracy:{accuracy}\n')

    print(f'Test Accuracy:{accuracy_test}\n')

EPOCH 1 | ACC: 0.5 | ERROR: 0.17169111967086792
EPOCH 2 | ACC: 0.5 | ERROR: 0.16488134860992432
EPOCH 3 | ACC: 0.5 | ERROR: 0.1575608253479004
EPOCH 4 | ACC: 0.5 | ERROR: 0.14983706176280975
EPOCH 5 | ACC: 0.5 | ERROR: 0.14201678335666656
EPOCH 6 | ACC: 0.5 | ERROR: 0.1345592737197876
EPOCH 7 | ACC: 0.5 | ERROR: 0.127894788980484
EPOCH 8 | ACC: 0.5 | ERROR: 0.1222490519285202
EPOCH 9 | ACC: 0.5 | ERROR: 0.11761421710252762
EPOCH 10 | ACC: 0.75 | ERROR: 0.11383748799562454
EPOCH 11 | ACC: 0.75 | ERROR: 0.11072136461734772
EPOCH 12 | ACC: 1.0 | ERROR: 0.10808292031288147
EPOCH 13 | ACC: 1.0 | ERROR: 0.10577528178691864
EPOCH 14 | ACC: 1.0 | ERROR: 0.10368894040584564
EPOCH 15 | ACC: 1.0 | ERROR: 0.10174566507339478
EPOCH 16 | ACC: 1.0 | ERROR: 0.09989120811223984
EPOCH 17 | ACC: 1.0 | ERROR: 0.09808893501758575
EPOCH 18 | ACC: 1.0 | ERROR: 0.09631496667861938
EPOCH 19 | ACC: 1.0 | ERROR: 0.09455444663763046
EPOCH 20 | ACC: 1.0 | ERROR: 0.09279896318912506
EPOCH 21 | ACC: 1.0 | ERROR: 0.0