In [1]:
# Optional: install spaCy if it is not already available in this environment.
# !pip install spacy
# TensorFlow is pinned to 1.15 because later cells use the TF1 graph API (tf.placeholder, tf.Session).
!pip install --upgrade tensorflow==1.15
# Optional: download the spaCy model that is loaded below (note the `-m` flag).
# !python -m spacy download en_core_web_sm



In [2]:
import spacy

In [3]:
# Load spaCy's small English pipeline.
# NOTE(review): `nlp` is not referenced by any later cell shown in this notebook — confirm it is still needed.
nlp = spacy.load("en_core_web_sm")

In [4]:
import pandas as pd
import numpy as np

from sklearn.metrics import classification_report

In [5]:
# Toy corpus: six tweets, each paired with an emotion label ('fear' or 'joy').
# NOTE: `df` is a plain dict here; the next cell rebinds the same name to a DataFrame.

df = {
    'Data' : ['I feel like I am drowning. #depression #anxiety #failure #worthless',
              '#panic Panic attack from fear of starting new medication',
              "My bus was in a car crash... I'm still shaking a bit... This week was an absolute horror and this was the icing on the cake...#terrible",
              'Just got back from seeing @GaryDelaney in Burslem.AMAZING!! Face still hurts from laughing so much #hilarious',
              "It's the #FirstDayofFall and I'm so happy. Sipping my #PumpkinSpice flavoured coffee and #smiling! Happy Fall everyone! #amwriting",
              'Morning all! Of course it is sunny on this Monday morning to cheerfully welcome us back to work.:)'],
      
    'Label' : ['fear','fear','fear','joy','joy','joy']
}

In [6]:
# Convert the dict of columns into a DataFrame (one row per tweet) and display it.

df = pd.DataFrame.from_dict(df)
df

Unnamed: 0,Data,Label
0,I feel like I am drowning. #depression #anxiet...,fear
1,#panic Panic attack from fear of starting new ...,fear
2,My bus was in a car crash... I'm still shaking...,fear
3,Just got back from seeing @GaryDelaney in Burs...,joy
4,It's the #FirstDayofFall and I'm so happy. Sip...,joy
5,Morning all! Of course it is sunny on this Mon...,joy


### **Data Preprocessing**

In [7]:
import re
import string
#Data preprocessing

def get_clean(input_str):
  """Normalize one raw tweet string.

  Steps, in order: lowercase the text, delete every run of digits, trim
  leading/trailing whitespace, then delete every character that is neither
  a word character nor whitespace.

  Note: punctuation is deleted rather than replaced by a space, so words
  separated only by punctuation are fused
  (e.g. "cake...#terrible" becomes "caketerrible").
  """
  lowered = input_str.lower()
  digits_removed = re.sub(r'\d+', '', lowered)
  trimmed = digits_removed.strip()
  return re.sub(r'[^\w\s]', '', trimmed)
# Clean every tweet in the 'Data' column and display the result.
df['Data'] = df['Data'].apply(get_clean)
df['Data']

0    i feel like i am drowning depression anxiety f...
1    panic panic attack from fear of starting new m...
2    my bus was in a car crash im still shaking a b...
3    just got back from seeing garydelaney in bursl...
4    its the firstdayoffall and im so happy sipping...
5    morning all of course it is sunny on this mond...
Name: Data, dtype: object

In [8]:
#Removing stop words
#Stop words are the most commonly used words of a language (in English e.g. a, the, am, are)
import nltk
# Fetch the NLTK resources if they are missing: the stop-word lists used in this cell and the 'punkt' data.
# NOTE(review): no cell shown here appears to use 'punkt' (the tokenizer below is regex based) — confirm it is needed.
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords

# Rebinds the imported `stopwords` name to a plain set of English stop words for fast membership tests.
stopwords = set(stopwords.words('english'))

def clean_stopwords(input_str):
  """Return `input_str` without the whitespace-separated tokens found in `stopwords`.

  The surviving tokens are joined back together with single spaces.
  """
  kept_tokens = filter(lambda token: token not in stopwords, str(input_str).split())
  return " ".join(kept_tokens)

# Strip stop words from every tweet and print the column.
df['Data'] = df['Data'].apply(clean_stopwords)
print(df['Data'])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
0    feel like drowning depression anxiety failure ...
1      panic panic attack fear starting new medication
2    bus car crash im still shaking bit week absolu...
3    got back seeing garydelaney burslemamazing fac...
4    firstdayoffall im happy sipping pumpkinspice f...
5    morning course sunny monday morning cheerfully...
Name: Data, dtype: object


In [9]:
#Word Tokenizing
from nltk.tokenize import RegexpTokenizer

# Split each cleaned tweet into a list of tokens, where a token is a maximal run of word characters.
tokenizer = RegexpTokenizer(r"\w+")
df['Data'] = df['Data'].apply(tokenizer.tokenize)
print(df['Data'])

0    [feel, like, drowning, depression, anxiety, fa...
1    [panic, panic, attack, fear, starting, new, me...
2    [bus, car, crash, im, still, shaking, bit, wee...
3    [got, back, seeing, garydelaney, burslemamazin...
4    [firstdayoffall, im, happy, sipping, pumpkinsp...
5    [morning, course, sunny, monday, morning, chee...
Name: Data, dtype: object


In [10]:
# Word stemming with NLTK's Porter stemmer
st = nltk.PorterStemmer()

def stemming_on_text(data):
    """Stem every token in `data` with the module-level stemmer `st` and return the stems as a new list."""
    return list(map(st.stem, data))

# Replace each token list with its stemmed version and preview the first rows.
df['Data'] = df['Data'].apply(stemming_on_text)
df['Data'].head()

0    [feel, like, drown, depress, anxieti, failur, ...
1      [panic, panic, attack, fear, start, new, medic]
2    [bu, car, crash, im, still, shake, bit, week, ...
3    [got, back, see, garydelaney, burslemamaz, fac...
4    [firstdayoffal, im, happi, sip, pumpkinspic, f...
Name: Data, dtype: object

In [11]:
# Sanity check: number of stemmed tokens left in the last tweet (row 5).
np.asarray(df['Data'][5]).shape

(10,)

### **Word2Vec**

In [12]:
from gensim.models import Word2Vec
from gensim.test.utils import common_texts

# model = Word2Vec(sentences=common_texts, vector_size=100, window=5, min_count=1, workers=4)
# model.save("word2vec.model")
# vector = model.wv['computer']  # get numpy vector of a word

# Train Word2Vec on the stemmed token lists; min_count = 1 keeps words that occur only once.
model = Word2Vec(sentences = df['Data'], min_count = 1)
# Persist the trained model to the working directory.
model.save("word2vec.model")
# Smoke test: look up one stem and display the shape of its vector.
test = model.wv['absolut']
test.shape

(100,)

In [13]:
#Display the words stored in the model's vocabulary
# gensim 4 removed `KeyedVectors.vocab` in favour of `key_to_index`; fall back to
# `vocab` on gensim 3.x so the cell runs on both.
# NOTE(review): the two attributes may iterate in a different word order — confirm
# that nothing downstream depends on the order of `words`.
_vocab = model.wv.key_to_index if hasattr(model.wv, 'key_to_index') else model.wv.vocab
words = list(_vocab)
words

['feel',
 'like',
 'drown',
 'depress',
 'anxieti',
 'failur',
 'worthless',
 'panic',
 'attack',
 'fear',
 'start',
 'new',
 'medic',
 'bu',
 'car',
 'crash',
 'im',
 'still',
 'shake',
 'bit',
 'week',
 'absolut',
 'horror',
 'ice',
 'caketerr',
 'got',
 'back',
 'see',
 'garydelaney',
 'burslemamaz',
 'face',
 'hurt',
 'laugh',
 'much',
 'hilari',
 'firstdayoffal',
 'happi',
 'sip',
 'pumpkinspic',
 'flavour',
 'coffe',
 'smile',
 'fall',
 'everyon',
 'amwrit',
 'morn',
 'cours',
 'sunni',
 'monday',
 'cheer',
 'welcom',
 'us',
 'work']

In [14]:
def get_vector(word):
  """Return the trained Word2Vec embedding for `word`.

  The lookup is forwarded unchanged to `model.wv[...]`. The notebook also
  calls this with a whole token list, in which case the result holds one
  vector per token.
  """
  return model.wv[word]

In [15]:
# Add a 'Vec' column holding, for each tweet, the stacked word vectors of its tokens (one row per token),
# then check the shape for the last tweet.
df['Vec'] = df['Data'].apply(lambda x: get_vector(x))
df['Vec'][5].shape

(10, 100)

In [16]:
# Keep the per-tweet token lists as an array and count the tokens in the last tweet.
data = np.asarray(df['Data'])
len(data[5])

10

In [17]:
#Checking shape -> tweet 2 has 12 words and every word has a 100-dimensional vector

vec = np.asarray(df['Vec'])
vec[2].shape

(12, 100)

In [18]:
#Saving Unique Words

# Flatten the per-tweet token lists into one 1-D array of words.
# The lists are read straight from the DataFrame column (rather than from `data`)
# so that re-running this cell does not try to flatten the already-flat array.
data = np.concatenate(np.asarray(df['Data']), axis=0)
data.shape

(59,)

In [19]:
# Distinct stems in sorted order. A set replaces the quadratic
# "append if not already in the list" scan; the resulting list is the same.
unique_data = sorted(set(data))
len(unique_data)

53

In [20]:
#Test: show the first stem in sorted order and print the Word2Vec vector of 'absolut'

print(unique_data[0])
print(get_vector('absolut'))

absolut
[ 7.3665427e-04  2.9562765e-03  2.4638257e-03 -3.9702067e-03
  4.5946129e-03  1.8884202e-03 -5.9660041e-04 -2.3222216e-03
  1.1366222e-03 -1.4209604e-03 -2.8019601e-03 -1.6894558e-03
 -1.9876771e-03 -2.2878964e-03  1.3562836e-03  2.8597664e-03
  2.8208760e-03 -1.1556908e-03 -4.4842493e-03  1.4949493e-03
  2.5785873e-03  2.4665825e-04 -1.9986723e-03 -3.4684292e-04
  2.6360676e-03 -2.0788135e-03 -1.0706028e-03 -1.2174406e-03
 -3.2019867e-03 -7.2720030e-04  1.0882072e-03 -1.0610069e-03
  7.7589863e-04  3.2234404e-03  8.9568624e-05  2.9428580e-03
 -4.9179583e-03  1.2663639e-03  4.9468321e-03  4.5471922e-03
  2.8727839e-03 -1.0132897e-03  1.3074636e-03 -5.8119098e-04
  4.4467677e-03  1.3963005e-03 -1.5611028e-03 -2.9251303e-03
  4.3848398e-04 -3.2612714e-03 -1.0903620e-03 -2.0899863e-03
  1.4434468e-03  2.6875108e-03  3.7389162e-03  1.5185140e-03
 -4.2830482e-03 -4.2191902e-03 -4.5163413e-03  2.6211288e-04
  1.3018927e-03  4.1641728e-03  1.7365714e-03 -1.3530446e-03
  4.2863186e-03 

In [21]:
#Number of unique words (as an array shape)

np.asarray(unique_data).shape

(53,)

### **Training**

In [22]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Build a word -> integer index vocabulary from the stemmed token lists.
# `num_words` is the current name of the deprecated `nb_words` argument.
# `split` is no longer passed: it only applies when the inputs are raw strings
# (where an empty separator would raise), and the rows here are already token lists.
seq = Tokenizer(num_words = 500)
seq.fit_on_texts(df['Data'])



In [23]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Map each token list to its integer indices, then zero-pad or truncate at the end
# so that every row has exactly 12 entries.
sequence = seq.texts_to_sequences(df['Data'])
sequence = pad_sequences(sequence, maxlen=12, padding='post', truncating='post')
print(sequence)

[[ 7  8  9 10 11 12 13  0  0  0  0  0]
 [ 1  1 14 15 16 17 18  0  0  0  0  0]
 [19 20 21  2  3 22 23 24 25 26 27 28]
 [29  4 30 31 32 33  3 34 35 36 37  0]
 [38  2  5 39 40 41 42 43  5 44 45 46]
 [ 6 47 48 49  6 50 51 52  4 53  0  0]]


In [24]:
# Stack the Word2Vec vector of every vocabulary word into one matrix, one row per word.
# `words` must be a list so it can be walked by position.
# 53 rows = number of unique words found above, 100 columns = Word2Vec vector length.
matrix = np.zeros((53,100))

for i, word in enumerate(words):
    # `model.wv[...]` replaces the deprecated `model[...]` lookup.
    matrix[i] = model.wv[word]


In [42]:
# Inspect the first row of the embedding matrix.
matrix[0]

array([-1.4260252e-03, -3.3456672e-03,  2.1479698e-03, -3.8539576e-03,
        3.6561023e-03, -2.4044935e-03,  2.7421091e-03,  1.5145054e-03,
        4.8989281e-03,  2.3729953e-03, -3.4177792e-03,  2.3020108e-03,
       -2.2908410e-03, -5.0000451e-04,  8.3262438e-04,  1.7055906e-03,
        3.4298578e-03, -6.1959022e-04,  2.3239164e-03,  3.1632229e-03,
       -2.4807705e-03, -5.7033938e-04,  4.0350705e-03, -2.0759259e-03,
        4.3828185e-03, -1.3512889e-03, -3.4341309e-03, -2.7345696e-03,
       -6.7239854e-04, -1.2932652e-04,  2.6036014e-03, -4.7495943e-03,
        3.9427471e-03, -1.0017054e-03, -8.6219603e-04,  1.5032404e-03,
        2.3588231e-03, -1.4815860e-03, -2.0398106e-03,  2.4969450e-03,
        4.7780033e-03, -3.8032359e-04,  2.1443160e-03,  3.1246110e-03,
       -1.9796146e-03, -3.6620526e-03, -3.7659300e-03, -2.5886218e-03,
        3.6550788e-03,  4.4156364e-04, -1.8596480e-03, -2.6805573e-03,
       -2.3094506e-03, -1.9999323e-04,  1.9131053e-03, -2.4462012e-03,
      

In [25]:
# Pull the label column out as a 2-D column array (one row per tweet).
y = df['Label']
y = y.values.reshape(-1,1)
y

array([['fear'],
       ['fear'],
       ['fear'],
       ['joy'],
       ['joy'],
       ['joy']], dtype=object)

In [26]:
# The padded index sequences are the model's input features.
X = sequence
X

array([[ 7,  8,  9, 10, 11, 12, 13,  0,  0,  0,  0,  0],
       [ 1,  1, 14, 15, 16, 17, 18,  0,  0,  0,  0,  0],
       [19, 20, 21,  2,  3, 22, 23, 24, 25, 26, 27, 28],
       [29,  4, 30, 31, 32, 33,  3, 34, 35, 36, 37,  0],
       [38,  2,  5, 39, 40, 41, 42, 43,  5, 44, 45, 46],
       [ 6, 47, 48, 49,  6, 50, 51, 52,  4, 53,  0,  0]], dtype=int32)

In [27]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
# Standardize each column of X and one-hot encode the string labels into numeric columns.
# NOTE(review): the scaler is fit on every row before the train/test split, and X holds
# token indices rather than measurements — confirm that this scaling is intended.

scaler = StandardScaler()
X = scaler.fit_transform(X)
encoder = OneHotEncoder()
# `.toarray()` converts the encoder's output into a dense array.
y = encoder.fit_transform(y.reshape(-1,1)).toarray()
print(y.shape)
print(X.shape)

(6, 2)
(6, 12)


In [28]:
X

array([[-0.72545355, -0.34993946, -0.83996873, -0.86189827, -0.51651492,
        -1.27785269, -0.72450595, -1.27940554, -0.85029809, -1.29732176,
        -0.96129498, -0.67767386],
       [-1.17573506, -0.78221762, -0.4947761 , -0.56123608, -0.14757569,
        -0.90566259, -0.42262847, -1.27940554, -0.85029809, -1.29732176,
        -0.96129498, -0.67767386],
       [ 0.17510948,  0.39110881, -0.01150642, -1.34295776, -1.10681768,
        -0.53347248, -0.12075099, -0.07525915,  0.99817601, -0.02447777,
         0.46741866,  0.86082896],
       [ 0.92557867, -0.59695555,  0.60984031,  0.40088291,  1.03302983,
         0.28534575, -1.32826091,  0.42646851,  1.73756565,  0.46507761,
         0.99657186, -0.67767386],
       [ 1.60100094, -0.7204636 , -1.11612283,  0.88194241,  1.62333259,
         0.88084991,  1.02638343,  0.87802341, -0.48060327,  0.85672191,
         1.41989443,  1.84986649],
       [-0.80050047,  2.05846742,  1.85253377,  1.48326678, -0.88545414,
         1.5507921 ,  

In [29]:
from sklearn.model_selection import train_test_split
# Split the data into training and testing sets: 30% of the rows are held out,
# and random_state fixes the shuffle so the split is repeatable.

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=2)

In [30]:
# Shapes of the training features and training labels.
x_train.shape, y_train.shape

((4, 12), (4, 2))

In [31]:
# Cast the embedding matrix to float32 before it is wrapped in a TensorFlow variable.
matrix = np.float32(matrix)

In [32]:
import tensorflow as tf
#initialize variable (Weight, Bias)
# Layer sizes. 'hidden' (53) and 'embed' (100) match the row and column counts of the
# Word2Vec `matrix`, which is used below as the hidden -> embed weight.
layer = {
    'input' : 12,
    'hidden' : 53,
    'output' : 2,
    'embed' : 100
}

weight = {
    # th = to hidden, to = to output
    'th' : tf.Variable(tf.random_normal([layer['input'], layer['hidden']])),
    'to' : tf.Variable(tf.random_normal([layer['embed'], layer['output']])),
    # em = hidden -> embed weight, initialised from the Word2Vec matrix instead of random values
    'em' : tf.Variable(matrix)
}

bias = {
    'th' : tf.Variable(tf.random_normal([layer['hidden']])),
    'to' : tf.Variable(tf.random_normal([layer['output']])),
    # hth = bias added after the 'em' multiplication
    'hth': tf.Variable(tf.random_normal([layer['embed']]))
}

In [33]:
# Graph inputs: `x` receives batches of feature rows and `target` the matching one-hot labels.
# The leading None leaves the batch size open.
x = tf.placeholder(tf.float32, [None, layer['input']])
target = tf.placeholder(tf.float32, [None, layer['output']])

print(x)
print(target)

Tensor("Placeholder:0", shape=(?, 12), dtype=float32)
Tensor("Placeholder_1:0", shape=(?, 2), dtype=float32)


In [34]:
#function forward pass
def forward_pass():
    """Build the network's output tensor from the placeholder `x`.

    Three sigmoid layers are chained: input -> hidden (weight['th'], bias['th']),
    hidden -> embed (weight['em'], bias['hth']) and embed -> output
    (weight['to'], bias['to']). Returns the final sigmoid activation.
    """
    hidden_out = tf.nn.sigmoid(tf.matmul(x, weight['th']) + bias['th'])
    embed_out = tf.nn.sigmoid(tf.matmul(hidden_out, weight['em']) + bias['hth'])
    return tf.nn.sigmoid(tf.matmul(embed_out, weight['to']) + bias['to'])

In [35]:
# Build the prediction tensor.
# NOTE: this rebinds `y`, which previously held the one-hot label array, to the network's output tensor.
y = forward_pass()

In [37]:
# Helper variables for training and testing
epoch = 500
alpha = 0.1

# Loss: half of the mean squared error between target and prediction
error = tf.reduce_mean(0.5 * (target - y)**2)
# Gradient descent with learning rate `alpha`
optimizer = tf.train.GradientDescentOptimizer(alpha)
train = optimizer.minimize(error)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [40]:
# Feed dicts are built once and reused instead of being repeated at every call site.
train_feed = {x: x_train, target: y_train}
test_feed = {x: x_test, target: y_test}

# Run `epoch` + 1 full-batch gradient steps, then evaluate on the held-out rows.
with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  for i in range(epoch+1):
    sess.run(train, feed_dict = train_feed)

    # Report the training loss every 25 steps.
    if i % 25 == 0:
      current_error = sess.run(error, feed_dict = train_feed)
      print(f'EPOCH : {i} | ERROR : {current_error} |')

  # A prediction counts as correct when the highest-scoring output column matches the one-hot target.
  true_prediction = tf.equal(tf.argmax(y, axis = 1), tf.argmax(target, axis = 1))
  # The op keeps its own name so it is not overwritten by the value it computes.
  accuracy_op = tf.reduce_mean(tf.cast(true_prediction, tf.float32))
  accuracy = sess.run(accuracy_op, feed_dict = test_feed)
  # Raw network outputs for the test rows, kept for the report in the next cell.
  evaluation = y.eval(feed_dict = {x: x_test})
  print(f'ACCURACY: {accuracy:}')

EPOCH : 0 | ERROR : 0.12677158415317535 |
EPOCH : 25 | ERROR : 0.04720999300479889 |
EPOCH : 50 | ERROR : 0.0251912958920002 |
EPOCH : 75 | ERROR : 0.01611074060201645 |
EPOCH : 100 | ERROR : 0.01146466564387083 |
EPOCH : 125 | ERROR : 0.00873558409512043 |
EPOCH : 150 | ERROR : 0.006974741816520691 |
EPOCH : 175 | ERROR : 0.005759688559919596 |
EPOCH : 200 | ERROR : 0.004878186620771885 |
EPOCH : 225 | ERROR : 0.004213489126414061 |
EPOCH : 250 | ERROR : 0.0036966949701309204 |
EPOCH : 275 | ERROR : 0.003284806152805686 |
EPOCH : 300 | ERROR : 0.00294973561540246 |
EPOCH : 325 | ERROR : 0.0026724322233349085 |
EPOCH : 350 | ERROR : 0.0024395582731813192 |
EPOCH : 375 | ERROR : 0.0022415267303586006 |
EPOCH : 400 | ERROR : 0.0020712758414447308 |
EPOCH : 425 | ERROR : 0.0019235007930547 |
EPOCH : 450 | ERROR : 0.0017941489350050688 |
EPOCH : 475 | ERROR : 0.0016800636658445 |
EPOCH : 500 | ERROR : 0.0015787668526172638 |
ACCURACY: 1.0


In [None]:
from sklearn.metrics import classification_report

# classification_report expects 1-D class labels, but `y_test` is one-hot and `evaluation`
# holds per-class sigmoid scores, so both are reduced to class indices with argmax.
y_true = np.argmax(y_test, axis=1)
y_pred = np.argmax(evaluation, axis=1)
# `target_names` was never defined; the one-hot column order of `encoder` supplies the class names.
target_names = [str(name) for name in encoder.categories_[0]]
# `labels` is passed explicitly so the report still works when the small test split
# does not contain every class.
print(classification_report(y_true, y_pred, labels=list(range(len(target_names))), target_names=target_names))