## Model 1 ( biLSTM )
- 테스트 정확도: 0.9378

## Model 2 ( CNN )
- 테스트 정확도: 0.9407

In [None]:
#Load packages
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import re

In [None]:
# Load dataset: read the headline CSV (path is relative to the working directory).
df = pd.read_csv("gold-dataset-sinha-khandait.csv")

In [None]:
# Keep only the rows whose price sentiment is something other than 'none'.
df = df.loc[df["Price Sentiment"].ne('none')]

In [None]:
# Show the headline text next to its price-sentiment label.
print("Commodity News Headlines")
display(df.loc[:, ["News", "Price Sentiment"]])

In [None]:
def cleaner(impure_data):
    """Strip @mentions, URLs and ASCII punctuation from every text.

    Args:
        impure_data: iterable of strings (e.g. a list or a pandas Series).

    Returns:
        A new list with one cleaned string per input item, in input order.
        Non-ASCII symbols such as emoji are left untouched.
    """
    # Raw strings keep `\S` / `\]` from being read as (invalid) string escapes,
    # and compiling once avoids re-parsing the patterns for every item.
    mention_re = re.compile(r'@\S+')
    url_re = re.compile(r'http\S+\s*')
    punctuation_re = re.compile(
        '[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~""")
    )

    temp_list = []
    for item in impure_data:
        # words which start with @
        item = mention_re.sub('', item)

        # words which start with http (plus the whitespace that follows them)
        item = url_re.sub('', item)

        # special characters, but not emoji
        item = punctuation_re.sub('', item)
        temp_list.append(item)
    return temp_list

# **My Code Start**

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [None]:
# Clean the raw headlines and turn the sentiment strings into integer class ids.
headlines = df['News']
polarity = df['Price Sentiment'].tolist()
clean_headline = cleaner(headlines)

# 'negative' -> 0, 'neutral' -> 1, every other value -> 2
labels = [
  0 if sentiment == 'negative' else 1 if sentiment == 'neutral' else 2
  for sentiment in polarity
]

In [None]:
# Hold out 30% of the headlines for testing.
# random_state pins the shuffle so the split (and every metric derived from it)
# is reproducible across kernel restarts; stratify keeps the class proportions
# the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    clean_headline,
    labels,
    test_size=0.3,
    random_state=42,
    stratify=labels,
)

In [None]:
# Fit an uncapped tokenizer on the training texts only; its word counts are
# used below to decide how large the vocabulary should be.
t = Tokenizer()
t.fit_on_texts(X_train)

In [None]:
# word_counts: per-word occurrence counts; word_index: word -> integer id.
word_frequency = t.word_counts
word_index = t.word_index
# Number of distinct words seen in the training texts.
print(len(word_frequency))

4839


In [None]:
number_of_words = len(word_index)
# Total number of word occurrences; it does not depend on the threshold.
total_freq = sum(word_frequency.values())
# Two leading placeholders so that th_num[k] is the entry for threshold k.
th_num = [0,0]
for threshold in range(2, 11):
  # Counts of the words that occur fewer than `threshold` times.
  rare_counts = [freq for freq in word_frequency.values() if freq < threshold]
  freq_less_threshold = sum(rare_counts)
  number_less_threshold = len(rare_counts)
  th_num.append([threshold, number_less_threshold])
  print("threshold : ", threshold)
  print("frequency proportion of usage less than threshold : ", freq_less_threshold/total_freq)
  print("count proportion less than threshold : ", number_less_threshold/number_of_words)

  # For the words used fewer than `threshold` times:
  # share of all word occurrences : frequency proportion of usage less than threshold
  # share of the distinct words : count proportion less than threshold

threshold :  2
frequency proportion of usage less than threshold :  0.050070046178591815
count proportion less than threshold :  0.5982641041537508
threshold :  3
frequency proportion of usage less than threshold :  0.06964838547882184
count proportion less than threshold :  0.7152304195081628
threshold :  4
frequency proportion of usage less than threshold :  0.08215292550891576
count proportion less than threshold :  0.7650340979541228
threshold :  5
frequency proportion of usage less than threshold :  0.09211504868641796
count proportion less than threshold :  0.7947923124612524
threshold :  6
frequency proportion of usage less than threshold :  0.1009356785831647
count proportion less than threshold :  0.8158710477371358
threshold :  7
frequency proportion of usage less than threshold :  0.11183175080855774
count proportion less than threshold :  0.8375697458152511
threshold :  8
frequency proportion of usage less than threshold :  0.12042754111970114
count proportion less than thr

In [None]:
print(th_num) # 'th_num': [threshold, number of words seen fewer than `threshold` times] pairs
# The leading 0, 0 entries are only placeholders that fill the first two index slots

[0, 0, [2, 2895], [3, 3461], [4, 3702], [5, 3846], [6, 3948], [7, 4053], [8, 4124], [9, 4187], [10, 4231]]


In [None]:
# Pick threshold 2 (the entry stored at index 2 of th_num).
chosen_entry = th_num[2]
threshold, number_less_threshold = chosen_entry[0], chosen_entry[1]
# Drop the words below the threshold.
# NOTE(review): the +2 presumably reserves ids for padding (0) and the OOV token - confirm.
vocab_size = (number_of_words - number_less_threshold) + 2
print(vocab_size) # total number of words that will be used

1946


In [None]:
# Tokenizer capped at vocab_size; words outside the cap are mapped to the 'OOV' token.
tokenizer = Tokenizer(num_words=vocab_size, oov_token='OOV')
tokenizer.fit_on_texts(X_train)  # fitted on the training texts only

# Replace each text with its list of integer word ids.
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [None]:
# Rebind word_index to the word -> id mapping of the capped tokenizer.
word_index = tokenizer.word_index

In [None]:
# First encoded sample and its label, for the train split and then the test split.
print(X_train[0], y_train[0], sep="\n")
print()
print(X_test[0], y_test[0], sep="\n")

[2, 6, 339, 1, 5, 15, 1139]
2

[2, 80, 16, 42, 58, 86, 709]
0


In [None]:
# Longest training sequence; both splits are padded to this length.
max_len = max(len(seq) for seq in X_train)
print(max_len)
X_train, X_test = (
    pad_sequences(split, maxlen=max_len) for split in (X_train, X_test)
)

19


In [None]:
# First ten integer labels of the training split.
y_train[:10]

[2, 0, 0, 0, 2, 2, 0, 2, 0, 0]

In [None]:
# First training sample after padding to max_len.
print(X_train[0])

[   0    0    0    0    0    0    0    0    0    0    0    0    2    6
  339    1    5   15 1139]


In [None]:
# Class id -> sentiment name; also fixes the number of classes.
sentiment_index = {0 : 'negative', 1 : 'neutral', 2 : 'positive'}

# One-hot encode the labels. num_classes is passed explicitly so both splits
# always get the same width, even if one of them lacks the highest class id.
y_train = to_categorical(y_train, num_classes=len(sentiment_index))
y_test = to_categorical(y_test, num_classes=len(sentiment_index))

In [None]:
# First five one-hot encoded training labels.
print(y_train[:5])

[[0. 0. 1.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 0. 1.]]


# **Models**

**===== Model1 - biLSTM ======**

In [None]:
from tensorflow.keras.layers import Embedding, Dense, LSTM, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [None]:
embedding_dim = 256

# Embedding -> bidirectional LSTM (32 units per direction) -> 3-way softmax.
model1 = Sequential([
    Embedding(vocab_size, embedding_dim),
    Bidirectional(LSTM(32)),
    Dense(3, activation='softmax'),
])

In [None]:
# Stop once val_loss has not improved for 3 consecutive epochs.
es = EarlyStopping(
    monitor='val_loss',
    mode='min',
    verbose=1,
    patience=3,
)
# Save the model to disk only when val_acc reaches a new maximum.
mc = ModelCheckpoint(
    'best_model.h5',
    monitor='val_acc',
    mode='max',
    verbose=1,
    save_best_only=True,
)

In [None]:
# Train the biLSTM; the last 20% of the training data is used for validation.
model1.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['acc'],
)
history = model1.fit(
    X_train,
    y_train,
    epochs=15,
    callbacks=[es, mc],
    batch_size=60,
    validation_split=0.2,
)

Epoch 1/15

Epoch 00001: val_acc improved from -inf to 0.88506, saving model to best_model.h5
Epoch 2/15

Epoch 00002: val_acc improved from 0.88506 to 0.92337, saving model to best_model.h5
Epoch 3/15

Epoch 00003: val_acc did not improve from 0.92337
Epoch 4/15

Epoch 00004: val_acc improved from 0.92337 to 0.93027, saving model to best_model.h5
Epoch 5/15

Epoch 00005: val_acc improved from 0.93027 to 0.93333, saving model to best_model.h5
Epoch 6/15

Epoch 00006: val_acc did not improve from 0.93333
Epoch 00006: early stopping


In [None]:
# Reload the checkpoint written during training and score it on the test split.
loaded_model = load_model('best_model.h5')
test_metrics = loaded_model.evaluate(X_test, y_test)  # [loss, acc]
print("\n 테스트 정확도: %.4f" % test_metrics[1])


 테스트 정확도: 0.9378


**===== Model2 - Conv1D ======**

In [None]:
from tensorflow.keras.layers import GlobalMaxPooling1D, Flatten, Concatenate
from tensorflow.keras.layers import Embedding, Conv1D, Dropout, Dense
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [None]:
# Stop once val_loss has not improved for 5 consecutive epochs.
es = EarlyStopping(
    monitor='val_loss',
    mode='min',
    verbose=1,
    patience=5,
)
# Save the model to disk only when val_acc reaches a new maximum.
mc = ModelCheckpoint(
    'best_model.h5',
    monitor='val_acc',
    mode='max',
    verbose=1,
    save_best_only=True,
)

In [None]:
# Multi-branch text CNN: one shared embedding feeds parallel Conv1D branches
# (one per kernel size). Each branch is max-pooled over the sequence, the pooled
# vectors are concatenated, passed through dropout and classified by a softmax.
embed_layer = Embedding(vocab_size, 300, input_length=max_len)
dropout = Dropout(0.5)
dense = Dense(3, activation = 'softmax')
# Convolution window widths, in tokens. Named kernel_sizes so it cannot be
# confused with Conv1D's `filters` argument (the number of output channels).
kernel_sizes = [2, 3, 5]

inputs = Input(shape = (max_len,))
embedding = embed_layer(inputs)
layers_list = []

for kernel_size in kernel_sizes:
    conv_layer = Conv1D(filters = 512, kernel_size = kernel_size, padding = "valid",
                        activation = "relu")(embedding)
    # GlobalMaxPooling1D already returns a (batch, 512) tensor, so no Flatten is needed.
    conv_layer = GlobalMaxPooling1D()(conv_layer)
    layers_list.append(conv_layer)

outputs = Concatenate()(layers_list)
outputs = dropout(outputs)
outputs = dense(outputs)

model = Model(inputs, outputs)

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

In [None]:
# Build fresh callbacks right before training. A ModelCheckpoint instance keeps
# its best score between fit() calls, so reusing an old one can skip every save
# (epoch 1 then reports "did not improve from <old best>") and leave a
# checkpoint from an earlier run on disk.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)
mc = ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

history = model.fit(X_train, y_train, epochs=20, callbacks = [es, mc], batch_size=60, validation_split=0.2)

Epoch 1/20

Epoch 00001: val_acc did not improve from 0.94943
Epoch 2/20

Epoch 00002: val_acc did not improve from 0.94943
Epoch 3/20

Epoch 00003: val_acc did not improve from 0.94943
Epoch 4/20

Epoch 00004: val_acc did not improve from 0.94943
Epoch 5/20

Epoch 00005: val_acc did not improve from 0.94943
Epoch 6/20

Epoch 00006: val_acc did not improve from 0.94943
Epoch 7/20

Epoch 00007: val_acc did not improve from 0.94943
Epoch 00007: early stopping


In [None]:
# Reload the checkpoint and score it on the test split.
# NOTE(review): 'best_model.h5' is also the checkpoint path of the biLSTM run -
# confirm the file on disk was written by the CNN training above.
best_model = load_model('best_model.h5')
test_metrics = best_model.evaluate(X_test, y_test)  # [loss, acc]
print("\n 테스트 정확도: %.4f" % test_metrics[1])


 테스트 정확도: 0.9407


# **My Code End**

### Important
* Looking at the confusion matrix, it is clear that the performance on the neutral class will be poor.
* Positive and negative headlines are likely to be identified correctly

### Try sentence transformers to get extraordinary improvement in results

In [None]:
def prediction(model, tokenizer, texts):
  """Predict a sentiment name for each text.

  Args:
    model: trained classifier exposing `predict`, returning one score per class.
    tokenizer: the fitted tokenizer that was used to encode the training texts.
    texts: a single string or an iterable of strings.

  Returns:
    A list with one sentiment name (a value of `sentiment_index`) per text.
    An empty input gives an empty list.

  Relies on the notebook-level names `cleaner`, `pad_sequences`, `max_len`
  and `sentiment_index`.
  """
  # A bare string would otherwise be tokenised character by character.
  if isinstance(texts, str):
    texts = [texts]

  # Apply the same cleaning that the training headlines went through.
  cleaned = cleaner(texts)
  if len(cleaned) == 0:
    return []

  sequences = tokenizer.texts_to_sequences(cleaned)
  sequences = pad_sequences(sequences, maxlen = max_len)

  # Highest-scoring class per row, mapped back to its sentiment name.
  class_ids = np.argmax(model.predict(sequences), axis=1)
  return [sentiment_index[int(class_id)] for class_id in class_ids]

In [None]:
# Spot check with an optimistic headline.
text = ["Gold expected to beat expectations."]
sentiment = prediction(model=best_model, tokenizer=tokenizer, texts=text)
print(sentiment)

['positive']


In [None]:
# Spot check with a pessimistic headline.
text = ["The price of gold continues declining."]
sentiment = prediction(model=best_model, tokenizer=tokenizer, texts=text)
print(sentiment)

['negative']


In [None]:
# Spot check with an optimistic headline.
# NOTE(review): the recorded output for this headline is 'negative'.
text = ["Gold price continues to improve."]
sentiment = prediction(model=best_model, tokenizer=tokenizer, texts=text)
print(sentiment)

['negative']


In [None]:
# Spot check with a neutral-sounding headline.
# NOTE(review): the recorded output for this headline is 'positive'.
text = ["Gold price expected to remain steady."]
sentiment = prediction(model=best_model, tokenizer=tokenizer, texts=text)
print(sentiment)

['positive']
