In [35]:
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
train = pd.read_csv('yelp/train_en.txt', sep='\t')[['Sentence', 'Style']]
val   = pd.read_csv('yelp/val_en.txt', sep='\t')[['Sentence', 'Style']]
test  = pd.read_csv('yelp/test_en.txt', sep='\t')[['Sentence', 'Style']]

In [5]:
for df in [train, val, test]:
    df['Tokens'] = df['Sentence'].apply(lambda x: word_tokenize(str(x).lower()))

In [6]:
w2v_model = Word2Vec(sentences=train['Tokens'], vector_size=100, window=5, min_count=15, sg=1)

In [7]:
words = w2v_model.wv.index_to_key
word_to_id = {word: i+1 for i, word in enumerate(words)}
UNK_ID = 0

In [8]:
def tokens_to_indices(tokens):
    return [word_to_id.get(word, UNK_ID) for word in tokens]

In [9]:
for df in [train, val, test]:
    df['Tokens_ID'] = df['Tokens'].apply(tokens_to_indices)

In [10]:
max_len = int(np.mean(train['Tokens_ID'].apply(len)))
X_train = pad_sequences(train['Tokens_ID'], maxlen=max_len, padding='post')
X_val   = pad_sequences(val['Tokens_ID'], maxlen=max_len, padding='post')
X_test  = pad_sequences(test['Tokens_ID'], maxlen=max_len, padding='post')

In [11]:
y_train = np.array([1 if label=='positive' else 0 for label in train['Style']])
y_val   = np.array([1 if label=='positive' else 0 for label in val['Style']])
y_test  = np.array([1 if label=='positive' else 0 for label in test['Style']])

In [12]:
embedding_dim = 100
embedding_matrix = np.zeros((len(words)+1, embedding_dim))
for word, i in word_to_id.items():
    embedding_matrix[i] = w2v_model.wv[word]

In [13]:
model = Sequential()
model.add(Embedding(input_dim=len(words)+1, output_dim=embedding_dim, weights=[embedding_matrix], trainable=True, input_length=max_len))
model.add(LSTM(128))
model.add(Dense(1, activation='sigmoid'))



In [14]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

In [15]:
history = model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_val, y_val))

Epoch 1/5
[1m10176/10176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m632s[0m 62ms/step - accuracy: 0.8141 - loss: 0.4103 - val_accuracy: 0.8971 - val_loss: 0.2542
Epoch 2/5
[1m10176/10176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m611s[0m 60ms/step - accuracy: 0.9065 - loss: 0.2331 - val_accuracy: 0.8957 - val_loss: 0.2612
Epoch 3/5
[1m10176/10176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m625s[0m 61ms/step - accuracy: 0.9216 - loss: 0.1952 - val_accuracy: 0.8989 - val_loss: 0.2566
Epoch 4/5
[1m10176/10176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m657s[0m 65ms/step - accuracy: 0.9378 - loss: 0.1568 - val_accuracy: 0.8930 - val_loss: 0.2810
Epoch 5/5
[1m10176/10176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m628s[0m 62ms/step - accuracy: 0.9535 - loss: 0.1209 - val_accuracy: 0.8875 - val_loss: 0.3309


In [16]:
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int).flatten()

[1m1340/1340[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 20ms/step


In [17]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1-score:", f1_score(y_test, y_pred))

Accuracy: 0.8876679731243001
Precision: 0.9247357431815216
Recall: 0.9187113084627103
F1-score: 0.9217136818144867


In [39]:
model2 = Sequential()
model2.add(Embedding(input_dim=len(words)+1, output_dim=embedding_dim, weights=[embedding_matrix], trainable=True, input_length=max_len))
model2.add(LSTM(256))
model2.add(Dropout(0.3))
model2.add(Dense(1, activation='sigmoid'))



In [41]:
model2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model2.summary()

In [56]:
history2 = model2.fit(X_train, y_train, epochs=6, batch_size=32, validation_data=(X_val, y_val))

Epoch 1/6
[1m10176/10176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1201s[0m 118ms/step - accuracy: 0.8830 - loss: 0.2890 - val_accuracy: 0.8975 - val_loss: 0.2506
Epoch 2/6
[1m10176/10176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1224s[0m 120ms/step - accuracy: 0.9112 - loss: 0.2204 - val_accuracy: 0.9030 - val_loss: 0.2495
Epoch 3/6
[1m10176/10176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1234s[0m 121ms/step - accuracy: 0.9289 - loss: 0.1801 - val_accuracy: 0.9017 - val_loss: 0.2638
Epoch 4/6
[1m10176/10176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1239s[0m 122ms/step - accuracy: 0.9440 - loss: 0.1420 - val_accuracy: 0.8954 - val_loss: 0.2858
Epoch 5/6
[1m10176/10176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1218s[0m 120ms/step - accuracy: 0.9589 - loss: 0.1068 - val_accuracy: 0.8886 - val_loss: 0.3224
Epoch 6/6
[1m10176/10176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1168s[0m 115ms/step - accuracy: 0.9707 - loss: 0.0769 - val_accuracy: 0.8885

In [57]:
y_pred_prob2 = model2.predict(X_test)
y_pred2 = (y_pred_prob2 > 0.5).astype(int).flatten()

[1m1340/1340[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 35ms/step


In [58]:
print("Accuracy:", accuracy_score(y_test, y_pred2))
print("Precision:", precision_score(y_test, y_pred2))
print("Recall:", recall_score(y_test, y_pred2))
print("F1-score:", f1_score(y_test, y_pred2))

Accuracy: 0.8881812243374393
Precision: 0.914466569120173
Recall: 0.931805659093119
F1-score: 0.9230546948997448


In [64]:
# Модел 1 со 128 слоја во LSTM и трениран за 5 епохи
# Модел 2 со 256 слоја во LSTM и додаден Dropout слој со веројатност 30% и трениран за 6 епохи
# Резултатите се идентични