In [1]:
import pandas as pd
import numpy as np

import re
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [2]:
df = pd.read_json('review_balanced.json')

df = df[df['stars'].isin([1, 5])].reset_index(drop=True)
df['sentiment'] = df['stars'].apply(lambda x: 1 if x == 5 else 0)

TURKISH_STOPWORDS = set([
    "ve", "bir", "bu", "da", "de", "ile", "mi", "çok", "ben", "sen", "o", "biz", "siz", 
    "onlar", "ne", "ya", "ama", "eğer", "çünkü", "ki", "mı", "gibi", "daha", "hem", 
    "veya", "şimdi", "ise", "her", "şu", "için", "hiç", "neden", "sadece", "kadar", 
    "bütün", "herkes", "bazı", "böyle", "diye", "hangi", "nasıl", "nerede", "zaman", 
    "var", "yok", "oldu", "olacak", "olsun", "olmaz"
])

def preprocess_text(text):
    text = text.str.lower().replace("ı", "i").replace("ğ", "g").replace("ü", "u").replace("ş", "s").replace("ç", "c").replace("ö", "o")
    
    text = text.str.replace(r'<[^>]*>', '', regex=True)
    
    text = text.str.replace(r'[^a-zA-Zçğıöşü\s]', '', regex=True)
    
    text = text.apply(lambda x: ' '.join([word for word in x.split() if word not in TURKISH_STOPWORDS]))
    
    return text

df['text'] = preprocess_text(df['text'])

In [3]:
tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')
tokenizer.fit_on_texts(df['text'])
sequences = tokenizer.texts_to_sequences(df['text'])
padded_sequences = pad_sequences(sequences, maxlen=100, padding='post', truncating='post')

X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences, df['sentiment'], test_size=0.2, random_state=42
)

y_train = np.array(y_train)
y_test = np.array(y_test)

In [4]:
class RNNModel(Sequential):
    def __init__(self):
        super().__init__()
        self.add(Embedding(input_dim=10000, output_dim=64, input_length=100))
        self.add(LSTM(256, return_sequences=True))
        self.add(LSTM(128, return_sequences=True))
        self.add(LSTM(64, return_sequences=False))
        self.add(Dense(1, activation='sigmoid'))
        self.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [5]:
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)
class_weights = dict(enumerate(class_weights))

model = RNNModel()

history = model.fit(
    X_train, y_train,
    epochs=15,
    batch_size=32,
    validation_data=(X_test, y_test),
    class_weight=class_weights 
)

Epoch 1/15




[1m242/242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 113ms/step - accuracy: 0.7619 - loss: 0.5408 - val_accuracy: 0.8066 - val_loss: 0.4668
Epoch 2/15
[1m242/242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 137ms/step - accuracy: 0.7942 - loss: 0.5042 - val_accuracy: 0.8056 - val_loss: 0.4695
Epoch 3/15
[1m242/242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 154ms/step - accuracy: 0.7980 - loss: 0.4974 - val_accuracy: 0.8051 - val_loss: 0.4812
Epoch 4/15
[1m242/242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 136ms/step - accuracy: 0.7887 - loss: 0.5078 - val_accuracy: 0.8035 - val_loss: 0.4856
Epoch 5/15
[1m242/242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 150ms/step - accuracy: 0.7922 - loss: 0.5011 - val_accuracy: 0.8030 - val_loss: 0.4770
Epoch 6/15
[1m242/242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 143ms/step - accuracy: 0.7873 - loss: 0.5096 - val_accuracy: 0.7989 - val_loss: 0.4872
Epoch 7/15
[1m242/24

In [7]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 43ms/step - accuracy: 0.9519 - loss: 0.2502
Test Accuracy: 95.40%
