In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/glove6b100dtxt/glove.6B.100d.txt
/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv


In [3]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("danielwillgeorge/glove6b100dtxt")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/glove6b100dtxt


In [4]:
# -*- coding: utf-8 -*-
# ===============================
# Sentiment140 - CNN & BiLSTM (GloVe)
# Hazırlayan: Elif Nur Yılmaz
# ===============================

import os
import random
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

import tensorflow as tf
tf.random.set_seed(SEED)

# sklearn
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.metrics import (
    confusion_matrix, classification_report,
    accuracy_score, precision_recall_fscore_support
)

# keras / tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import (
    Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D,
    Dense, Dropout, Input, Concatenate,
    LSTM, Bidirectional
)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# NLP preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('omw-1.4')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


2025-10-12 09:30:18.943170: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1760261419.187875      37 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1760261419.262706      37 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [5]:
# ========== CONFIG ==========
CSV_PATH = "/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv"
GLOVE_PATH = "/kaggle/input/glove6b100dtxt/glove.6B.100d.txt"

SAMPLE_FOR_DEBUG = False
SAMPLE_SIZE = 100000

MAX_WORDS = 10000
MAX_LEN = 50
EMB_DIM = 100
BATCH_SIZE = 128
EPOCHS_QUICK = 3
EPOCHS_FULL = 10

# ========== LOAD DATA ==========
df = pd.read_csv(CSV_PATH, encoding="ISO-8859-1", header=None)
df.columns = ["label", "time", "date", "query", "username", "text"]
df["label"] = df["label"].replace(4, 1)

if SAMPLE_FOR_DEBUG:
    df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=SEED)

print(df.head())
print(df["label"].value_counts())


   label        time                          date     query         username  \
0      0  1467810369  Mon Apr 06 22:19:45 PDT 2009  NO_QUERY  _TheSpecialOne_   
1      0  1467810672  Mon Apr 06 22:19:49 PDT 2009  NO_QUERY    scotthamilton   
2      0  1467810917  Mon Apr 06 22:19:53 PDT 2009  NO_QUERY         mattycus   
3      0  1467811184  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY          ElleCTF   
4      0  1467811193  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY           Karoli   

                                                text  
0  @switchfoot http://twitpic.com/2y1zl - Awww, t...  
1  is upset that he can't update his Facebook by ...  
2  @Kenichan I dived many times for the ball. Man...  
3    my whole body feels itchy and like its on fire   
4  @nationwideclass no, it's not behaving at all....  
label
0    800000
1    800000
Name: count, dtype: int64


In [6]:
def preprocess_for_cnn(text):
    text = str(text).lower()
    text = re.sub(r"http\S+|www\S+", " ", text)
    text = re.sub(r"@\w+", " ", text)
    text = re.sub(r"#(\w+)", r"\1", text)
    text = re.sub(r"\d+", " ", text)
    text = re.sub(r"[^\w\s]", " ", text)
    text = re.sub(r"(.)\1{2,}", r"\1\1", text)
    text = re.sub(r"\bnot\s+(\w+)", r"not_\1", text)
    text = re.sub(r"\s+", " ", text).strip()

    tokens = text.split()
    tokens = [t for t in tokens if t not in stop_words]
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    return " ".join(tokens)

df["clean_text"] = df["text"].astype(str).apply(preprocess_for_cnn)
X = df["clean_text"].values
y = df["label"].values

X_train_all, X_test, y_train_all, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED, stratify=y
)

print("Train:", len(X_train_all), "Test:", len(X_test))


Train: 1280000 Test: 320000


In [7]:
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train_all)

X_train_seq_all = tokenizer.texts_to_sequences(X_train_all)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad_all = pad_sequences(X_train_seq_all, maxlen=MAX_LEN, padding='post', truncating='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post', truncating='post')

word_index = tokenizer.word_index
vocab_size = min(MAX_WORDS, len(word_index)) + 1

print("Vocab size:", vocab_size)
print("Train shape:", X_train_pad_all.shape)


Vocab size: 10001
Train shape: (1280000, 50)


In [8]:
emb_index = {}
with open(GLOVE_PATH, 'r', encoding='utf8', errors='ignore') as f:
    for line in f:
        vals = line.rstrip().split(' ')
        word = vals[0]
        coefs = np.asarray(vals[1:], dtype='float32')
        if coefs.shape[0] != EMB_DIM:
            continue
        emb_index[word] = coefs

print(f"GloVe loaded: {len(emb_index):,} vectors")

embedding_matrix = np.zeros((vocab_size, EMB_DIM))
for word, i in word_index.items():
    if i >= vocab_size:
        continue
    vec = emb_index.get(word)
    if vec is not None:
        embedding_matrix[i] = vec

print("Embedding matrix shape:", embedding_matrix.shape)


GloVe loaded: 400,000 vectors
Embedding matrix shape: (10001, 100)


In [9]:
val_split = 0.1
val_count = int(len(X_train_pad_all) * val_split)
X_val = X_train_pad_all[:val_count]
y_val = y_train_all[:val_count]
X_train_main = X_train_pad_all[val_count:]
y_train_main = y_train_all[val_count:]

train_ds = tf.data.Dataset.from_tensor_slices((X_train_main, y_train_main)).shuffle(10000, seed=SEED).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
val_ds = tf.data.Dataset.from_tensor_slices((X_val, y_val)).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True, verbose=1)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, min_lr=1e-6, verbose=1)


2025-10-12 09:38:58.247779: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


In [10]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dropout, Dense
from tensorflow.keras.optimizers import Adam

# LSTM modeli
lstm_model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=EMB_DIM, 
              input_length=MAX_LEN, weights=[embedding_matrix], trainable=False),
    LSTM(128, return_sequences=False),  # Tek yönlü LSTM
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')  # Binary sınıflandırma
])

lstm_model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

print(lstm_model.summary())




None


In [11]:
from tensorflow.keras.layers import Bidirectional

# BiLSTM modeli
bilstm_model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=EMB_DIM, 
              input_length=MAX_LEN, weights=[embedding_matrix], trainable=False),
    Bidirectional(LSTM(128, return_sequences=False)),  # İleri + Geri okuma
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')  # Binary sınıflandırma
])

bilstm_model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

print(bilstm_model.summary())


None


In [12]:
# Eğitim parametreleri
EPOCHS = 5  # küçük test için, tam eğitimde artırabilirsin
BATCH_SIZE = 128

# LSTM eğitimi
hist_lstm = lstm_model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=[early_stopping, reduce_lr],
    verbose=1
)

# BiLSTM eğitimi
hist_bilstm = bilstm_model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=[early_stopping, reduce_lr],
    verbose=1
)


Epoch 1/5
[1m9000/9000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m842s[0m 93ms/step - accuracy: 0.7334 - loss: 0.5331 - val_accuracy: 0.7732 - val_loss: 0.4702 - learning_rate: 0.0010
Epoch 2/5
[1m9000/9000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m830s[0m 92ms/step - accuracy: 0.7777 - loss: 0.4708 - val_accuracy: 0.7808 - val_loss: 0.4572 - learning_rate: 0.0010
Epoch 3/5
[1m9000/9000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m834s[0m 93ms/step - accuracy: 0.7878 - loss: 0.4535 - val_accuracy: 0.7877 - val_loss: 0.4473 - learning_rate: 0.0010
Epoch 4/5
[1m9000/9000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m834s[0m 93ms/step - accuracy: 0.7941 - loss: 0.4423 - val_accuracy: 0.7882 - val_loss: 0.4466 - learning_rate: 0.0010
Epoch 5/5
[1m9000/9000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m834s[0m 93ms/step - accuracy: 0.7990 - loss: 0.4338 - val_accuracy: 0.7882 - val_loss: 0.4456 - learning_rate: 0.0010
Restoring model weights from the end of the best e

In [15]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix
import numpy as np

def evaluate_model(model, X_test_pad, y_test):
    y_pred_prob = model.predict(X_test_pad, batch_size=BATCH_SIZE).ravel()
    y_pred = (y_pred_prob > 0.5).astype(int)
    
    acc = accuracy_score(y_test, y_pred)
    prec, rec, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary', zero_division=0)
    
    print(f"Accuracy: {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall: {rec:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("\nClassification Report:\n", classification_report(y_test, y_pred, digits=4))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    
    return acc, prec, rec, f1

# Test
print("=== LSTM MODEL ===")
evaluate_model(lstm_model, X_test_pad, y_test)

print("\n=== BiLSTM MODEL ===")
evaluate_model(bilstm_model, X_test_pad, y_test)


=== LSTM MODEL ===
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 39ms/step
Accuracy: 0.7892
Precision: 0.7676
Recall: 0.8296
F1 Score: 0.7974

Classification Report:
               precision    recall  f1-score   support

           0     0.8146    0.7488    0.7803    160000
           1     0.7676    0.8296    0.7974    160000

    accuracy                         0.7892    320000
   macro avg     0.7911    0.7892    0.7888    320000
weighted avg     0.7911    0.7892    0.7888    320000

Confusion Matrix:
 [[119811  40189]
 [ 27272 132728]]

=== BiLSTM MODEL ===
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 54ms/step
Accuracy: 0.7875
Precision: 0.8040
Recall: 0.7603
F1 Score: 0.7815

Classification Report:
               precision    recall  f1-score   support

           0     0.7727    0.8146    0.7931    160000
           1     0.8040    0.7603    0.7815    160000

    accuracy                         0.7875    320000
   macro avg     

(0.787478125, 0.8039905094872083, 0.76031875, 0.7815450243969818)