In [None]:
!pip -q uninstall -y \
  tensorflow tensorflow-cpu tensorflow-gpu keras tf-keras tensorflow-text \
  tensorflow-decision-forests keras-hub dopamine-rl jax jaxlib dm-tree


[0m

In [None]:
# TF 2.19 ekosistemi birbiriyle uyumlu çalışır
!pip -q install "tensorflow==2.19.*" "tf-keras==2.19.*" "tensorflow-text==2.19.*"
# sklearn'i güncellemek istersen (opsiyonel)
!pip -q install -U scikit-learn


In [None]:
import os, json, numpy as np, pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

# Sabitler
SEED = 42
tf.keras.utils.set_random_seed(SEED)

TEXT_COL = "clean_text"
TARGET_COL = "main_label"

MAX_WORDS = 20000
MAXLEN    = 40
EMB_DIM   = 128
LSTM_UNITS= 64
BATCH     = 256
EPOCHS    = 15



In [24]:
import pandas as pd

df = pd.read_csv("/content/clean_cv_data.csv")
print(df.shape)
df.head()


(40001, 3)


Unnamed: 0,clean_text,main_label,sub_label
0,Jitesh Vishwakarma,meta,others
1,E-mail-Id: - jvishwakarma123@gmail.com,meta,others
2,Contact Number: - 9960902548,meta,others
3,PROFESSIONAL SUMMARY:,header,experience
4,· 4 years of technical experience in implement...,content,experience


In [22]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [25]:
print(df[TARGET_COL].value_counts())
df["len"] = df[TEXT_COL].astype(str).str.split().apply(len)
print("Ortalama uzunluk:", round(df["len"].mean(), 2), "| 95.percentile:", np.percentile(df["len"], 95))


main_label
content    27341
meta        7311
header      5349
Name: count, dtype: int64
Ortalama uzunluk: 6.52 | 95.percentile: 19.0


In [26]:
le = LabelEncoder()
y = le.fit_transform(df[TARGET_COL].astype(str))

X_train, X_temp, y_train, y_temp = train_test_split(
    df[TEXT_COL].astype(str), y, test_size=0.2, random_state=SEED, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=SEED, stratify=y_temp
)

print("Sınıf sayısı:", len(le.classes_), "→", le.classes_)



Sınıf sayısı: 3 → ['content' 'header' 'meta']


In [27]:
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

def to_pad(texts):
    seqs = tokenizer.texts_to_sequences(texts)
    return pad_sequences(seqs, maxlen=MAXLEN, padding="post", truncating="post")

X_train_pad = to_pad(X_train)
X_val_pad   = to_pad(X_val)
X_test_pad  = to_pad(X_test)

vocab_size = min(MAX_WORDS, len(tokenizer.word_index) + 1)
print("Vocab size:", vocab_size)

vocab_size


Vocab size: 17890


17890

In [28]:
cls_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(y_train),
    y=y_train
)
class_weight_dict = {i:w for i,w in enumerate(cls_weights)}
print("Class weights:", class_weight_dict)


Class weights: {0: np.float64(0.48768593026091195), 1: np.float64(2.492794266573187), 2: np.float64(1.8236735624323246)}


In [29]:
cls_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(y_train),
    y=y_train
)
class_weight_dict = {i:w for i,w in enumerate(cls_weights)}
print("Class weights:", class_weight_dict)


Class weights: {0: np.float64(0.48768593026091195), 1: np.float64(2.492794266573187), 2: np.float64(1.8236735624323246)}


In [30]:
chk_path = "/content/bilstm_best.keras"
callbacks = [
    EarlyStopping(monitor="val_accuracy", patience=3, restore_best_weights=True, verbose=1),
    ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=2, verbose=1),
    ModelCheckpoint(chk_path, monitor="val_accuracy", save_best_only=True, verbose=1)
]


In [31]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional

model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=EMB_DIM, input_length=MAXLEN),
    Bidirectional(LSTM(LSTM_UNITS, return_sequences=False)),
    Dropout(0.4),
    Dense(64, activation="relu"),
    Dropout(0.3),
    Dense(len(le.classes_), activation="softmax")
])

model.compile(optimizer="adam",
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])

model.summary()




In [32]:
history = model.fit(
    X_train_pad, y_train,
    validation_data=(X_val_pad, y_val),
    epochs=EPOCHS,
    batch_size=BATCH,
    class_weight=class_weight_dict,
    callbacks=callbacks,
    verbose=2
)


Epoch 1/15

Epoch 1: val_accuracy improved from None to 0.84400, saving model to /content/bilstm_best.keras
125/125 - 9s - 71ms/step - accuracy: 0.7251 - loss: 0.7028 - val_accuracy: 0.8440 - val_loss: 0.4466 - learning_rate: 1.0000e-03
Epoch 2/15

Epoch 2: val_accuracy improved from 0.84400 to 0.85300, saving model to /content/bilstm_best.keras
125/125 - 2s - 13ms/step - accuracy: 0.8748 - loss: 0.3900 - val_accuracy: 0.8530 - val_loss: 0.4135 - learning_rate: 1.0000e-03
Epoch 3/15

Epoch 3: val_accuracy did not improve from 0.85300
125/125 - 3s - 21ms/step - accuracy: 0.8985 - loss: 0.3078 - val_accuracy: 0.8418 - val_loss: 0.4421 - learning_rate: 1.0000e-03
Epoch 4/15

Epoch 4: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.

Epoch 4: val_accuracy did not improve from 0.85300
125/125 - 1s - 11ms/step - accuracy: 0.9105 - loss: 0.2641 - val_accuracy: 0.8530 - val_loss: 0.4160 - learning_rate: 1.0000e-03
Epoch 5/15

Epoch 5: val_accuracy improved from 0.85300 to 0.8

In [33]:
y_pred_proba = model.predict(X_test_pad, verbose=0)
y_pred = np.argmax(y_pred_proba, axis=1)

acc = accuracy_score(y_test, y_pred)
macro_f1 = f1_score(y_test, y_pred, average="macro")

print("Test Accuracy:", round(acc, 4))
print("Macro F1     :", round(macro_f1, 4))
print()
print(classification_report(y_test, y_pred, target_names=le.classes_))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))



Test Accuracy: 0.8648
Macro F1     : 0.8149

              precision    recall  f1-score   support

     content       0.94      0.89      0.92      2735
      header       0.69      0.88      0.77       535
        meta       0.75      0.76      0.75       731

    accuracy                           0.86      4001
   macro avg       0.79      0.84      0.81      4001
weighted avg       0.87      0.86      0.87      4001

Confusion Matrix:
 [[2434  133  168]
 [  46  469   20]
 [  99   75  557]]


In [34]:
import pickle
SAVE_DIR = "/content/lstm_artifacts"
os.makedirs(SAVE_DIR, exist_ok=True)

with open(os.path.join(SAVE_DIR, "tokenizer.pkl"), "wb") as f:
    pickle.dump(tokenizer, f)

with open(os.path.join(SAVE_DIR, "label_encoder.pkl"), "wb") as f:
    pickle.dump(le, f)

model.save(os.path.join(SAVE_DIR, "bilstm_model.keras"))

print("Kaydedildi:", SAVE_DIR)



Kaydedildi: /content/lstm_artifacts


In [35]:
def predict_texts(texts, topk=1):
    seq = to_pad(pd.Series(texts).astype(str))
    proba = model.predict(seq, verbose=0)
    idx = np.argsort(-proba, axis=1)[:, :topk]
    labels = [[le.classes_[i] for i in row] for row in idx]
    scores = [[float(proba[r, i]) for i in row] for r, row in enumerate(idx)]
    return list(zip(labels, scores))

# örnek kullanım
samples = [
    "education bachelor of science in computer engineering",
    "skills python java sql machine learning keras",
    "interests reading traveling football"
]
predict_texts(samples, topk=2)



[(['content', 'meta'], [0.6854285597801208, 0.2061707228422165]),
 (['content', 'header'], [0.93573397397995, 0.0533885732293129]),
 (['meta', 'content'], [0.8880570530891418, 0.09289528429508209])]