In [2]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [3]:
!pip -q show scikit-learn

import numpy as np, pandas as pd, sklearn, warnings
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score, confusion_matrix
warnings.filterwarnings("ignore")

# Deterministik davranış
np.random.seed(42)
print("sklearn version:", sklearn.__version__)


sklearn version: 1.6.1


In [11]:
import pandas as pd

CSV_PATH = "/content/clean_cv_data.csv"

df = None
try:
    df = pd.read_csv(CSV_PATH)
    print("✅ CSV yüklendi:", CSV_PATH)
except Exception as e:
    print("❌ CSV okunamadı:", e)

assert df is not None, "Dosya bulunamadı. Lütfen dosya yolunu kontrol et."
print("\nVeri şekli (satır, sütun):", df.shape)
df.head(3)


✅ CSV yüklendi: /content/clean_cv_data.csv

Veri şekli (satır, sütun): (40001, 3)


Unnamed: 0,clean_text,main_label,sub_label
0,Jitesh Vishwakarma,meta,others
1,E-mail-Id: - jvishwakarma123@gmail.com,meta,others
2,Contact Number: - 9960902548,meta,others


In [12]:
expected_text_col = "clean_text"
expected_label_col = "main_label"

missing = [c for c in [expected_text_col, expected_label_col] if c not in df.columns]
assert not missing, f"Beklenen sütun(lar) eksik: {missing}. Lütfen datasetini kontrol et."

# Tip ve NA temizliği
df = df[[expected_text_col, expected_label_col]].copy()
df[expected_text_col] = df[expected_text_col].astype(str).fillna("")
df[expected_label_col] = df[expected_label_col].astype(str)
df = df.dropna().reset_index(drop=True)

print("Sütunlar:", df.columns.tolist())
print("Örnek satır sayısı:", len(df))
df[expected_label_col].value_counts()


Sütunlar: ['clean_text', 'main_label']
Örnek satır sayısı: 40001


Unnamed: 0_level_0,count
main_label,Unnamed: 1_level_1
content,27341
meta,7311
header,5349


In [13]:
X = df[expected_text_col]
y = df[expected_label_col]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.20,
    random_state=42,
    stratify=y
)

print(f"Eğitim Seti: {len(X_train)} | Test Seti: {len(X_test)}")
y_train.value_counts(normalize=True).rename("train_ratio").to_frame().join(
    y_test.value_counts(normalize=True).rename("test_ratio"),
    how="outer"
).fillna(0).sort_index()


Eğitim Seti: 32000 | Test Seti: 8001


Unnamed: 0_level_0,train_ratio,test_ratio
main_label,Unnamed: 1_level_1,Unnamed: 2_level_1
content,0.6835,0.68354
header,0.133719,0.133733
meta,0.182781,0.182727


In [14]:
tfidf = TfidfVectorizer(
    lowercase=True,
    stop_words="english",
    ngram_range=(1, 2),
    min_df=5
)

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf  = tfidf.transform(X_test)

print("Vocabulary size:", X_train_tfidf.shape[1])
print("Train TF-IDF shape:", X_train_tfidf.shape)
print("Test  TF-IDF shape:", X_test_tfidf.shape)


Vocabulary size: 5788
Train TF-IDF shape: (32000, 5788)
Test  TF-IDF shape: (8001, 5788)


In [15]:
nb_model = MultinomialNB()  # alpha=1.0 (default)
print("Model eğitiliyor...")
nb_model.fit(X_train_tfidf, y_train)
print("Eğitim tamam.")

y_pred = nb_model.predict(X_test_tfidf)

print("\n6. Naive Bayes Sonuçları (Test Seti Üzerinde):")
print("-" * 50)
print(classification_report(y_test, y_pred))
print(f"Genel (Macro Average) F1-Score: {f1_score(y_test, y_pred, average='macro'):.4f}")
print("-" * 50)

labels_in_model_order = nb_model.classes_
cm = confusion_matrix(y_test, y_pred, labels=labels_in_model_order)
print("Sınıf sırası (model):", labels_in_model_order.tolist())
print("Karışıklık Matrisi (ham sayılar):\n", cm)


Model eğitiliyor...
Eğitim tamam.

6. Naive Bayes Sonuçları (Test Seti Üzerinde):
--------------------------------------------------
              precision    recall  f1-score   support

     content       0.87      0.96      0.91      5469
      header       0.86      0.70      0.77      1070
        meta       0.81      0.63      0.71      1462

    accuracy                           0.86      8001
   macro avg       0.85      0.76      0.80      8001
weighted avg       0.86      0.86      0.86      8001

Genel (Macro Average) F1-Score: 0.7979
--------------------------------------------------
Sınıf sırası (model): ['content', 'header', 'meta']
Karışıklık Matrisi (ham sayılar):
 [[5231   85  153]
 [ 249  751   70]
 [ 505   36  921]]


In [16]:
import pandas as pd
cm_df = pd.DataFrame(
    cm,
    index=[f"true_{c}" for c in labels_in_model_order],
    columns=[f"pred_{c}" for c in labels_in_model_order]
)
cm_df


Unnamed: 0,pred_content,pred_header,pred_meta
true_content,5231,85,153
true_header,249,751,70
true_meta,505,36,921


In [17]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold

pipe = Pipeline([
    ("tfidf", TfidfVectorizer(stop_words="english")),
    ("nb", MultinomialNB())
])

param_grid = {
    "tfidf__ngram_range": [(1,1), (1,2)],
    "tfidf__min_df": [3, 5],
    "nb__alpha": [0.5, 1.0, 1.5]
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
gs = GridSearchCV(pipe, param_grid, scoring="f1_macro", cv=cv, n_jobs=-1, verbose=0)
gs.fit(X_train, y_train)

print("Best params:", gs.best_params_)
print("CV f1_macro (best):", gs.best_score_)

best = gs.best_estimator_
y_pred_gs = best.predict(X_test)
print("\nGridSearch Sonuçları (Test):")
print("-" * 50)
print(classification_report(y_test, y_pred_gs))
print(f"Macro F1:", f1_score(y_test, y_pred_gs, average='macro'))


Best params: {'nb__alpha': 0.5, 'tfidf__min_df': 3, 'tfidf__ngram_range': (1, 1)}
CV f1_macro (best): 0.8009072139817546

GridSearch Sonuçları (Test):
--------------------------------------------------
              precision    recall  f1-score   support

     content       0.89      0.95      0.92      5469
      header       0.85      0.74      0.79      1070
        meta       0.81      0.66      0.73      1462

    accuracy                           0.87      8001
   macro avg       0.85      0.78      0.81      8001
weighted avg       0.87      0.87      0.87      8001

Macro F1: 0.8124912957072832


In [18]:
import numpy as np
feat_names = np.array(tfidf.get_feature_names_out())
for i, cls in enumerate(nb_model.classes_):
    topk = 20
    idx = nb_model.feature_log_prob_[i].argsort()[::-1][:topk]
    print(f"\n[{cls}] için en ayırt edici {topk} n-gram:")
    print(", ".join(feat_names[idx]))



[content] için en ayırt edici 20 n-gram:
client, university, team, management, board, college, year, project, data, company, duration, com, process, school, role, organization, designation, sap, business, experience

[header] için en ayırt edici 20 n-gram:
experience, project, responsibilities, personal, objective, skills, details, qualification, profile, summary, declaration, education, personal details, professional, work, work experience, career, academic, resume, date

[meta] için en ayırt edici 20 n-gram:
com, date, gmail, gmail com, 91, place, indian, mobile, email, address, birth, father, date birth, nationality, mail, english, male, status, nationality indian, contact


In [19]:
import joblib, os
SAVE_DIR = "/content/drive/MyDrive/nb_tfidf_model"
os.makedirs(SAVE_DIR, exist_ok=True)
joblib.dump(nb_model, os.path.join(SAVE_DIR, "multinomial_nb.joblib"))
joblib.dump(tfidf, os.path.join(SAVE_DIR, "tfidf_vectorizer.joblib"))
print("Kaydedildi:", SAVE_DIR)


Kaydedildi: /content/drive/MyDrive/nb_tfidf_model
