In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import hamming_loss, f1_score, roc_auc_score,accuracy_score, classification_report, ConfusionMatrixDisplay, make_scorer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from skmultilearn.model_selection import IterativeStratification
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from model3 import ManualLogisticRegressionOneVsRest
from scipy.sparse import hstack
import joblib

In [2]:
DATA_TRAIN_DIR = "../../ML_Approaching/Preprocessing/train_data_preprocessing.csv"
DATA_VAL_DIR = "../../ML_Approaching/Preprocessing/val_data_preprocessing.csv"
DATA_TEST_DIR = "../../ML_Approaching/Preprocessing/test_data_preprocessing.csv"

In [3]:
def load_data(file_path):

    df = pd.read_csv(file_path)
    
    # 3. ƒê·ªãnh nghƒ©a c√°c c·ªôt nh√£n
    label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    
    X = df['clean_text'].astype(str)
    
    y = df[label_cols]
    
    return X, y

# --- TH·ª∞C HI·ªÜN LOAD D·ªÆ LI·ªÜU ---
print("ƒêang load d·ªØ li·ªáu...")

X_train, y_train = load_data(DATA_TRAIN_DIR)
X_val, y_val     = load_data(DATA_VAL_DIR)
X_test, y_test   = load_data(DATA_TEST_DIR)

ƒêang load d·ªØ li·ªáu...


In [4]:
tfidf_vectorizer = TfidfVectorizer(
    strip_accents='unicode',
    ngram_range=(1, 2),       # gi·ªØ ng·ªØ c·∫£nh "not good", "you are"
    min_df=3,                 # l·ªçc nhi·ªÖu nh·∫π
    max_df=0.9,               # lo·∫°i t·ª´ qu√° ph·ªï bi·∫øn
    max_features=50000,
)

In [5]:
tfidf_vectorizer.fit(X_train)

In [6]:
X_train_tfidf = tfidf_vectorizer.transform(X_train)
X_val_tfidf = tfidf_vectorizer.transform(X_val)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [7]:
tfidf_vectorizer.get_feature_names_out()[:100]

array(['__', '__ __', '__toc__', '_friend', '_noticeboard',
       '_noticeboard incidents', 'aa', 'aa aa', 'aah', 'aaliyah', 'aap',
       'aardvark', 'aaron', 'ab', 'aba', 'abandon', 'abandoned',
       'abandoning', 'abandonment', 'abbas', 'abbey', 'abbreviated',
       'abbreviation', 'abbreviations', 'abc', 'abc news', 'abd',
       'abduction', 'abdul', 'abdullah', 'abe', 'abhira', 'abhishek',
       'abhorrent', 'abide', 'abiding', 'abilities', 'ability',
       'ability create', 'ability customize', 'ability edit',
       'ability rename', 'ability upload', 'abit', 'abject', 'abkhazia',
       'able', 'able add', 'able block', 'able change', 'able come',
       'able contribute', 'able determine', 'able edit', 'able find',
       'able get', 'able help', 'able keep', 'able make', 'able post',
       'able provide', 'able read', 'able see', 'able stand', 'able take',
       'able tell', 'able understand', 'able use', 'able work',
       'able write', 'abnormal', 'aboard', 'aboli

In [8]:
print("TF-IDF shape (train):", X_train_tfidf.shape)
print("TF-IDF shape (val):  ", X_val_tfidf.shape)
print("TF-IDF shape (test):", X_test_tfidf.shape)

TF-IDF shape (train): (130038, 50000)
TF-IDF shape (val):   (15958, 50000)
TF-IDF shape (test): (15958, 50000)


In [9]:
def exact_match_ratio(y_true, y_pred):
    return np.mean(np.all(y_true == y_pred, axis=1))

In [10]:
def evaluate_model(model, X_test, y_test):
    # Predict
    y_pred_proba = model.predict_proba(X_test)   # shape (N, L)
    y_pred = (y_pred_proba > 0.5).astype(int)    # threshold = 0.5

    results = {
        "ROC-AUC": roc_auc_score(
            y_test,
            y_pred_proba,
            average="macro"     # ho·∫∑c "micro"
        ),
        "F1_Macro": f1_score(
            y_test,
            y_pred,
            average="macro"
        ),
        "Hamming_Loss": hamming_loss(
            y_test,
            y_pred
        ),
        "EMR": accuracy_score(
            y_test,
            y_pred
        )
    }

    return pd.DataFrame([results])

In [11]:
log_model = ManualLogisticRegressionOneVsRest(learning_rate=1.0,n_iters=1000,lambda_param=0.1)
log_model.fit(X_train_tfidf,y_train)

B·∫Øt ƒë·∫ßu hu·∫•n luy·ªán 6 m√¥ h√¨nh Binary v·ªõi lr=1.0, lambda=0.1...


In [12]:
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
log_results = evaluate_model(log_model, X_test_tfidf, y_test)

print("=== Logistic Regression Performance ===")
display(log_results)
print("Mean ROC-AUC:", log_results["ROC-AUC"])

=== Logistic Regression Performance ===


Unnamed: 0,ROC-AUC,F1_Macro,Hamming_Loss,EMR
0,0.965703,0.475761,0.03787,0.873042


Mean ROC-AUC: 0    0.965703
Name: ROC-AUC, dtype: float64


In [13]:
def find_optimal_thresholds(y_true, y_prob):
    """
    T√¨m ng∆∞·ª°ng t·ªëi ∆∞u cho t·ª´ng nh√£n d·ª±a tr√™n F1-score
    (ƒê√£ fix l·ªói KeyError do Pandas DataFrame)
    """
    # 1. √âP KI·ªÇU V·ªÄ NUMPY ARRAY ƒê·ªÇ TR√ÅNH L·ªñI DATAFRAME SLICING
    # D√π ƒë·∫ßu v√†o l√† DataFrame, List hay Array th√¨ d√≤ng n√†y ƒë·ªÅu x·ª≠ l√Ω ƒë∆∞·ª£c
    y_true = np.array(y_true)
    y_prob = np.array(y_prob)
    
    best_thresholds = []
    thresholds = np.arange(0.01, 0.9, 0.005) # Qu√©t t·ª´ 0.1 ƒë·∫øn 0.9
    
    n_labels = y_true.shape[1]
    
    print(f"ƒêang t√¨m ng∆∞·ª°ng cho {n_labels} nh√£n...")
    
    for i in range(n_labels):
        y_t = y_true[:, i]
        y_p = y_prob[:, i]
        
        best_score = -1
        best_thresh = 0.5
        
        # Ki·ªÉm tra n·∫øu c·ªôt nh√£n to√†n l√† 0 (v√≠ d·ª• t·∫≠p Val kh√¥ng c√≥ m·∫´u threat n√†o)
        if np.sum(y_t) == 0:
            print(f"Label {i}: Kh√¥ng c√≥ m·∫´u Positive n√†o trong t·∫≠p Val. Gi·ªØ nguy√™n threshold 0.5")
            best_thresholds.append(0.5)
            continue

        for thresh in thresholds:
            pred_binary = (y_p > thresh).astype(int)
            score = f1_score(y_t, pred_binary)
            
            if score > best_score:
                best_score = score
                best_thresh = thresh
                
        best_thresholds.append(best_thresh)
        print(f"Label {i}: Best Threshold = {best_thresh:.2f}, F1-Score = {best_score:.4f}")
        
    return best_thresholds
y_val_proba = log_model.predict_proba(X_val_tfidf)
# --- CH·∫†Y L·∫†I ---
# Kh√¥ng c·∫ßn s·ª≠a code g·ªçi h√†m, ch·ªâ c·∫ßn ch·∫°y l·∫°i cell n√†y
print("B·∫Øt ƒë·∫ßu t√¨m ng∆∞·ª°ng...")
best_thresholds = find_optimal_thresholds(y_val, y_val_proba)
print("\nDanh s√°ch ng∆∞·ª°ng t·ªëi ∆∞u:", best_thresholds)

B·∫Øt ƒë·∫ßu t√¨m ng∆∞·ª°ng...
ƒêang t√¨m ng∆∞·ª°ng cho 6 nh√£n...
Label 0: Best Threshold = 0.53, F1-Score = 0.6972
Label 1: Best Threshold = 0.73, F1-Score = 0.4000
Label 2: Best Threshold = 0.52, F1-Score = 0.7435
Label 3: Best Threshold = 0.84, F1-Score = 0.4490
Label 4: Best Threshold = 0.52, F1-Score = 0.6694
Label 5: Best Threshold = 0.69, F1-Score = 0.3395

Danh s√°ch ng∆∞·ª°ng t·ªëi ∆∞u: [np.float64(0.5349999999999999), np.float64(0.7349999999999999), np.float64(0.5249999999999999), np.float64(0.8399999999999999), np.float64(0.5249999999999999), np.float64(0.695)]


In [14]:
from sklearn.metrics import f1_score, roc_auc_score, hamming_loss, accuracy_score
import pandas as pd
import numpy as np

def evaluate_with_thresholds(model, X_test, y_test, thresholds):
    """
    ƒê√°nh gi√° m√¥ h√¨nh v·ªõi danh s√°ch ng∆∞·ª°ng (thresholds) ri√™ng cho t·ª´ng nh√£n.
    Tr·∫£ v·ªÅ DataFrame ch·ª©a: ROC-AUC, F1-Samples, Hamming Loss, EMR.
    """
    # 1. D·ª± ƒëo√°n x√°c su·∫•t
    print("ƒêang d·ª± ƒëo√°n x√°c su·∫•t tr√™n t·∫≠p Test...")
    y_prob = model.predict_proba(X_test)
    
    # 2. √âp ki·ªÉu v·ªÅ Numpy Array (quan tr·ªçng ƒë·ªÉ tr√°nh l·ªói)
    y_test_np = np.array(y_test)
    y_prob_np = np.array(y_prob)
    
    # 3. √Åp d·ª•ng ng∆∞·ª°ng (Thresholding)
    # T·∫°o ma tr·∫≠n d·ª± ƒëo√°n nh·ªã ph√¢n (0/1) d·ª±a tr√™n ng∆∞·ª°ng t·ª´ng c·ªôt
    y_pred = np.zeros_like(y_prob_np)
    
    print("ƒêang √°p d·ª•ng ng∆∞·ª°ng t·ªëi ∆∞u...")
    for i in range(y_prob_np.shape[1]):
        # N·∫øu x√°c su·∫•t >= ng∆∞·ª°ng c·ªßa nh√£n i -> g√°n b·∫±ng 1, ng∆∞·ª£c l·∫°i 0
        y_pred[:, i] = (y_prob_np[:, i] >= thresholds[i]).astype(int)
        
    # 4. T√≠nh to√°n c√°c ch·ªâ s·ªë
    print("ƒêang t√≠nh to√°n metrics...")
    
    # ROC-AUC: T√≠nh tr√™n x√°c su·∫•t (y_prob), kh√¥ng ph·ª• thu·ªôc threshold
    # average='macro': T√≠nh trung b√¨nh c·ªông AUC c·ªßa c√°c nh√£n
    roc_auc = roc_auc_score(y_test_np, y_prob_np, average='macro')
    
    # F1-Samples: T√≠nh F1 cho t·ª´ng nh√£n r·ªìi l·∫•y trung b√¨nh
    f1_samples = f1_score(y_test_np, y_pred, average='macro')
    
    # Hamming Loss: T·ª∑ l·ªá c√°c nh√£n b·ªã d·ª± ƒëo√°n sai tr√™n t·ªïng s·ªë nh√£n
    # (C√†ng th·∫•p c√†ng t·ªët)
    h_loss = hamming_loss(y_test_np, y_pred)
    
    # Exact Match Ratio (EMR): T·ª∑ l·ªá c√°c m·∫´u ƒë∆∞·ª£c d·ª± ƒëo√°n ƒë√∫ng HO√ÄN TO√ÄN c·∫£ 6 nh√£n
    # (Trong sklearn, accuracy_score cho multi-label ch√≠nh l√† EMR)
    emr = exact_match_ratio(y_test_np, y_pred)
    
    # 5. ƒê√≥ng g√≥i k·∫øt qu·∫£
    results = {
        "Metric": ["ROC-AUC", "F1 Score (Samples)", "Hamming Loss", "Exact Match Ratio"],
        "Value": [roc_auc, f1_samples, h_loss, emr]
    }
    
    return pd.DataFrame(results)

# --- C√ÅCH S·ª¨ D·ª§NG ---
# Gi·∫£ s·ª≠ b·∫°n ƒë√£ c√≥ best_thresholds t·ª´ b∆∞·ªõc tr∆∞·ªõc
# V√† X_test_tfidf, y_test ƒë√£ s·∫µn s√†ng

df_results = evaluate_with_thresholds(log_model, X_test_tfidf, y_test, best_thresholds)

print("\n=== K·∫æT QU·∫¢ ƒê√ÅNH GI√Å CU·ªêI C√ôNG ===")
print(df_results)

ƒêang d·ª± ƒëo√°n x√°c su·∫•t tr√™n t·∫≠p Test...
ƒêang √°p d·ª•ng ng∆∞·ª°ng t·ªëi ∆∞u...
ƒêang t√≠nh to√°n metrics...

=== K·∫æT QU·∫¢ ƒê√ÅNH GI√Å CU·ªêI C√ôNG ===
               Metric     Value
0             ROC-AUC  0.965703
1  F1 Score (Samples)  0.534331
2        Hamming Loss  0.025139
3   Exact Match Ratio  0.901805


# Tinh ch·ªânh Logistic Regression v√† TF IDF

In [16]:
param_grid = {
    # Tinh ch·ªânh TF-IDF
    'tfidf__max_features': [50000],
    'tfidf__ngram_range': [(1, 2), (1,3)],
    'tfidf__min_df': [1,3,5],
    'tfidf__max_df': [0.8, 0.9],

    # Tinh ch·ªânh Model Multi-label
    'clf__learning_rate': [1.0, 0.1],
    'clf__lambda_param': [1.0, 0.1, 0.01],
    'clf__n_iters': [1000]
}

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(strip_accents='unicode')),
    ('clf', ManualLogisticRegressionOneVsRest())
])

cv_strategy = IterativeStratification(n_splits=3, order=1)

roc_auc_score_macro = make_scorer(
    roc_auc_score,
    needs_proba=True,
    average="macro"
)

grid = GridSearchCV(
    pipeline, 
    param_grid, 
    cv=cv_strategy, # B·∫Øt bu·ªôc ph·∫£i set c√°i n√†y
    scoring=roc_auc_score_macro, # T∆∞∆°ng ƒë∆∞∆°ng ROC-AUC Macro
    verbose=3,
    n_jobs=-1
)

print("B·∫Øt ƒë·∫ßu GridSearch Multi-label...")
grid.fit(X_train,y_train)


print("\n--- K·∫æT QU·∫¢ ---")
print(f"Best ROC_AUC_MACRO: {grid.best_score_:.4f}")
print("Best Params:", grid.best_params_)

B·∫Øt ƒë·∫ßu GridSearch Multi-label...
Fitting 3 folds for each of 72 candidates, totalling 216 fits


 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan]


B·∫Øt ƒë·∫ßu hu·∫•n luy·ªán 6 m√¥ h√¨nh Binary v·ªõi lr=1.0, lambda=1.0...

--- K·∫æT QU·∫¢ ---
Best ROC_AUC_MACRO: nan
Best Params: {'clf__lambda_param': 1.0, 'clf__learning_rate': 1.0, 'clf__n_iters': 1000, 'tfidf__max_df': 0.8, 'tfidf__max_features': 50000, 'tfidf__min_df': 1, 'tfidf__ngram_range': (1, 2)}


In [19]:
best_model = grid.best_estimator_
log_results = evaluate_with_thresholds(best_model, X_test, y_test, best_thresholds)

print("=== Logistic Regression (Tuned) Performance ===")
display(log_results)

ƒêang d·ª± ƒëo√°n x√°c su·∫•t tr√™n t·∫≠p Test...
ƒêang √°p d·ª•ng ng∆∞·ª°ng t·ªëi ∆∞u...
ƒêang t√≠nh to√°n metrics...
=== Logistic Regression (Tuned) Performance ===


Unnamed: 0,Metric,Value
0,ROC-AUC,0.965626
1,F1 Score (Samples),0.53558
2,Hamming Loss,0.025128
3,Exact Match Ratio,0.901679


In [20]:
joblib.dump(best_model, 'best_logistic_custom_model.joblib')
print("ƒê√£ l∆∞u m√¥ h√¨nh th√†nh c√¥ng!")

ƒê√£ l∆∞u m√¥ h√¨nh th√†nh c√¥ng!


In [None]:
# L∆∞u thresholds t·ªëi ∆∞u ƒë·ªÉ s·ª≠ d·ª•ng trong production
joblib.dump(best_thresholds, 'optimal_thresholds.joblib')
print(f"\nƒê√£ l∆∞u optimal thresholds: {best_thresholds}")
print("\nTh√¥ng tin chi ti·∫øt:")
for i, label in enumerate(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']):
    print(f"  {label}: {best_thresholds[i]:.2f}")

In [42]:
import numpy as np
import pandas as pd

def analyze_sample_strict_format(model_ovr, text_input, vectorizer, true_labels_vec, optimal_thresholds):
    """
    In ra ph√¢n t√≠ch chi ti·∫øt theo format y√™u c·∫ßu:
    C√¢u -> T·ª´ l·ªçc -> Chi ti·∫øt t·ª´ng nh√£n (B·∫£ng t√≠nh W*x -> Logit -> Prob -> K·∫øt lu·∫≠n).
    """
    # ==========================================================================
    # 1. C√ÇU BAN ƒê·∫¶U & C√ÅC NH√ÉN T∆Ø∆†NG ·ª®NG
    # ==========================================================================
    # L·∫•y t√™n c√°c nh√£n th·ª±c t·∫ø
    label_names_all = model_ovr.labels
    true_active_labels = [label_names_all[i] for i, val in enumerate(true_labels_vec) if val == 1]
    
    # N·∫øu kh√¥ng c√≥ nh√£n n√†o (m·∫´u s·∫°ch)
    if not true_active_labels:
        true_lbl_str = "KH√îNG C√ì (M·∫´u s·∫°ch/Clean)"
    else:
        true_lbl_str = ", ".join([l.upper() for l in true_active_labels])

    print("\n" + "#" * 100)
    print(f"üìå C√ÇU BAN ƒê·∫¶U: \"{str(text_input).strip()}\"")
    print(f"üëâ C√ÅC LABEL T∆Ø∆†NG ·ª®NG TH·ª∞C T·∫æ: [{true_lbl_str}]")
    print("-" * 100)

    # ==========================================================================
    # 2. DANH S√ÅCH C√ÅC T·ª™ S√ÄNG L·ªåC & GI√Å TR·ªä TF-IDF
    # ==========================================================================
    x_vec = vectorizer.transform([text_input])
    feature_indices = x_vec.indices
    tfidf_values = x_vec.data
    feature_names = vectorizer.get_feature_names_out()
    
    print("DATA INPUT: DANH S√ÅCH C√ÅC T·ª™ ƒê∆Ø·ª¢C L·ªåC K√àM GI√Å TR·ªä TF-IDF:")
    if len(feature_indices) == 0:
        print("   (Kh√¥ng t√¨m th·∫•y t·ª´ n√†o trong t·ª´ ƒëi·ªÉn)")
    else:
        # In ra d·∫°ng list d·ªÖ nh√¨n
        words_display = [f"'{feature_names[i]}': {v:.4f}" for i, v in zip(feature_indices, tfidf_values)]
        print("   " + " | ".join(words_display))
    
    print("=" * 100)

    # ==========================================================================
    # 3. PH√ÇN T√çCH CHI TI·∫æT T·ª™NG NH√ÉN (L√ÄM ƒê·ªÄU H·∫æT C√ÅC NH√ÉN)
    # ==========================================================================
    
    for i, label_name in enumerate(label_names_all):
        sub_model = model_ovr.models[i]
        threshold = optimal_thresholds[i]
        bias = sub_model.bias
        weights = sub_model.weights
        
        print(f"\nüè∑Ô∏è  PH√ÇN T√çCH NH√ÉN: {label_name.upper()}")
        print(f"   {'-'*75}")
        print(f"   {'T·ª™ (TOKEN)':<20} | {'TR·ªåNG S·ªê (W)':<15} | {'TF-IDF (x)':<15} | {'W * x':<15}")
        print(f"   {'-'*75}")
        
        z_sum_words = 0
        
        # Duy·ªát qua t·ª´ng t·ª´ ƒë·ªÉ in b·∫£ng t√≠nh
        for idx, tfidf_val in zip(feature_indices, tfidf_values):
            word = feature_names[idx]
            w_val = weights[idx]
            contribution = w_val * tfidf_val
            z_sum_words += contribution
            
            # In d√≤ng chi ti·∫øt
            print(f"   {word:<20} | {w_val:<15.4f} | {tfidf_val:<15.4f} | {contribution:<15.4f}")
            
        print(f"   {'-'*75}")
        
        # T√≠nh to√°n Logit & X√°c su·∫•t
        total_logit = bias + z_sum_words
        prob = 1 / (1 + np.exp(-total_logit))
        
        # K·∫øt lu·∫≠n
        pred_val = 1 if prob > threshold else 0
        true_val = true_labels_vec[i]
        match_str = "‚úÖ ƒê√öNG" if pred_val == true_val else "‚ùå SAI"
        
        print(f"   ‚ñ∫ Bias (H·ªá s·ªë t·ª± do) : {bias:.4f}")
        print(f"   ‚ñ∫ T·ªïng ƒë√≥ng g√≥p t·ª´   : {z_sum_words:.4f}")
        print(f"   ------------------------------")
        print(f"   ‚ñ∫ GI√Å TR·ªä LOGIT (z)  : {total_logit:.4f}")
        print(f"   ‚ñ∫ X√ÅC SU·∫§T (P)       : {prob:.4f}")
        print(f"   ‚ñ∫ SO S√ÅNH NG∆Ø·ª†NG     : {prob:.4f} {'L·ªöN H∆†N' if prob > threshold else 'NH·ªé H∆†N'} {threshold:.4f}")
        print(f"   ‚ñ∫ K·∫æT LU·∫¨N           : D·ª± ƒëo√°n = {pred_val} | Th·ª±c t·∫ø = {true_val} -> {match_str}")
    
    print("\n" + "="*100 + "\n")

# ==============================================================================
# PH·∫¶N CH·∫†Y: CH·ªåN 3 M·∫™U ƒê√öNG V√Ä 3 M·∫™U SAI ƒê·ªÇ IN
# ==============================================================================

# L·∫•y c√°c th√†nh ph·∫ßn
best_pipeline = grid.best_estimator_
vectorizer_final = best_pipeline.named_steps['tfidf']
clf_final = best_pipeline.named_steps['clf']

y_test_vals = y_test.values if hasattr(y_test, 'values') else np.array(y_test)
# Reset index cho X_test ƒë·ªÉ kh·ªõp v·ªã tr√≠ v·ªõi numpy array
X_test_reset = X_test.reset_index(drop=True) if hasattr(X_test, 'reset_index') else X_test

# 2. T√≠nh to√°n l·∫°i d·ª± ƒëo√°n & tr·∫°ng th√°i ƒê√∫ng/Sai
y_prob_run = best_pipeline.predict_proba(X_test)
y_pred_run = np.zeros_like(y_prob_run)
for i in range(len(opt_thresholds)):
    y_pred_run[:, i] = (y_prob_run[:, i] > opt_thresholds[i]).astype(int)

# Check ƒë√∫ng sai cho t·ª´ng m·∫´u (ƒê√∫ng l√† ph·∫£i kh·ªõp c·∫£ 6 nh√£n)
is_correct_run = np.all(y_test_vals == y_pred_run, axis=1)

# 3. L·ªåC RI√äNG C√ÅC M·∫™U "TH√ö V·ªä" (C√ì NH√ÉN)

# --- M·∫™U ƒê√öNG V√Ä C√ì NH√ÉN (Toxic Correct) ---
# ƒêi·ªÅu ki·ªán: (ƒê√∫ng ho√†n to√†n) V√Ä (T·ªïng nh√£n th·ª±c t·∫ø > 0)
correct_toxic_mask = is_correct_run & (np.sum(y_test_vals, axis=1) > 0)
correct_toxic_indices = np.where(correct_toxic_mask)[0]

# --- M·∫™U SAI (Nh∆∞ c≈©) ---
incorrect_indices = np.where(~is_correct_run)[0]

print(f"üìä T√åM TH·∫§Y TRONG T·∫¨P TEST:")
print(f"- S·ªë m·∫´u S·∫°ch (Clean) d·ª± ƒëo√°n ƒë√∫ng: {np.sum(is_correct_run & (np.sum(y_test_vals, axis=1) == 0))}")
print(f"- S·ªë m·∫´u ƒê·ªôc h·∫°i (Toxic) d·ª± ƒëo√°n ƒë√∫ng: {len(correct_toxic_indices)} <--- Ta s·∫Ω in c√°i n√†y")
print(f"- S·ªë m·∫´u d·ª± ƒëo√°n sai: {len(incorrect_indices)}")

# ==============================================================================
# TI·∫æN H√ÄNH IN THEO FORMAT B·∫†N Y√äU C·∫¶U
# ==============================================================================

# --- 1. IN 3 M·∫™U ƒê√öNG (NH∆ØNG PH·∫¢I L√Ä M·∫™U C√ì NH√ÉN ƒê·ªòC H·∫†I) ---
print("\n" + "‚òÖ"*40 + " 3 M·∫™U ƒê√öNG (C√ì NH√ÉN TOXIC) " + "‚òÖ"*40)

if len(correct_toxic_indices) == 0:
    print("‚ö†Ô∏è Kh√¥ng t√¨m th·∫•y m·∫´u n√†o c√≥ nh√£n Toxic m√† m√¥ h√¨nh ƒëo√°n ƒë√∫ng c·∫£! (Model qu√° y·∫øu ho·∫∑c Threshold qu√° cao)")
else:
    # L·∫•y 3 m·∫´u ƒë·∫ßu ti√™n t√¨m ƒë∆∞·ª£c
    for idx in correct_toxic_indices[:3]:
        analyze_sample_strict_format(
            model_ovr=clf_final,
            text_input=X_test_reset.iloc[idx] if hasattr(X_test_reset, 'iloc') else X_test_reset[idx],
            vectorizer=vectorizer_final,
            true_labels_vec=y_test_vals[idx],
            optimal_thresholds=best_thresholds
        )

# --- 2. IN 3 M·∫™U SAI (∆ØU TI√äN FALSE NEGATIVE ƒê·ªÇ PH√ÇN T√çCH) ---
print("\n" + "‚òÖ"*40 + " 3 M·∫™U SAI (ƒêA D·∫†NG) " + "‚òÖ"*40)

diverse_errors = []

# Lo·∫°i 1: S√≥t l·ªçt (Nguy hi·ªÉm nh·∫•t) - Th·ª±c t·∫ø c√≥ nh√£n, nh∆∞ng m√°y ƒëo√°n to√†n 0
fn_indices = np.where((np.sum(y_test_vals, axis=1) > 0) & (np.sum(y_pred_run, axis=1) == 0))[0]
if len(fn_indices) > 0: diverse_errors.append(fn_indices[0])

# Lo·∫°i 2: B√°o ƒë·ªông gi·∫£ - Th·ª±c t·∫ø s·∫°ch, nh∆∞ng m√°y ƒëo√°n c√≥ nh√£n
fp_indices = np.where((np.sum(y_test_vals, axis=1) == 0) & (np.sum(y_pred_run, axis=1) > 0))[0]
if len(fp_indices) > 0: diverse_errors.append(fp_indices[0])

# Lo·∫°i 3: Sai kh√°c (Sai 1 ph·∫ßn) - C√≥ nh√£n, m√°y c≈©ng ƒëo√°n c√≥ nh√£n, nh∆∞ng kh√¥ng kh·ªõp nhau ho√†n to√†n
partial_error_mask = (np.sum(y_test_vals, axis=1) > 0) & (np.sum(y_pred_run, axis=1) > 0) & (~is_correct_run)
partial_indices = np.where(partial_error_mask)[0]
if len(partial_indices) > 0: diverse_errors.append(partial_indices[0])

# N·∫øu kh√¥ng ƒë·ªß 3 lo·∫°i tr√™n th√¨ l·∫•y th√™m m·∫´u sai ng·∫´u nhi√™n
if len(diverse_errors) < 3:
    remaining = [i for i in incorrect_indices if i not in diverse_errors]
    diverse_errors.extend(remaining[:3-len(diverse_errors)])

for idx in diverse_errors:
    analyze_sample_strict_format(
        model_ovr=clf_final,
        text_input=X_test_reset.iloc[idx] if hasattr(X_test_reset, 'iloc') else X_test_reset[idx],
        vectorizer=vectorizer_final,
        true_labels_vec=y_test_vals[idx],
        optimal_thresholds=best_thresholds
    )

üìä T√åM TH·∫§Y TRONG T·∫¨P TEST:
- S·ªë m·∫´u S·∫°ch (Clean) d·ª± ƒëo√°n ƒë√∫ng: 14038
- S·ªë m·∫´u ƒê·ªôc h·∫°i (Toxic) d·ª± ƒëo√°n ƒë√∫ng: 355 <--- Ta s·∫Ω in c√°i n√†y
- S·ªë m·∫´u d·ª± ƒëo√°n sai: 1565

‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ 3 M·∫™U ƒê√öNG (C√ì NH√ÉN TOXIC) ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ

####################################################################################################
üìå C√ÇU BAN ƒê·∫¶U: "tony sidaway obviously fistfuckee loves arm ass"
üëâ C√ÅC LABEL T∆Ø∆†NG ·ª®NG TH·ª∞C T·∫æ: [TOXIC, OBSCENE, INSULT]
----------------------------------------------------------------------------------------------------
DATA INPUT: DANH S√ÅCH C√ÅC T·ª™ ƒê∆Ø·ª¢C L·ªåC K√àM GI√Å TR·ªä TF-IDF:
   'arm': 0.4063 | 'ass': 0.2882 | 'loves': 0.3877 | 'obviously': 0.2653 | 'sidaway': 0.4426 | 'tony'

In [50]:
best_model.named_steps["tfidf"].vocabulary_.__sizeof__()

61516528