In [None]:
import os
import csv
import json
import joblib
import numpy as np
from typing import List, Dict, Tuple
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
# Seed used for the random generators and the train/test split.
RANDOM_STATE = 42
# Lower-case alias kept so existing references to `random_state` keep working.
random_state = RANDOM_STATE

# Upper bound on the number of features kept by the vectorizer.
VOCAB_SIZE = 3000
# Lower-case alias kept so existing references to `vocab_size` keep working.
vocab_size = VOCAB_SIZE

RAW_CSV = "data/raw/sms.csv"              # raw file: 2 columns "label","text"
TRAIN_OUT = "data/processed/train.csv"    # train output
TEST_OUT  = "data/processed/test.csv"     # test output
VEC_PKL   = "artifacts/vectorizer.pkl"    # pickled vectorizer for Steps 3/4
VOCAB_TXT = "artifacts/vocab.txt"         # vocab (for reference)
TEST_SIZE = 0.2                           # 80/20 split

# -----------------------------
# 1) STOPWORDS & CLEANING
# -----------------------------
# Stop words removed during cleaning, grouped by role.
# Contractions are listed without their apostrophe ("dont", "im", ...).
STOPWORDS = {
    # articles
    "a", "an", "the",
    # forms of "to be"
    "is", "are", "am", "was", "were", "be", "been", "being",
    # personal pronouns
    "i", "you", "he", "she", "it", "we", "they",
    "me", "him", "her", "us", "them",
    # demonstratives and place words
    "this", "that", "these", "those", "there", "here",
    # prepositions
    "of", "to", "in", "on", "for", "from", "with", "by", "at", "as",
    "about", "into", "over", "after", "before", "between",
    # conjunctions
    "and", "or", "but", "if", "then", "so", "because", "while", "than",
    "though", "although",
    # negation
    "not", "no",
    # forms of "to do"
    "do", "does", "did", "doing", "done",
    # possessives
    "my", "your", "his", "its", "our", "their",
    # forms of "to have"
    "have", "has", "had", "having",
    # modal verbs
    "will", "would", "shall", "should", "can", "could", "may", "might",
    "must", "cannot",
    # negative contractions
    "dont", "didnt", "doesnt", "isnt", "arent", "wasnt", "werent", "cant",
    "wont", "couldnt", "shouldnt", "wouldnt",
    # other contractions
    "im", "ive", "youre", "hes", "shes", "weve", "theyre", "ill", "youll",
    "lets",
}

def keep_letters_and_spaces(s: str) -> str:
    """
    Replace every character that is not a-z or a space with a space.

    The substitution is one-for-one, so the result has the same length as
    the input. Upper-case letters are outside a-z and are replaced as well,
    which is why the input is expected to be lowercased beforehand.

    Args:
        s: an already-lowercased string

    Returns:
        A string made up only of a-z and spaces

    Example:
        "hello123!@#world" -> "hello      world"  (six spaces, one per removed char)
    """
    return ''.join(
        ch if (ch == ' ' or 'a' <= ch <= 'z') else ' '
        for ch in s
    )


def clean_text(text: str, stopwords: set) -> str:
    """
    Clean a text: lowercase -> strip non-letters -> drop stop words.

    Args:
        text: the text to clean
        stopwords: set of stop words to remove

    Returns:
        The cleaned text, tokens joined by single spaces
        (an empty string when no token survives)

    Steps:
        1. Lowercase
        2. Delete ASCII apostrophes so a contraction stays one token
           ("don't" -> "dont", "won't" -> "wont")
        3. Replace every remaining character other than a-z / space by a space
        4. Split on whitespace (this also collapses repeated spaces)
        5. Drop stop words and tokens of length <= 1
    """
    lowered = text.lower()

    # If the apostrophe were turned into a space, "won't" would split into
    # "won" + "t": the "t" is dropped as a short token and a misleading "won"
    # survives. Stop-word sets that list contractions without apostrophes
    # (e.g. "dont") could also never match such input.
    lowered = lowered.replace("'", "")

    tokens = keep_letters_and_spaces(lowered).split()
    return ' '.join(t for t in tokens if len(t) > 1 and t not in stopwords)


# -----------------------------
# 2) LOAD AND CLEAN DATA
# -----------------------------
def load_and_clean_data(csv_path: str, stopwords: set) -> Tuple[List[str], List[int]]:
    """
    Read the CSV file and clean its messages.

    The first row is treated as a header and discarded. Rows with fewer
    than two columns are skipped, and so are rows whose text is empty after
    cleaning. The file is decoded as latin-1.

    Args:
        csv_path: path to the CSV file (column 1: label, column 2: text)
        stopwords: set of stop words passed on to clean_text

    Returns:
        (texts, labels) - the cleaned texts and their labels (0=ham, 1=spam).
        A label is 1 only when the stripped, lowercased label cell equals
        'spam'; every other value is mapped to 0.
        An empty file yields two empty lists.
    """
    print(f"\nüìñ ƒê·ªçc d·ªØ li·ªáu t·ª´: {csv_path}")

    texts = []
    labels = []

    # newline='' is what the csv module requires of file objects, so that
    # line breaks inside quoted fields are parsed correctly.
    with open(csv_path, 'r', encoding='latin-1', newline='') as f:
        reader = csv.reader(f)
        # Skip the header; the default keeps an empty file from raising
        # StopIteration.
        next(reader, None)

        for row in reader:
            if len(row) < 2:
                continue

            cleaned = clean_text(row[1].strip(), stopwords)
            if not cleaned:
                # Nothing left after cleaning: drop the message.
                continue

            label_str = row[0].strip().lower()  # 'ham' or 'spam'
            texts.append(cleaned)
            labels.append(1 if label_str == 'spam' else 0)

    print(f"‚úÖ ƒê·ªçc th√†nh c√¥ng {len(texts)} tin nh·∫Øn")
    print(f"   - HAM (0): {labels.count(0)} tin")
    print(f"   - SPAM (1): {labels.count(1)} tin")

    return texts, labels


# -----------------------------
# 3) TRAIN/TEST SPLIT
# -----------------------------
def split_train_test(texts: List[str], labels: List[int], 
                     test_size: float = 0.2, 
                     random_state: int = 42) -> Tuple:
    """
    Split the data into a train set and a test set.

    The split is stratified on `labels`, so both parts keep the same
    ham/spam ratio.

    Args:
        texts: list of texts
        labels: list of labels
        test_size: fraction of the data used for the test set (0.2 = 20%)
        random_state: seed for the random split

    Returns:
        X_train, X_test, y_train, y_test
    """
    # round() instead of int(): int() truncates, so a float artefact such as
    # 0.29 * 100 == 28.999999999999996 would be reported as 28%.
    test_pct = round(test_size * 100)
    train_pct = 100 - test_pct
    print(f"\n Chia d·ªØ li·ªáu: {train_pct}% train, {test_pct}% test")

    X_train, X_test, y_train, y_test = train_test_split(
        texts, labels,
        test_size=test_size,
        random_state=random_state,
        stratify=labels  # keep the ham/spam ratio equal in train and test
    )

    print(f"‚úÖ Train: {len(X_train)} m·∫´u")
    print(f"‚úÖ Test:  {len(X_test)} m·∫´u")

    return X_train, X_test, y_train, y_test


# -----------------------------
# 4) TEXT VECTORIZATION
# -----------------------------
def create_vectorizer(vocab_size: int = 3000, method: str = 'tfidf'):
    """
    Build an unfitted vectorizer that turns texts into numeric vectors.

    Args:
        vocab_size: maximum number of vocabulary entries to keep
        method: 'tfidf' selects TfidfVectorizer; any other value
            (documented choice: 'count') selects CountVectorizer

    Returns:
        The vectorizer object

    Notes:
        - TF-IDF (Term Frequency - Inverse Document Frequency) weights a
          term by how important it is to a text: frequent in that text but
          rare across the corpus means important.
        - Count simply counts how often each term occurs.
    """
    print(f"\nüî¢ T·∫°o vectorizer ({method.upper()}) v·ªõi vocab_size={vocab_size}")

    # Both vectorizer kinds are configured identically.
    shared_options = {
        'max_features': vocab_size,  # cap on the vocabulary size
        'ngram_range': (1, 2),       # unigrams and bigrams
        'min_df': 2,                 # ignore terms found in fewer than 2 documents
        'max_df': 0.95,              # ignore terms found in more than 95% of documents
    }

    vectorizer_cls = TfidfVectorizer if method == 'tfidf' else CountVectorizer
    return vectorizer_cls(**shared_options)


def fit_and_transform(vectorizer, X_train: List[str], X_test: List[str]):
    """
    Fit the vectorizer on the train texts, then vectorize train and test.

    Args:
        vectorizer: a TfidfVectorizer or CountVectorizer
        X_train: list of train texts
        X_test: list of test texts

    Returns:
        (train matrix, test matrix) as returned by the vectorizer's
        fit_transform / transform

    Notes:
        - Fitting uses the train texts only, to avoid data leakage.
        - Train and test are transformed with the same vocabulary.
    """
    print("‚öôÔ∏è ƒêang fit vectorizer tr√™n t·∫≠p train...")
    train_matrix = vectorizer.fit_transform(X_train)

    print("‚öôÔ∏è ƒêang transform t·∫≠p test...")
    test_matrix = vectorizer.transform(X_test)

    # Summary of what was produced.
    vocabulary_count = len(vectorizer.vocabulary_)
    report_lines = (
        f"‚úÖ Train vector shape: {train_matrix.shape}",
        f"‚úÖ Test vector shape:  {test_matrix.shape}",
        f"üìö Vocabulary size: {vocabulary_count}",
    )
    for line in report_lines:
        print(line)

    return train_matrix, test_matrix


# -----------------------------
# 5) SAVE DATA
# -----------------------------
def save_processed_data(X_train, y_train, X_test, y_test, 
                       train_out: str, test_out: str):
    """
    Write the processed train and test data to CSV files.

    Each file has the columns 'text' and 'label', is UTF-8 encoded and has
    no index column. Missing parent directories of both output paths are
    created.

    Args:
        X_train, y_train: train data (texts and labels)
        X_test, y_test: test data (texts and labels)
        train_out: path of the train output file
        test_out: path of the test output file
    """
    print(f"\n L∆∞u d·ªØ li·ªáu ƒë√£ x·ª≠ l√Ω...")

    # Create the parent directory of BOTH outputs (they may differ).
    # A bare filename has an empty dirname, which os.makedirs rejects, so it
    # is skipped.
    for out_path in (train_out, test_out):
        parent_dir = os.path.dirname(out_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    # Save train
    train_df = pd.DataFrame({'text': X_train, 'label': y_train})
    train_df.to_csv(train_out, index=False, encoding='utf-8')
    print(f"‚úÖ ƒê√£ l∆∞u train: {train_out}")

    # Save test
    test_df = pd.DataFrame({'text': X_test, 'label': y_test})
    test_df.to_csv(test_out, index=False, encoding='utf-8')
    print(f"‚úÖ ƒê√£ l∆∞u test: {test_out}")


def save_vectorizer_and_vocab(vectorizer, vec_pkl: str, vocab_txt: str):
    """
    Persist the fitted vectorizer and its vocabulary.

    The vectorizer is dumped with joblib. The vocabulary is written as
    UTF-8 text, one feature name per line, in the order returned by
    get_feature_names_out(). Missing parent directories of both output
    paths are created.

    Args:
        vectorizer: the fitted vectorizer object
        vec_pkl: path where the vectorizer is saved (pickle)
        vocab_txt: path where the vocabulary is saved (text)
    """
    print(f"\nüíæ L∆∞u vectorizer v√† vocabulary...")

    # Create the parent directory of BOTH outputs (they may differ).
    # A bare filename has an empty dirname, which os.makedirs rejects, so it
    # is skipped.
    for out_path in (vec_pkl, vocab_txt):
        parent_dir = os.path.dirname(out_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    # Save the vectorizer
    joblib.dump(vectorizer, vec_pkl)
    print(f"‚úÖ ƒê√£ l∆∞u vectorizer: {vec_pkl}")

    # Save the vocabulary
    vocab = vectorizer.get_feature_names_out()
    with open(vocab_txt, 'w', encoding='utf-8') as f:
        f.writelines(f"{word}\n" for word in vocab)
    print(f"‚úÖ ƒê√£ l∆∞u vocabulary ({len(vocab)} t·ª´): {vocab_txt}")


# -----------------------------
# 6) MAIN PIPELINE
# -----------------------------
def main():
    """
    Run the whole preprocessing pipeline.

    Steps: load and clean the raw CSV, split into train/test, save the
    cleaned splits, fit a TF-IDF vectorizer on the train texts, save the
    vectorizer and its vocabulary, then print a summary.

    Returns:
        (X_train_vec, X_test_vec, y_train, y_test, vectorizer)
    """
    # `random` is not imported at module level, so import it here.
    import random

    print("="*70)
    print("PIPELINE TI·ªÄN X·ª¨ L√ù D·ªÆ LI·ªÜU SMS SPAM".center(70))
    print("="*70)

    # Set random seeds.
    # NOTE: the module-level settings are named `random_state` and
    # `vocab_size` (lower case); referring to them in upper case raised
    # NameError.
    random.seed(random_state)
    np.random.seed(random_state)

    # Step 1: load and clean the data
    texts, labels = load_and_clean_data(RAW_CSV, STOPWORDS)

    # Step 2: train/test split
    X_train, X_test, y_train, y_test = split_train_test(
        texts, labels,
        test_size=TEST_SIZE,
        random_state=random_state
    )

    # Step 3: save the cleaned data
    save_processed_data(X_train, y_train, X_test, y_test, TRAIN_OUT, TEST_OUT)

    # Step 4: create and fit the vectorizer
    vectorizer = create_vectorizer(vocab_size=vocab_size, method='tfidf')
    X_train_vec, X_test_vec = fit_and_transform(vectorizer, X_train, X_test)

    # Step 5: save the vectorizer and the vocabulary
    save_vectorizer_and_vocab(vectorizer, VEC_PKL, VOCAB_TXT)

    # Final summary
    print("\n" + "="*70)
    print(" T·ªîNG K·∫æT".center(70))
    print("="*70)
    print(f"‚úÖ T·ªïng s·ªë m·∫´u: {len(texts)}")
    print(f"‚úÖ Train set: {len(X_train)} m·∫´u")
    print(f"‚úÖ Test set: {len(X_test)} m·∫´u")
    print(f"‚úÖ Vocabulary size: {len(vectorizer.vocabulary_)}")
    print(f"‚úÖ Vector shape: ({len(X_train)}, {len(vectorizer.vocabulary_)})")
    print("\nüìÅ Files ƒë√£ t·∫°o:")
    print(f"   - {TRAIN_OUT}")
    print(f"   - {TEST_OUT}")
    print(f"   - {VEC_PKL}")
    print(f"   - {VOCAB_TXT}")
    print("\n‚ú® Pipeline ho√†n th√†nh!")
    print("="*70)

    return X_train_vec, X_test_vec, y_train, y_test, vectorizer



: 

In [None]:
# NOTE(review): unfinished scratch cell. `visited`, `start` and `end` are not
# defined in this cell, and the `for` statement below is incomplete, so the
# cell does not parse as written. `index` is assigned but not used in the
# visible lines. Presumably the start of a graph/path traversal - confirm
# with the author or remove.
index = 0
visited[start] = False
while (visited[end] == False):
    for x in range 

In [None]:
# Scratch data: `ds` holds two 2-tuples; `visited` starts out empty.
item1 = (1, 2)
item2 = (13, 2)
ds = [item1, item2]
visited = []
for _ in range(n):
    