## I. Read Data (Only Statistic Features)

In [30]:
import numpy as np
import pandas as pd
import os
import sys

# Find project root directory automatically
def find_project_root(start_dir=None, markers=('.gitignore', 'requirements.txt', 'setup.py', 'pyproject.toml')):
    """Locate the project root by walking upwards from a starting directory.

    A directory counts as the project root when it directly contains at
    least one of the ``markers`` entries.

    Args:
        start_dir: Directory to start searching from. Defaults to the
            current working directory.
        markers: File or directory names whose presence identifies the root.

    Returns:
        The path of the nearest ancestor (including the start directory
        itself) that contains a marker, or the start directory when no
        ancestor contains one.
    """
    start = os.getcwd() if start_dir is None else os.path.abspath(start_dir)
    current_dir = start
    while True:
        # Probe each marker directly rather than listing the whole directory:
        # cheaper, and it does not raise on ancestors we cannot read.
        if any(os.path.lexists(os.path.join(current_dir, marker)) for marker in markers):
            return current_dir
        parent_dir = os.path.dirname(current_dir)
        # dirname() is a fixed point at the filesystem root ('/' on POSIX,
        # 'C:\\' on Windows). Comparing against a literal '/' never
        # terminates on Windows and skips checking the root on POSIX.
        if parent_dir == current_dir:
            return start  # fallback: no marker anywhere above the start
        current_dir = parent_dir

# Resolve the repository root once; every dataset path below is anchored to it.
project_root = find_project_root()

# Statistic features
train_stat_df_path = os.path.join(project_root, 'data/train_statistic_features.csv')
val_stat_df_path = os.path.join(project_root, 'data/val_statistic_features.csv')

# TF-IDF features (loaded in a later section)
train_tfidf_df_path = os.path.join(project_root, 'data/train_tfidf_features.csv')
val_tfidf_df_path = os.path.join(project_root, 'data/val_tfidf_features.csv')

# Bag-of-words features (loaded in a later section)
train_bow_df_path = os.path.join(project_root, 'data/train_bow_features.csv')
val_bow_df_path = os.path.join(project_root, 'data/val_bow_features.csv')

# Only the statistic features are read in this section.
train_stat_df, val_stat_df = (
    pd.read_csv(csv_path) for csv_path in (train_stat_df_path, val_stat_df_path)
)

# Quick sanity check of what was loaded.
print("Shape Train:", train_stat_df.shape)
print("Shape Validation:", val_stat_df.shape)
print("Columns Train:", train_stat_df.columns)

Shape Train: (152, 52)
Shape Validation: (19, 52)
Columns Train: Index(['label', 'file1_char_count', 'file1_word_count', 'file1_sentence_count',
       'file1_avg_sentence_length', 'file1_english_word_ratio',
       'file1_cyrillic_count', 'file1_arabic_count', 'file1_chinese_count',
       'file1_script_diversity', 'file1_unicode_control_chars',
       'file1_number_count', 'file1_uppercase_word_count',
       'file1_repetition_score', 'file1_perplexity_score', 'file1_ttr_ratio',
       'file2_char_count', 'file2_word_count', 'file2_sentence_count',
       'file2_avg_sentence_length', 'file2_english_word_ratio',
       'file2_cyrillic_count', 'file2_arabic_count', 'file2_chinese_count',
       'file2_script_diversity', 'file2_unicode_control_chars',
       'file2_number_count', 'file2_uppercase_word_count',
       'file2_repetition_score', 'file2_perplexity_score', 'file2_ttr_ratio',
       'diff_char_count', 'ratio_char_count', 'diff_word_count',
       'ratio_word_count', 'diff_sent

Có thể đọc mô tả các features ở [đây](../data/README.md#processed-features-information)

In [31]:
# Separate the "label" target column from the remaining feature columns
# for the training and validation splits.
y_train = train_stat_df["label"]
X_train = train_stat_df.drop(columns=["label"])

y_val = val_stat_df["label"]
X_val = val_stat_df.drop(columns=["label"])

## Modeling

In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from IPython.display import display

# Candidate classifiers, each with its default hyperparameters
# (seeded wherever the estimator accepts a random_state).
models = {
    'LogisticRegression': LogisticRegression(random_state=42),
    'SVC': SVC(random_state=42),
    'RandomForest': RandomForestClassifier(random_state=42),
    'KNN': KNeighborsClassifier()
}

# Standardise the features. The scaler is fitted on the training split only
# and then applied unchanged to the validation split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Fit every candidate on the scaled training data and score it on validation.
results = {}
for model_name, model in models.items():
    # fit() returns the estimator itself, so prediction can be chained.
    y_pred = model.fit(X_train_scaled, y_train).predict(X_val_scaled)
    accuracy = accuracy_score(y_val, y_pred)

    # Keep the raw predictions next to the score for later inspection.
    results[model_name] = {'accuracy': accuracy, 'predictions': y_pred}

# One row per model with only the accuracy column, best model first.
accuracy_by_model = {
    name: {'accuracy': entry['accuracy']} for name, entry in results.items()
}
results_df = pd.DataFrame.from_dict(accuracy_by_model, orient='index')
results_df = results_df.sort_values('accuracy', ascending=False)

# Summary banner followed by the rich table display.
print(f"\n{'='*60}")
print("SUMMARY OF MODEL PERFORMANCE (Sorted by Accuracy)")
print(f"{'='*60}")
display(results_df)


SUMMARY OF MODEL PERFORMANCE (Sorted by Accuracy)


Unnamed: 0,accuracy
LogisticRegression,0.842105
SVC,0.842105
RandomForest,0.842105
KNN,0.789474


## II. Data with TFIDF Features

In [33]:
# Load the TF-IDF feature tables for both splits.
train_tfidf_df = pd.read_csv(train_tfidf_df_path)
val_tfidf_df = pd.read_csv(val_tfidf_df_path)

# Place the TF-IDF columns next to the statistic features, side by side.
# NOTE(review): assumes the TF-IDF files have the same row order as the
# statistic files and no "label" column of their own -- confirm.
full_train_df = pd.concat([train_stat_df, train_tfidf_df], axis="columns")
full_val_df = pd.concat([val_stat_df, val_tfidf_df], axis="columns")

# Split target and features again; this rebinds X_train / y_train / X_val / y_val.
y_train = full_train_df["label"]
X_train = full_train_df.drop(columns=["label"])
y_val = full_val_df["label"]
X_val = full_val_df.drop(columns=["label"])

print(X_train.shape, X_val.shape)

(152, 101) (19, 101)


## III. Data with BOW Features

In [34]:
# Load the bag-of-words feature tables for both splits.
train_bow_df = pd.read_csv(train_bow_df_path)
val_bow_df = pd.read_csv(val_bow_df_path)

# Place the bag-of-words columns next to the statistic features, side by side.
# NOTE(review): assumes the BOW files have the same row order as the
# statistic files and no "label" column of their own -- confirm.
full_train_df = pd.concat([train_stat_df, train_bow_df], axis="columns")
full_val_df = pd.concat([val_stat_df, val_bow_df], axis="columns")

# Split target and features again; this rebinds X_train / y_train / X_val / y_val.
y_train = full_train_df["label"]
X_train = full_train_df.drop(columns=["label"])
y_val = full_val_df["label"]
X_val = full_val_df.drop(columns=["label"])

print(X_train.shape, X_val.shape)

(152, 101) (19, 101)


## Data with Pretrained Model Embedding