In [1]:
import sys
import os
import numpy as np

# --- SETUP PATH & IMPORT ---
# Locate the project's `src` directory so that `data_processing` can be imported
# whether the kernel was started from the repository root or from `notebooks/`.
current_dir = os.getcwd()
path_option_1 = os.path.join(current_dir, 'src')  # kernel started at the repo root
path_option_2 = os.path.abspath(os.path.join(current_dir, '..', 'src'))  # kernel started in notebooks/

for candidate_src in (path_option_1, path_option_2):
    if os.path.exists(os.path.join(candidate_src, 'data_processing.py')):
        # Keep the cell idempotent: re-running it must not keep appending the
        # same directory to sys.path.
        if candidate_src not in sys.path:
            sys.path.append(candidate_src)
        break
# If neither candidate contains data_processing.py, sys.path is left untouched
# and the import that follows fails unless the module is already importable.

from data_processing import load_data, clean_experience, clean_company_size, clean_education_level, min_max_scale



In [2]:
# --- 1. LOAD DATA ---
# Resolve the CSV relative to the working directory: use the repo-root layout
# when that file is present, otherwise assume the kernel runs one level below.
root_relative_csv = 'data/raw/aug_train.csv'
csv_path = root_relative_csv if os.path.exists(root_relative_csv) else '../data/raw/aug_train.csv'

print(f"Loading data from: {csv_path}")
raw_data = load_data(csv_path)



Loading data from: ../data/raw/aug_train.csv


In [3]:
# --- 2. FEATURE ENGINEERING & CLEANING ---
# The feature matrix X and the target vector y are assembled column by column.
m = raw_data.shape[0]  # number of samples
print(f"Total samples: {m}")

# Parallel accumulators: features_list[i] holds the processed column whose
# human-readable name is feature_names[i].
features_list, feature_names = [], []

# --- Column 1: City ('city_103' -> 103.0) ---
# Strip the 'city_' prefix and keep the numeric suffix as the city code.
city_col = raw_data['city'].astype(str)
city_tokens = np.char.replace(city_col, 'city_', '')


def _city_code_or_zero(token):
    """Parse one city token as a float; unparseable or non-finite values become 0.0."""
    try:
        code = float(token)
    except ValueError:
        return 0.0
    return code if np.isfinite(code) else 0.0


# Convert entry by entry so that a malformed or missing value only zeroes its
# own row. The previous bare `except:` replaced the WHOLE column with zeros as
# soon as a single entry failed to parse (and hid unrelated errors), while
# 'nan' strings slipped through as NaN instead of the intended 0.
city_ids = np.array([_city_code_or_zero(token) for token in city_tokens], dtype=float)
features_list.append(city_ids)
feature_names.append('city_code')

# --- Column 2: City Development Index (used as-is) ---
cdi = raw_data['city_development_index']
features_list.append(cdi)
feature_names.append('city_dev_index')

# --- Column 3: Gender (categorical -> label encoding) ---
# 'Male' -> 1, 'Female' -> 0, anything else (other / missing) -> -1.
gender_col = raw_data['gender'].astype(str)
gender_num = np.full(m, -1.0)
for gender_label, gender_code in (('Male', 1.0), ('Female', 0.0)):
    gender_num[gender_col == gender_label] = gender_code
features_list.append(gender_num)
feature_names.append('gender')

# --- Column 4: Relevant experience (binary flag) ---
# 1.0 when the value is exactly 'Has relevent experience', otherwise 0.0.
# (The misspelling 'relevent' comes from the dataset and must be kept.)
rex_col = raw_data['relevent_experience'].astype(str)
has_relevant_exp = rex_col == 'Has relevent experience'
rex_num = has_relevant_exp.astype(float)
features_list.append(rex_num)
feature_names.append('relevent_experience')

# --- Column 5: Enrolled university (categorical -> label encoding) ---
# The position in the tuple is the code: no_enrollment -> 0, Full time -> 1,
# Part time -> 2. Anything else (including missing) keeps the default -1.
uni_col = raw_data['enrolled_university'].astype(str)
uni_num = np.full(m, -1.0)
for uni_code, uni_label in enumerate(('no_enrollment', 'Full time course', 'Part time course')):
    uni_num[uni_col == uni_label] = float(uni_code)
features_list.append(uni_num)
feature_names.append('enrolled_university')

# --- Column 6: Education level (ordinal, via the project helper) ---
# NOTE(review): how clean_education_level encodes missing values is not
# visible from this notebook -- confirm in data_processing.py.
edu_num = clean_education_level(raw_data['education_level'])
features_list.append(edu_num)
feature_names.append('education_level')

# --- Column 7: Major discipline (binary flag) ---
# 1.0 for 'STEM', 0.0 for every other value (including missing).
major_col = raw_data['major_discipline'].astype(str)
is_stem_mask = major_col == 'STEM'
major_num = is_stem_mask.astype(float)
features_list.append(major_num)
feature_names.append('is_stem')

# --- Column 8: Experience (parsed by the project helper) ---
exp_num = clean_experience(raw_data['experience'])
# Impute missing (NaN) entries in place with the mean of the observed values.
# NOTE(review): the mean and the min/max below are computed on the full data
# set, i.e. before the train/test split -- confirm this is acceptable.
exp_missing = np.isnan(exp_num)
exp_mean = np.nanmean(exp_num)
exp_num[exp_missing] = exp_mean
# Rescale with the project's min-max helper.
exp_num = min_max_scale(exp_num)
features_list.append(exp_num)
feature_names.append('experience')

# --- Column 9: Company size (ordinal, via the project helper) ---
size_num = clean_company_size(raw_data['company_size'])
features_list.append(size_num)
feature_names.append('company_size')

# --- Column 10: Company type (binary flag) ---
# 'Pvt Ltd' is described as the most common type: it maps to 1.0, all other
# values (including missing) map to 0.0.
type_col = raw_data['company_type'].astype(str)
is_pvt_mask = type_col == 'Pvt Ltd'
type_num = is_pvt_mask.astype(float)
features_list.append(type_num)
feature_names.append('is_pvt_ltd')

# --- Column 11: Last new job (ordinal) ---
# 'never' -> 0, '1'..'4' -> 1..4, '>4' -> 5. Unlisted values (including
# missing) keep the initial 0, i.e. they are treated like 'never'.
job_col = raw_data['last_new_job'].astype(str)
job_num = np.zeros(m)
job_rank_by_label = (('1', 1), ('2', 2), ('3', 3), ('4', 4), ('>4', 5), ('never', 0))
for job_label, job_rank in job_rank_by_label:
    job_num[job_col == job_label] = job_rank
features_list.append(job_num)
feature_names.append('last_new_job')

# --- Column 12: Training hours (numerical, min-max scaled by the project helper) ---
hours = raw_data['training_hours']
hours_scaled = min_max_scale(hours)
features_list.append(hours_scaled)
feature_names.append('training_hours')

# --- ASSEMBLE THE DATASET (STACKING) ---
# Each processed column becomes one column of X, giving shape (samples, features).
y = raw_data['target']
X = np.column_stack(features_list)

for array_label, array_value in (("X", X), ("y", y)):
    print(f"Shape of {array_label}: {array_value.shape}")
print("Features:", feature_names)





Total samples: 19158
Shape of X: (19158, 12)
Shape of y: (19158,)
Features: ['city_code', 'city_dev_index', 'gender', 'relevent_experience', 'enrolled_university', 'education_level', 'is_stem', 'experience', 'company_size', 'is_pvt_ltd', 'last_new_job', 'training_hours']


In [4]:
# --- 3. TRAIN/TEST SPLIT (manual, without sklearn's train_test_split) ---
# 80% of the rows go to the training set, the remaining 20% to the test set.
test_size = 0.2
split_idx = int(m * (1 - test_size))

# Fixed seed so the shuffled row order (and therefore the split) is reproducible.
np.random.seed(42)
indices = np.arange(m)
np.random.shuffle(indices)

# The first split_idx shuffled positions form the training rows, the rest the test rows.
train_indices, test_indices = indices[:split_idx], indices[split_idx:]

X_train, y_train = X[train_indices], y[train_indices]
X_test, y_test = X[test_indices], y[test_indices]

print(f"Train size: {X_train.shape[0]}")
print(f"Test size: {X_test.shape[0]}")

Train size: 15326
Test size: 3832


In [5]:
# --- 4. SAVE PROCESSED DATA (.npy files) ---
# Reuse an existing processed-data folder when there is one (repo-root layout
# first, then one level up, matching how the raw CSV is located).
output_dir = next(
    (path for path in ('data/processed', '../data/processed') if os.path.exists(path)),
    None,
)
if output_dir is None:
    # Neither folder exists yet: create it under whichever data root is present.
    # Previously the folder was always created at '../data/processed', which
    # lands outside the project when the kernel runs from the repo root.
    data_root = 'data' if os.path.isdir('data') else '../data'
    output_dir = os.path.join(data_root, 'processed')
    os.makedirs(output_dir, exist_ok=True)

# One .npy file per array, named after the variable it stores.
for array_name, array_value in (
    ('X_train', X_train),
    ('X_test', X_test),
    ('y_train', y_train),
    ('y_test', y_test),
):
    np.save(os.path.join(output_dir, f'{array_name}.npy'), array_value)

print("\n✅ Dữ liệu đã được xử lý và lưu thành công!")


✅ Dữ liệu đã được xử lý và lưu thành công!
