## Hướng dẫn làm bài:
1. Sử dụng Pipeline để train mô hình
2. Lưu mô hình sau khi train và các thông tin mô tả vào file model.pkl
3. Nộp file .ipynb lên LMS, không cần nén
4. Copy file model.pkl vào thư mục T:\trained\

In [None]:
# Khai báo thông tin sinh viên
Lop = "DHDTMT18A"
Nhom = "3"
MSSV = "22659651"
HoTen = "Nguyen Phu Quy"
SoMay = 20

In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from lightgbm import LGBMClassifier
import joblib
import warnings

warnings.filterwarnings("ignore")

df = pd.read_csv("dataset_train.csv")
df[['Vmag', 'Plx', 'B-V']] = df[['Vmag', 'Plx', 'B-V']].apply(pd.to_numeric, errors='coerce')
df = df.dropna(subset=['Vmag', 'Plx', 'B-V', 'SpType'])
df = df[df['Plx'] > 0]
df['AbsMag'] = df['Vmag'] + 5 * np.log10(df['Plx'] / 1000) + 5
df = df.replace([np.inf, -np.inf], np.nan).dropna(subset=['AbsMag', 'B-V'])
df['ColorIndex'] = df['Vmag'] - df['B-V']
df['BV_squared'] = df['B-V'] ** 2
df['AbsMag_log'] = df['AbsMag'].apply(lambda x: np.log1p(x) if x > -1 else np.nan)
df['SpectralClass'] = df['SpType'].str.extract(r'([OBAFGKM])')
df['LuminosityClass'] = df['SpType'].str.extract(r'(Ia|Ib|II|III|IV|V)')
df = df.dropna(subset=['SpectralClass', 'LuminosityClass'])
le_spectral = LabelEncoder()
le_luminosity = LabelEncoder()
df['SpectralClass_enc'] = le_spectral.fit_transform(df['SpectralClass'])
df['LuminosityClass_enc'] = le_luminosity.fit_transform(df['LuminosityClass'])

def classify_star_type(sptype):
    sptype = str(sptype)
    if any(x in sptype for x in ['III', 'Ia', 'I']):
        return 'Giant'
    elif any(x in sptype for x in ['V', 'IV']):
        return 'Dwarf'
    return 'Unknown'

df['Star type'] = df['SpType'].apply(classify_star_type)
df = df[df['Star type'] != 'Unknown']
features = ['AbsMag', 'B-V', 'ColorIndex', 'BV_squared', 'AbsMag_log', 'SpectralClass_enc', 'LuminosityClass_enc']
df = df.dropna(subset=features)
X = df[features]
y = df['Star type']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipeline = ImbPipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('smote', SMOTE(random_state=42)),
    ('scaler', StandardScaler()),
    ('classifier', LGBMClassifier(
        n_estimators=100,
        max_depth=3,
        learning_rate=0.002,
        class_weight='balanced',
        reg_alpha=2.0,
        reg_lambda=2.0,
        colsample_bytree=0.3,
        verbosity=-1,
        random_state=42
    ))
])

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
cv_scores = cross_val_score(pipeline, X, y, cv=5, scoring='f1_weighted')
print("Cross-validated F1 scores:", cv_scores)
print("Mean F1 score:", cv_scores.mean())

model_info = {
    'model': pipeline,
    'features': features,
    'classes': pipeline.named_steps['classifier'].classes_.tolist()
}
joblib.dump(model_info, 'model.pkl')

Confusion Matrix:
 [[3415   47]
 [ 360 3418]]

Classification Report:
               precision    recall  f1-score   support

       Dwarf       0.90      0.99      0.94      3462
       Giant       0.99      0.90      0.94      3778

    accuracy                           0.94      7240
   macro avg       0.95      0.95      0.94      7240
weighted avg       0.95      0.94      0.94      7240

Cross-validated F1 scores: [0.94861234 0.9432115  0.95427731 0.95068605 0.94085363]
Mean F1 score: 0.9475281648166043


['model.pkl']