In [3]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning) # CatBoost can be verbose with warnings

import pandas as pd
import numpy as np

from sklearn.model_selection import (
    train_test_split, StratifiedKFold, RandomizedSearchCV
)
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from catboost import CatBoostClassifier
import time # Add time library
import os # Add os library

# Define the path to save the trained model
MODEL_FILE_PATH = 'catboost_final_model_cpu.cbm'

# Start measuring total execution time
start_time = time.time()

# 1) 데이터 로드
try:
    df = pd.read_csv('train.csv')
except FileNotFoundError:
    print("Error: 'train.csv' not found. Please ensure the file is in the correct directory.")
    exit()

X = df.drop(columns=['id', 'shares', 'y'])
y = df['y']

# *** 파생변수 생성 시작 ***
print("Creating derived features...")
epsilon = 1e-6 # 0으로 나누기 방지

# 1. 콘텐츠 길이 대비 링크 수 비율
if 'n_tokens_content' in X.columns and 'num_hrefs' in X.columns:
    X['feat_content_to_href_ratio'] = X['n_tokens_content'] / (X['num_hrefs'] + epsilon)
else:
    print("Warning: Columns for 'feat_content_to_href_ratio' not found.")

# 2. 콘텐츠 길이 대비 이미지 수 비율
if 'n_tokens_content' in X.columns and 'num_imgs' in X.columns:
    X['feat_content_to_img_ratio'] = X['n_tokens_content'] / (X['num_imgs'] + epsilon)
else:
    print("Warning: Columns for 'feat_content_to_img_ratio' not found.")

# 3. 평균 키워드 공유 수 편차
if 'kw_avg_max' in X.columns and 'kw_avg_min' in X.columns:
    X['feat_kw_avg_spread'] = X['kw_avg_max'] - X['kw_avg_min']
else:
    print("Warning: Columns for 'feat_kw_avg_spread' not found.")

# 4. 전반적인 주관성 가중 감성
if 'global_subjectivity' in X.columns and 'global_sentiment_polarity' in X.columns:
    X['feat_global_sentiment_strength'] = X['global_subjectivity'] * X['global_sentiment_polarity']
else:
    print("Warning: Columns for 'feat_global_sentiment_strength' not found.")

# 5. 제목의 주관성 가중 감성
if 'title_subjectivity' in X.columns and 'title_sentiment_polarity' in X.columns:
    X['feat_title_sentiment_strength'] = X['title_subjectivity'] * X['title_sentiment_polarity']
else:
    print("Warning: Columns for 'feat_title_sentiment_strength' not found.")

# LDA 관련 변수들
lda_cols = ['LDA_00', 'LDA_01', 'LDA_02', 'LDA_03', 'LDA_04']
if all(col in X.columns for col in lda_cols):
    # 6. LDA 주제 값 중 최댓값
    X['feat_lda_max_value'] = X[lda_cols].max(axis=1)
    # 7. LDA 주제 값들의 표준편차
    X['feat_lda_std_dev'] = X[lda_cols].std(axis=1)
else:
    print(f"Warning: Not all LDA columns ({', '.join(lda_cols)}) found for derived features.")

print("Derived features created.")



# 2) 간단 전처리 (전체 데이터 기준)
# num_cols를 파생변수 생성 *후*에 정의하여 새로운 수치형 파생변수도 포함하도록 함
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = ['data_channel', 'weekday'] # These are the names of your categorical columns

# --- 수치형: median 으로 결측 채우기
X[num_cols] = X[num_cols].fillna(X[num_cols].median())
# --- 범주형: 'missing' 문자열로 결측 채우기 (중요!)
X[cat_cols] = X[cat_cols].fillna('missing')


# 3) train/test 분할
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

print(f"\nData shapes after split: TrainVal({X_trainval.shape[0]}), Test({X_test.shape[0]})")


# 4) 하이퍼파라미터 탐색 범위 정의 (Tuned)
param_dist_tuned = {
    'learning_rate':        [0.01, 0.03, 0.05, 0.07, 0.1],
    'depth':                [4, 6, 8, 10],
    'l2_leaf_reg':          [1, 3, 5, 7, 9, 12],
    'border_count':         [32, 64, 128, 254],
    'bagging_temperature':  [0, 0.5, 1.0, 1.5, 2.0],
    'random_strength':      [0.1, 0.5, 1, 2, 5],
    'colsample_bylevel':    [0.6, 0.7, 0.8, 0.9, 1.0]
}

# 5) RandomizedSearchCV 세팅
N_ITER_SEARCH = 30 # Increased from 20, adjust based on available time/resources

base_model = CatBoostClassifier(
    iterations=1000, # Max iterations for models during the search phase
    random_state=42,
    verbose=0, # Suppress output during RandomizedSearchCV's internal fits
    task_type='CPU' 
)

min_class_count = y_trainval.value_counts().min()
n_cv_splits = 5
if min_class_count < n_cv_splits:
    print(f"Warning: The smallest class in y_trainval has only {min_class_count} members, "
          f"which is less than n_splits={n_cv_splits} for StratifiedKFold. "
          f"Consider reducing n_splits or using a different CV strategy if issues arise.")

cv = StratifiedKFold(n_splits=n_cv_splits, shuffle=True, random_state=42)

print(f"\nStarting RandomizedSearchCV ({N_ITER_SEARCH} iterations) using CPU...")
search = RandomizedSearchCV(
    estimator=base_model,
    param_distributions=param_dist_tuned,
    n_iter=N_ITER_SEARCH,
    scoring='roc_auc', # Focus on ROC AUC for optimization
    cv=cv,
    n_jobs=-1, 
    random_state=42,
    verbose=1, # Show progress for RandomizedSearchCV
    refit=True # Refit the best estimator on the whole X_trainval after search
)

# 6) CV 단계에서 cat_features만 전달
search.fit(
    X_trainval,
    y_trainval,
    cat_features=cat_cols
)

print("\n" + "="*50)
print("RandomizedSearchCV Results:")
print("="*50)
print("Best tuned hyperparameters found:", search.best_params_)
print(f"Best CV ROC AUC score from RandomizedSearchCV: {search.best_score_:.4f}")
print("="*50)


# 7) 최종 모델에 early stopping + eval_set 걸어 재학습
best_tuned_params = search.best_params_
print("\nParameters for Final Model Training:", best_tuned_params)

print("\nTraining final tuned model with early stopping on test set using CPU...")
final_model_tuned_with_derived = CatBoostClassifier(
    iterations=2000, # Allow more iterations; early stopping will find the optimum
    learning_rate=best_tuned_params['learning_rate'],
    depth=best_tuned_params['depth'],
    l2_leaf_reg=best_tuned_params['l2_leaf_reg'],
    border_count=best_tuned_params['border_count'],
    bagging_temperature=best_tuned_params['bagging_temperature'],
    random_strength=best_tuned_params['random_strength'],
    colsample_bylevel=best_tuned_params['colsample_bylevel'],
    eval_metric='AUC',
    early_stopping_rounds=50,
    use_best_model=True,
    random_state=42,
    verbose=100, 
    task_type='CPU' 
)

final_model_tuned_with_derived.fit(
    X_trainval,
    y_trainval,
    cat_features=cat_cols,
    eval_set=[(X_test, y_test)] # Use X_test, y_test for early stopping (as in original code)
)

print(f"\nBest iteration found by early stopping for the final model: {final_model_tuned_with_derived.get_best_iteration()}")


# 8) Feature Importance Analysis
print("\n" + "="*50)
print("Feature Importance Analysis:")
print("="*50)
try:
    feature_importances = final_model_tuned_with_derived.get_feature_importance(prettified=True)
    print("\nTop 20 Feature Importances:")
    print(feature_importances.head(20))

    feature_importances.to_csv("feature_importances_with_derived_cpu_original_params.csv", index=False)
    print("\nFeature importances saved to 'feature_importances_with_derived_cpu_original_params.csv'")

except Exception as e:
    print(f"Could not retrieve feature importance: {e}")

print("\nConsider reviewing feature importances. Low importance features might be removed,")
print("and high importance features could inspire new derived features in the next iteration.")
print("="*50)


# 9) Hold-out Test 성능 평가
print("\n" + "="*50)
print("Tuned Model with Derived Features - FINAL HOLD-OUT TEST PERFORMANCE (CPU-trained):")
print("="*50)

y_pred_test = final_model_tuned_with_derived.predict(X_test)
y_prob_test = final_model_tuned_with_derived.predict_proba(X_test)[:, 1]

acc_test = accuracy_score(y_test, y_pred_test)
f1_test  = f1_score(y_test, y_pred_test)
auc_test = roc_auc_score(y_test, y_prob_test)

# Composite score as defined in your original code
comp_test = (acc_test + f1_test + auc_test) / 3

print(f"Accuracy : {acc_test:.4f}")
print(f"F1 Score : {f1_test:.4f}")
print(f"ROC AUC  : {auc_test:.4f}")
print(f"Composite : {comp_test:.4f}")
print("="*50)


# 10) Save the trained model
print(f"\nSaving the trained model to '{MODEL_FILE_PATH}'...")
try:
    final_model_tuned_with_derived.save_model(MODEL_FILE_PATH)
    print("Model saved successfully.")
except Exception as e:
    print(f"Error saving model: {e}")


# Calculate and print total execution time
end_time = time.time()
total_time_seconds = end_time - start_time
hours = int(total_time_seconds // 3600)
minutes = int((total_time_seconds % 3600) // 60)
seconds = int(total_time_seconds % 60)

print(f"\nTotal script execution time: {hours}h {minutes}m {seconds}s")

Creating derived features...
Derived features created.

Data shapes after split: TrainVal(17760), Test(4440)

Starting RandomizedSearchCV (30 iterations) using CPU...
Fitting 5 folds for each of 30 candidates, totalling 150 fits

RandomizedSearchCV Results:
Best tuned hyperparameters found: {'random_strength': 0.5, 'learning_rate': 0.01, 'l2_leaf_reg': 1, 'depth': 8, 'colsample_bylevel': 0.9, 'border_count': 128, 'bagging_temperature': 2.0}
Best CV ROC AUC score from RandomizedSearchCV: 0.7186

Parameters for Final Model Training: {'random_strength': 0.5, 'learning_rate': 0.01, 'l2_leaf_reg': 1, 'depth': 8, 'colsample_bylevel': 0.9, 'border_count': 128, 'bagging_temperature': 2.0}

Training final tuned model with early stopping on test set using CPU...
0:	test: 0.6835084	best: 0.6835084 (0)	total: 10.9ms	remaining: 21.8s
100:	test: 0.7143633	best: 0.7143633 (100)	total: 987ms	remaining: 18.6s
200:	test: 0.7205572	best: 0.7205627 (198)	total: 1.98s	remaining: 17.7s
300:	test: 0.7233573	

In [5]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
import os

# Define the path to the saved model file and the submission file
MODEL_FILE_PATH = 'catboost_final_model_cpu.cbm' 
SUBMISSION_FILE_PATH = 'prediction.csv'         
TEST_DATA_PATH = 'test.csv' 


print(f"Looking for trained model file at '{MODEL_FILE_PATH}'...")
if not os.path.exists(MODEL_FILE_PATH):
    print(f"Error: Trained model file not found at '{MODEL_FILE_PATH}'.")
    print("Please run the training script ('train_cpu_save.py') first to train and save the model.")
    exit()

print(f"Loading trained model from '{MODEL_FILE_PATH}'...")
# Create an empty CatBoostClassifier object and load the model into it.
# Explicitly specify task_type='CPU' for prediction on CPU.
model = CatBoostClassifier(task_type='CPU')
try:
    model.load_model(MODEL_FILE_PATH)
    print("Model loaded successfully.")
except Exception as e:
    print(f"Error loading model: {e}")
    exit()


# 1) Load the test data
print(f"\nLoading test data from '{TEST_DATA_PATH}'...")
try:
    df_test = pd.read_csv(TEST_DATA_PATH)
    print("Test data loaded successfully.")
except FileNotFoundError:
    print(f"Error: '{TEST_DATA_PATH}' not found.")
    exit()

# Store the original IDs for the submission file
test_ids = df_test['id']

# Drop the id column from the features
X_test = df_test.drop(columns=['id'])


# *** Apply the SAME Derived Feature Creation as in the training script ***
print("Creating derived features for test data (must match training)...")
epsilon = 1e-6 # Prevent division by zero (same as training)

if 'n_tokens_content' in X_test.columns and 'num_hrefs' in X_test.columns:
    X_test['feat_content_to_href_ratio'] = X_test['n_tokens_content'] / (X_test['num_hrefs'] + epsilon)
else:
    print("Warning: Columns for 'feat_content_to_href_ratio' not found in test data.")

if 'n_tokens_content' in X_test.columns and 'num_imgs' in X_test.columns:
    X_test['feat_content_to_img_ratio'] = X_test['n_tokens_content'] / (X_test['num_imgs'] + epsilon)
else:
    print("Warning: Columns for 'feat_content_to_img_ratio' not found in test data.")

if 'kw_avg_max' in X_test.columns and 'kw_avg_min' in X_test.columns:
    X_test['feat_kw_avg_spread'] = X_test['kw_avg_max'] - X_test['kw_avg_min']
else:
    print("Warning: Columns for 'feat_kw_avg_spread' not found in test data.")

if 'global_subjectivity' in X_test.columns and 'global_sentiment_polarity' in X_test.columns:
    X_test['feat_global_sentiment_strength'] = X_test['global_subjectivity'] * X_test['global_sentiment_polarity']
else:
    print("Warning: Columns for 'feat_global_sentiment_strength' not found in test data.")

if 'title_subjectivity' in X_test.columns and 'title_sentiment_polarity' in X_test.columns:
    X_test['feat_title_sentiment_strength'] = X_test['title_subjectivity'] * X_test['title_sentiment_polarity']
else:
    print("Warning: Columns for 'feat_title_sentiment_strength' not found in test data.")

lda_cols = ['LDA_00', 'LDA_01', 'LDA_02', 'LDA_03', 'LDA_04']
if all(col in X_test.columns for col in lda_cols):
    X_test['feat_lda_max_value'] = X_test[lda_cols].max(axis=1)
    X_test['feat_lda_std_dev'] = X_test[lda_cols].std(axis=1)
else:
    print(f"Warning: Not all LDA columns ({', '.join(lda_cols)}) found for derived features in test data.")

print("Derived features created for test data.")


# *** Apply the SAME Preprocessing (Missing Value Imputation) ***
print("Applying preprocessing to test data (must match training)...")

num_cols_test = X_test.select_dtypes(include=[np.number]).columns.tolist()
cat_cols_test = ['data_channel', 'weekday']

# --- Numeric: Fill missing with median of the test set (Simplification) ---
X_test[num_cols_test] = X_test[num_cols_test].fillna(X_test[num_cols_test].median())
# --- Categorical: Fill missing with 'missing' string (same as training) ---
X_test[cat_cols_test] = X_test[cat_cols_test].fillna('missing')

print("Preprocessing applied to test data.")


# --- Ensure Column Order Matches Trained Model ---
train_features_order = model.feature_names_
print(f"\nEnsuring test data columns match the trained model's {len(train_features_order)} features...")

try:
    X_test = X_test[train_features_order]
    print("Test data column order matches training data.")
except KeyError as e:
    print(f"Error: Column mismatch between test data and trained model features: {e}")
    print("Please ensure test data has all necessary features created during training.")
    exit()


# 5) Make predictions on the processed test data
print("\nMaking predictions on the test data using CPU...")

# Predict binary classes (0 or 1)
test_predictions_binary = model.predict(X_test)

# Predict probabilities for the positive class (assuming 1 is the positive class).
# predict_proba returns shape (n_samples, n_classes). We need the second column (index 1).
test_probabilities = model.predict_proba(X_test)[:, 1]

print("Predictions completed.")


# 6) Create submission file with id, y_predict, and y_prob
print(f"Creating prediction file '{SUBMISSION_FILE_PATH}' with id, y_predict, y_prob...")
prediction_df = pd.DataFrame({
    'id': test_ids,
    'y_predict': test_predictions_binary,
    'y_prob': test_probabilities
})

# Save the prediction file
prediction_df.to_csv(SUBMISSION_FILE_PATH, index=False)

print(f"\nPrediction file '{SUBMISSION_FILE_PATH}' created successfully.")
print(f"Predictions saved for {len(prediction_df)} rows.")

Looking for trained model file at 'catboost_final_model_cpu.cbm'...
Loading trained model from 'catboost_final_model_cpu.cbm'...
Model loaded successfully.

Loading test data from 'test.csv'...
Test data loaded successfully.
Creating derived features for test data (must match training)...
Derived features created for test data.
Applying preprocessing to test data (must match training)...
Preprocessing applied to test data.

Ensuring test data columns match the trained model's 53 features...
Test data column order matches training data.

Making predictions on the test data using CPU...
Predictions completed.
Creating prediction file 'prediction.csv' with id, y_predict, y_prob...

Prediction file 'prediction.csv' created successfully.
Predictions saved for 9515 rows.
