In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold, cross_val_score, cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.linear_model import LogisticRegression
import xgboost as xgb

In [None]:
# Load the training set, the test set and the submission template from ./data.
df_full_train = pd.read_csv('./data/train.csv')
df_test = pd.read_csv('./data/test.csv')
# Template whose 'smoking' column is filled with predictions in the final testing cell.
sample_submission = pd.read_csv('./data/sample_submission.csv')

# EDA

**Column name formatting**

In [None]:
# Normalise the column names of both frames: lower-case, spaces -> underscores.
for frame in (df_full_train, df_test):
    frame.columns = frame.columns.str.lower().str.replace(' ', '_')

**drop "id" column**

In [None]:
# Remove the 'id' column from both frames.
df_full_train = df_full_train.drop('id', axis=1)
df_test = df_test.drop('id', axis=1)

**Missing check**

In [None]:
# Per-column count of missing values; only columns with at least one are printed.
missing_columns_full_train = df_full_train.isna().sum()
missing_columns_test = df_test.isna().sum()
print('Missing train column ->', missing_columns_full_train.loc[missing_columns_full_train > 0])
print('Missing test column->', missing_columns_test.loc[missing_columns_test > 0])

**duplicated check**

In [None]:
# Report whether the training frame contains any fully duplicated rows.
has_duplicates = df_full_train.duplicated().any()
print('---Duplicates exist---' if has_duplicates else '---No duplicates---')

**dtypes check**

In [None]:
# Short summary of the training frame (verbose=False skips the per-column listing).
df_full_train.info(verbose=False)

In [None]:
# Same short summary for the test frame.
df_test.info(verbose=False)

In [None]:
# Preview the first 10 training rows.
df_full_train.head(10)

In [None]:
# Number of distinct values per feature (the target 'smoking' is excluded).
feature_cardinality = df_full_train.drop(columns='smoking').nunique()

# Features with fewer than 10 distinct values (named "binary", but this
# covers any low-cardinality column).
binary_features = feature_cardinality[feature_cardinality < 10].index.tolist()

# Features with 10 or more distinct values.
continuous_features = feature_cardinality[feature_cardinality >= 10].index.tolist()

**target variable insight**

In [None]:
# Target column.
y_full_train = df_full_train['smoking']

# Class proportions of the target, rounded to two decimals.
y_full_train.value_counts(normalize=True).round(2)

**plot every feature's distribution**

In [None]:
# 20-bin histograms of the training-frame columns.
df_full_train.hist(bins=20, figsize=(20,20), color='lightcoral')
plt.show()

In [None]:
# Horizontal box plots (vert=False) of the training-frame columns, grid lines off.
df_full_train.boxplot(figsize=(20,20), grid=False, vert=False)
plt.show()

**Correlation analysis**

In [None]:
# Compute the correlation matrix of the training frame.
corr_matrix = df_full_train.corr()

# Visualise it as an annotated heatmap with a diverging palette centred on 0
# and the colour scale pinned to [-1, 1].
heatmap_options = dict(annot=True, fmt=".2f", cmap='coolwarm', center=0, vmin=-1, vmax=1)
plt.figure(figsize=(20, 20))
sns.heatmap(corr_matrix, **heatmap_options)
plt.title("Correlation Matrix")
plt.show()


In [None]:
# Keep only entries with |correlation| >= 0.8; all other cells are shown as NaN.
corr_matrix[abs(corr_matrix) >= 0.8]

**Drop highly correlated features**

In [None]:
# delete features with high correlation (keep one, delete the other)
# df_full_train = df_full_train.drop(columns=['waist(cm)', 'cholesterol'])
# df_test= df_test.drop(columns=['waist(cm)', 'cholesterol'])

In [None]:
# Recompute the correlation matrix on the current training frame.
corr_matrix = df_full_train.corr()

# Correlation of every feature with the target; the target's own entry is dropped.
df_corr = corr_matrix['smoking'].drop(index='smoking').to_frame(name='CORR')

df_corr

In [None]:
# mutual info function
from sklearn.metrics import mutual_info_score
def calculate_mi(series):
    """Return the mutual information between `series` and the target column.

    Parameters
    ----------
    series : one column of the training frame (passed in by `DataFrame.apply`).

    Returns
    -------
    The value of `mutual_info_score(series, df_full_train.smoking)`.

    Notes
    -----
    Reads the module-level `df_full_train`, so `series` must have the same
    number of rows as that frame.
    NOTE(review): `mutual_info_score` compares two discrete labelings; for
    the continuous columns `mutual_info_classif` may be the better
    estimator - confirm before relying on these values.
    """
    return mutual_info_score(series, df_full_train.smoking)

In [None]:
# Mutual information of every feature with the target, as a one-column frame.
feature_frame = df_full_train.drop(columns='smoking')
df_mi = feature_frame.apply(calculate_mi).to_frame(name='MI')

df_mi

In [None]:
# Side-by-side table of the correlation (CORR) and mutual information (MI) per feature.
df_corr_mi = pd.concat([df_corr, df_mi], axis=1)

df_corr_mi

# Adversarial Validation

In [None]:
# Build the adversarial-validation dataset: label every row with its origin
# (0 = training set, 1 = test set) and stack the two sets.
#
# `assign` returns new frames, so neither df_full_train nor df_test gains a
# 'belong' column. Writing the label into a plain alias of df_test would add
# 'belong' to df_test itself, leak it into X_test, and make prediction on the
# test set fail with a feature mismatch.
av_train = df_full_train.drop(columns='smoking').assign(belong=0)
av_test = df_test.assign(belong=1)

av_full = pd.concat([av_train, av_test], axis=0)

In [None]:
# Features and origin label for the adversarial classifier.
av_y = av_full['belong']
av_X = av_full.drop(columns='belong')

# Hold out 20% of the stacked rows to evaluate the adversarial model.
av_X_train, av_X_val, av_y_train, av_y_val = train_test_split(
    av_X, av_y, test_size=0.2, random_state=1
)

# Sanity check: the two parts of each split must add up to the whole.
for whole, part_train, part_val in ((av_X, av_X_train, av_X_val), (av_y, av_y_train, av_y_val)):
    assert len(whole) == len(part_train) + len(part_val)

In [None]:
# Adversarial model: a random forest trained to tell training rows from test rows.
print("\n=== 随机森林对抗模型 ===")
rf_model = RandomForestClassifier(n_estimators=100, random_state=1).fit(av_X_train, av_y_train)

# Evaluate on the held-out rows using the predicted probability of class 1 (test set).
rf_pred = rf_model.predict_proba(av_X_val)[:, 1]
rf_auc = roc_auc_score(av_y_val, rf_pred)
print(f"AUC: {rf_auc:.4f}")

# Interpretation: an AUC below 0.55 means the two sets are hard to tell apart,
# above 0.7 means they differ clearly, anything in between is a mild difference.
print("\n=== 结果解读 ===")
if rf_auc < 0.55:
    verdict_lines = ["✅ 训练集和测试集分布一致(AUC接近0.5)"]
elif rf_auc > 0.7:
    verdict_lines = [
        "⚠️ 警告：训练集和测试集分布差异显著！",
        "可能原因：",
        "- 数据来自不同时间段/来源",
        "- 测试集包含训练集未见的特征值",
    ]
else:
    verdict_lines = ["🔍 分布有轻微差异，建议检查特征重要性"]
print(*verdict_lines, sep="\n")

---------------

# Data preparation

In [None]:
# Random seed shared by the splits, cross-validation and models below.
seed=1

In [None]:
# Separate the training frame into features and target.
X_full_train = df_full_train.drop(columns='smoking')
y_full_train = df_full_train['smoking']

# 80/20 hold-out split of the training data.
X_train, X_val, y_train, y_val = train_test_split(X_full_train, y_full_train, test_size=0.2, random_state=seed)

# Select exactly the training feature columns, in training order, instead of
# aliasing df_test. Any helper column added to df_test earlier in the notebook
# (e.g. the adversarial-validation label) is then left out of the test matrix,
# and a missing feature raises a KeyError here rather than at prediction time.
X_test = df_test[X_full_train.columns]

# Sanity check: the split must not lose or duplicate rows.
assert len(X_full_train) == len(X_train) + len(X_val)
assert len(y_full_train) == len(y_train) + len(y_val)

# Modelling

## Base Model

In [None]:
# Baseline: logistic regression scored with shuffled 5-fold cross-validation.
nfolds = 5
kfold = KFold(n_splits=nfolds, shuffle=True, random_state=seed)

aucs, praucs = [], []

for train_idx, val_idx in kfold.split(df_full_train):
    # Rows belonging to the current fold.
    df_train, df_val = df_full_train.iloc[train_idx], df_full_train.iloc[val_idx]

    # Separate features from the target on both sides of the fold.
    X_train, y_train = df_train.drop(columns='smoking'), df_train['smoking'].values
    X_val, y_val = df_val.drop(columns='smoking'), df_val['smoking'].values

    # Fit a fresh model per fold; class_weight='balanced' reweights the classes.
    model = LogisticRegression(solver='liblinear', class_weight='balanced', random_state=seed)
    model.fit(X_train, y_train)

    # Probability of the positive class on the validation rows.
    y_pred = model.predict_proba(X_val)[:, 1]

    # Fold metrics: ROC-AUC and PR-AUC (average precision).
    auc = roc_auc_score(y_val, y_pred)
    prauc = average_precision_score(y_val, y_pred)
    aucs.append(auc)
    praucs.append(prauc)

print('Logistic - ROC-auc = %0.4f ± %0.4f' % (np.mean(aucs), np.std(aucs)))
print('Logistic - PR-auc = %0.4f ± %0.4f' % (np.mean(praucs), np.std(praucs)))

## Advanced Model

In [None]:
# Random forest - 5 fold (n_estimator | max_depth | min_samples_split | min_sample_leaf)
models = [
    ("Random Forest", RandomForestClassifier(random_state=seed))
]

# Both metrics are collected in one cross-validation pass. Calling
# cross_val_score once per metric would fit every model twice per fold for
# the same numbers.
cv_scoring = {'roc_auc': 'roc_auc', 'pr_auc': 'average_precision'}

for name, model in models:
    cv_results = cross_validate(model, X_full_train, y_full_train, cv=5, scoring=cv_scoring)
    # cross_validate reports each metric under the key 'test_<scorer name>'.
    roc_auc_scores = cv_results['test_roc_auc']
    pr_auc_scores = cv_results['test_pr_auc']
    print('%s - CV ROC-auc Score: %.4f ± %.4f' %(name, np.mean(roc_auc_scores), np.std(roc_auc_scores)))
    print('%s - CV PR-auc Score: %.4f ± %.4f' %(name, np.mean(pr_auc_scores), np.std(pr_auc_scores)))
    


In [None]:
# XGBoost with default hyperparameters, scored on the same shuffled 5-fold
# splitter (`kfold`) as the logistic-regression baseline.
aucs, praucs = [], []

for fit_rows, eval_rows in kfold.split(df_full_train):
    # Rows belonging to the current fold.
    df_train, df_val = df_full_train.iloc[fit_rows], df_full_train.iloc[eval_rows]

    # Separate features from the target on both sides of the fold.
    X_train, y_train = df_train.drop(columns='smoking'), df_train['smoking'].values
    X_val, y_val = df_val.drop(columns='smoking'), df_val['smoking'].values

    # Fit a fresh classifier per fold.
    model_xgb = xgb.XGBClassifier(objective='binary:logistic', random_state=seed)
    model_xgb.fit(X_train, y_train)

    # Probability of the positive class on the validation rows.
    y_pred = model_xgb.predict_proba(X_val)[:, 1]

    # Fold metrics: ROC-AUC and PR-AUC (average precision).
    auc = roc_auc_score(y_val, y_pred)
    prauc = average_precision_score(y_val, y_pred)
    aucs.append(auc)
    praucs.append(prauc)

print('XGBoost - ROC-auc = %0.4f ± %0.4f' % (np.mean(aucs), np.std(aucs)))
print('XGBoost - PR-auc = %0.4f ± %0.4f' % (np.mean(praucs), np.std(praucs)))


## Hyperparameter Tuning

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

np.random.seed(seed=seed)

# Hyperparameter distributions sampled by the search.
# scipy's uniform(loc, scale) covers [loc, loc + scale], i.e. eta in [0.01, 0.31].
param_dist = {
    'max_depth': randint(3, 10),
    'eta': uniform(0.01, 0.3),
    'min_child_weight': randint(1, 30)
}

# Random search over 20 candidates with 5-fold CV, ranked by PR-AUC.
# refit=True retrains the best candidate on the full training data.
# random_state pins the candidate sampling to `seed`, so the search is
# reproducible even if other code has drawn from the global NumPy RNG since
# the np.random.seed call above.
random_search = RandomizedSearchCV(
    estimator=xgb.XGBClassifier(objective='binary:logistic', random_state=seed),
    param_distributions=param_dist,
    n_iter=20,
    cv=5,
    scoring='average_precision',
    refit=True,
    random_state=seed
)

random_search.fit(X_full_train, y_full_train)

# Report the best parameters and keep the refitted best model.
print("Best Parameters:", random_search.best_params_)
print("Best PR-AUC Score: ", random_search.best_score_)
best_model_xgb = random_search.best_estimator_


In [None]:
# Feature importance of the tuned XGBoost model.
xgb.plot_importance(best_model_xgb)
plt.show()

## Model Testing

In [None]:
# Final model testing: score the tuned model on the full training data and
# write its test-set predictions to a submission file.
# Only the tuned XGBoost model is evaluated; it was already refitted on the
# full training data by the search, so it is not fitted again here.
models = [("XGBoost", best_model_xgb)]

for name, model in models:
    # PR-AUC on the full training set (data the model was fitted on).
    y_pred = model.predict_proba(X_full_train)[:, 1]
    full_train_score = average_precision_score(y_full_train, y_pred)
    print('%s - full train score: %.4f' % (name, full_train_score))

    # Predicted positive-class probabilities for the test set, saved as a CSV.
    y_test_pred = model.predict_proba(X_test)[:, 1]
    sample_submission['smoking'] = y_test_pred
    sample_submission.to_csv(f'submission{name}.csv', index=False)

In [None]:
def kfold_eval(model, full_train, n_splits=5):
    """Evaluate `model` with shuffled K-fold cross-validation on `full_train`.

    Parameters
    ----------
    model : estimator exposing `fit(X, y)` and `predict_proba(X)`.
        It is refitted in place on every fold.
    full_train : pandas.DataFrame
        Training data including the 'smoking' target column.
    n_splits : int, default 5
        Number of folds.

    Returns
    -------
    tuple
        `(mean, std)` of the per-fold ROC-AUC scores.

    Notes
    -----
    The fold shuffling uses the module-level `seed`.
    """
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    aucs = []
    for train_idx, val_idx in kfold.split(full_train):
        # Slice the frame that was passed in, not the global df_full_train,
        # so the function evaluates the data the caller supplied.
        df_train = full_train.iloc[train_idx]
        df_val = full_train.iloc[val_idx]

        X_train = df_train.drop(columns='smoking')
        X_val = df_val.drop(columns='smoking')

        y_train = df_train.smoking.values
        y_val = df_val.smoking.values

        model.fit(X_train, y_train)

        # Probability of the positive class on the validation rows.
        y_pred = model.predict_proba(X_val)[:, 1]

        aucs.append(roc_auc_score(y_val, y_pred))
    return np.mean(aucs), np.std(aucs)

In [None]:
# import sklearn
# print(sklearn.__version__)  # 保存此版本号

1.7.0


In [28]:
! pipenv install numpy pandas scikit-learn==1.7.0 flask gunicorn xgboost==3.0.4

[1mCreating a Pipfile for this project[0m[1;33m...[0m
[1mPipfile.lock not found, creating[0m[1;33m...[0m
Locking  dependencies[33m...[0m
Locking  dependencies[33m...[0m
[1mUpdated Pipfile.lock [0m
[1m([0m[1m702ad05de9bc9de99a4807c8dde1686f31e0041d7b5f6f6b74861195a52110f5[0m[1m)[0m[1m![0m
To activate this project's virtualenv, run [33mpipenv shell[0m.
Alternatively, run a command inside the virtualenv with [33mpipenv run[0m.
[1;32mInstalling numpy...[0m
✔ Installation Succeeded
[1;32mInstalling pandas...[0m
✔ Installation Succeeded
[1;32mInstalling scikit-learn==1.7.0...[0m
✔ Installation Succeeded
[1;32mInstalling flask...[0m
✔ Installation Succeeded
[1;32mInstalling gunicorn...[0m
✔ Installation Succeeded
[1;32mInstalling xgboost==3.0.4...[0m
✔ Installation Succeeded
To activate this project's virtualenv, run [33mpipenv shell[0m.
Alternatively, run a command inside the virtualenv with [33mpipenv run[0m.
[1mInstalling dependencies from Pipfile

In [29]:
! gunicorn -b 0.0.0.0:9696 predict:app

[2025-09-06 05:01:30 +0000] [37061] [INFO] Starting gunicorn 23.0.0
[2025-09-06 05:01:30 +0000] [37061] [INFO] Listening at: http://0.0.0.0:9696 (37061)
[2025-09-06 05:01:30 +0000] [37061] [INFO] Using worker: sync
[2025-09-06 05:01:30 +0000] [37063] [INFO] Booting worker with pid: 37063
[2025-09-06 05:01:36 +0000] [37061] [INFO] Handling signal: int
^C
[2025-09-06 05:01:36 +0000] [37063] [INFO] Worker exiting (pid: 37063)


In [31]:
! docker build -t mid-project .

[1A[1B[0G[?25l[+] Building 0.0s (0/1)                                          docker:default
[?25h[1A[0G[?25l[+] Building 0.2s (1/2)                                          docker:default
[34m => [internal] load build definition from Dockerfile                       0.0s
[0m[34m => => transferring dockerfile: 326B                                       0.0s
[0m => [internal] load metadata for docker.io/library/python:3.12.1-slim      0.1s
[?25h[1A[1A[1A[1A[0G[?25l[+] Building 0.4s (1/2)                                          docker:default
[34m => [internal] load build definition from Dockerfile                       0.0s
[0m[34m => => transferring dockerfile: 326B                                       0.0s
[0m => [internal] load metadata for docker.io/library/python:3.12.1-slim      0.3s
[?25h[1A[1A[1A[1A[0G[?25l[+] Building 0.5s (1/2)                                          docker:default
[34m => [internal] load build definition from Dockerfile     

In [32]:
! docker images

REPOSITORY    TAG       IMAGE ID       CREATED          SIZE
mid-project   latest    24c07bcdf35b   37 seconds ago   1.6GB


In [30]:
# Remove only this project's image. `docker rmi -f $(docker images -aq)` would
# force-delete every image on the host, including unrelated ones.
! docker rmi -f mid-project:latest

Untagged: mid-project:latest
Deleted: sha256:ffe261ef95fe94ea3278bd4a99da6dfe843504f8fe9ad131039a38f98d4c1540


In [33]:
! docker run -it --rm -p 9696:9696  mid-project:latest

[2025-09-06 05:03:37 +0000] [1] [INFO] Starting gunicorn 23.0.0
[2025-09-06 05:03:37 +0000] [1] [INFO] Listening at: http://0.0.0.0:9696 (1)
[2025-09-06 05:03:37 +0000] [1] [INFO] Using worker: sync
[2025-09-06 05:03:37 +0000] [6] [INFO] Booting worker with pid: 6
[2025-09-06 05:04:25 +0000] [1] [INFO] Handling signal: int
^C
[2025-09-06 05:04:26 +0000] [6] [INFO] Worker exiting (pid: 6)
