In [None]:
import pandas as pd
from xgboost import XGBClassifier

In [20]:
# Load the training data and the test data that predictions are made on.
df, final = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

def simplify_contact(x):
    """Collapse a contact value into two levels.

    Returns 'unknown' when ``x`` equals 'unknown'; every other value is
    reported as 'known_contact'.
    """
    return 'unknown' if x == 'unknown' else 'known_contact'

In [21]:
def first_trans(df, balance_cap=None):
    """Turn the raw columns into numeric / one-hot encoded features.

    The input frame is not modified; a new frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'job', 'marital', 'education', 'default',
        'housing', 'loan', 'contact', 'day', 'month', 'pdays', 'poutcome'
        and 'balance'. Any other columns are passed through unchanged.
    balance_cap : float, optional
        Upper bound applied to 'balance'. When omitted, the 99th percentile
        of ``df['balance']`` itself is used (the original behaviour). Pass
        the cap computed on the training frame when transforming other data
        so that both are clipped at the same threshold.

    Returns
    -------
    pandas.DataFrame
        Frame with the categorical columns replaced by dummy columns,
        'contact' and 'day' removed, and a new 'prev_camp' flag added.
    """
    # One-hot encode one column at a time. get_dummies appends the new columns
    # on the right, so this order fixes the column layout of the result, which
    # matters to any caller that selects features by position.
    for col in ('job', 'marital', 'education'):  # job / education include an 'unknown' level
        df = pd.get_dummies(df, columns=[col], prefix=col)

    # df is now a fresh frame (returned by get_dummies), so the column
    # assignments below do not write into the caller's data.
    yes_no = {'yes': 1, 'no': 0}
    for col in ('default', 'housing', 'loan'):
        # Values other than 'yes' / 'no' become NaN.
        df[col] = df[col].map(yes_no)

    # Reduce 'contact' to unknown vs. known, vectorised instead of a row-wise apply.
    contact_is_unknown = df['contact'] == 'unknown'
    df['contact_simple'] = contact_is_unknown.map(
        {True: 'unknown', False: 'known_contact'}
    )
    df = pd.get_dummies(df, columns=['contact_simple'], prefix='contact')
    df = df.drop(columns=['contact', 'day'])

    df = pd.get_dummies(df, columns=['month'], prefix='month')

    # Derive the flag before the -1 sentinel in 'pdays' is overwritten.
    df['prev_camp'] = (df['pdays'] != -1).astype(int)
    df['pdays'] = df['pdays'].replace(-1, 999)
    # poutcome also has an 'unknown' level; the original author's note says it
    # may mean the previous campaign is still ongoing (TODO confirm).
    df = pd.get_dummies(df, columns=['poutcome'], prefix='poutcome')

    if balance_cap is None:
        balance_cap = df['balance'].quantile(0.99)
    df['balance'] = df['balance'].clip(upper=balance_cap)

    return df

In [22]:
# Display the column names of the training frame as loaded from CSV
# (first_trans has not been applied yet at this point in the notebook).
df.columns

Index(['id', 'age', 'job', 'marital', 'education', 'default', 'balance',
       'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign',
       'pdays', 'previous', 'poutcome', 'y'],
      dtype='object')

In [23]:
# Training frame: engineer the features, then discard the row identifier.
df = first_trans(df).drop(columns=['id'])

# Test frame: same feature engineering; pop() removes 'id' from the frame
# and hands it back so the ids can be written out with the predictions.
final = first_trans(final)
final_id = final.pop('id')

In [24]:
# Separate the feature matrix from the target column 'y'.
X, y = df.drop(columns=['y']), df['y']

In [25]:
xgb = XGBClassifier(
    colsample_bytree=0.8,
    learning_rate=0.1,
    max_depth=7,
    n_estimators=200,
    eval_metric='logloss',
    random_state=42,
)

# Positions, within the training feature matrix X, of the features kept for the model.
selected_indices = [3,  4,  5, 10, 28, 30, 36, 37, 40, 41, 45, 46]

X_selected = X.iloc[:, selected_indices]

# Select the test features by NAME, not by position. Train and test are
# one-hot encoded independently, so a category level missing from one file
# would shift the positions and silently pick different columns. Reindexing
# by the training column names keeps both frames aligned; a dummy column
# absent from the test frame is filled with 0 (level not present).
final_selected = final.reindex(columns=X_selected.columns, fill_value=0)

xgb.fit(X_selected, y)

# Probability of the positive class (column 1 of predict_proba).
y_test_proba = xgb.predict_proba(final_selected)[:, 1]

In [None]:
# Pair each test id with its predicted positive-class probability
# and write the result without the index column.
submission = final_id.to_frame(name="id").assign(y=y_test_proba)
submission.to_csv("submission.csv", index=False)