In [36]:
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import f1_score

# Load datasets
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

train.fillna(train.median(numeric_only=True), inplace=True)

for col in train.select_dtypes(include='object').columns:
    train[col].fillna(train[col].mode()[0], inplace=True)

for col in test.select_dtypes(include='object').columns:
    test[col].fillna(test[col].mode()[0], inplace=True)
    
# Separate features and target variable
X_train = train.drop('target', axis=1)
y_train = train['target'].apply(lambda x: 1 if x == 'yes' else 0)  # Binary target

# Handle categorical features directly with CatBoost
categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()

print(categorical_features)

# Prepare the test data
X_test = test.copy()  # No need to drop 'id' since it wasn't in your dataset



['last contact date', 'job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'poutcome']


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train[col].fillna(train[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test[col].fillna(test[col].mode()[0], inplace=True)


In [34]:
# Initialize the CatBoost Classifier
model = CatBoostClassifier(
    iterations=100,         # Number of trees
    learning_rate=0.1,       # Step size shrinkage
    depth=6,                 # Maximum depth of trees
    random_seed=42,
    cat_features=categorical_features,  # Specify categorical features
    verbose=0                # Suppress training logs
)

In [38]:


# Train the model
model.fit(X_train, y_train)

# Make predictions on the train set for evaluation
y_pred_train = model.predict(X_train)
print(f'Train F1 Score (macro): {f1_score(y_train, y_pred_train, average="macro")}')

# Predict on the test set
y_test_pred = model.predict(X_test)
y_test_pred = ['yes' if y == 1 else 'no' for y in y_test_pred]

# Prepare submission file
submission = pd.DataFrame({'id': range(len(y_test_pred)), 'target': y_test_pred})
submission.to_csv('submission2.csv', index=False)

print(submission.head())  # Verify the submission format


Train F1 Score (macro): 0.7542586261559923
   id target
0   0     no
1   1     no
2   2     no
3   3     no
4   4    yes


In [None]:
X_train[]

Unnamed: 0,last contact date,age,job,marital,education,default,balance,housing,loan,contact,duration,campaign,pdays,previous,poutcome
0,40780800.0,26,blue-collar,married,secondary,no,647,yes,no,cellular,357,2,331,1,other
1,56073600.0,52,technician,married,secondary,no,553,yes,no,telephone,160,1,-1,0,
2,91065600.0,44,blue-collar,married,secondary,no,1397,no,no,cellular,326,1,-1,0,
3,52617600.0,33,admin.,married,secondary,no,394,yes,no,telephone,104,3,-1,0,
4,2419200.0,31,entrepreneur,single,tertiary,no,137,no,no,cellular,445,2,-1,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39206,18316800.0,31,technician,single,secondary,no,0,yes,no,cellular,16,34,-1,0,
39207,49852800.0,59,admin.,married,primary,no,6187,no,no,cellular,114,4,-1,0,
39208,13219200.0,36,blue-collar,married,secondary,no,63,yes,no,,56,8,-1,0,
39209,69897600.0,28,student,single,secondary,no,5916,no,no,cellular,93,5,-1,0,


In [29]:
X_train[X_train['job'] == 'Nan']

Unnamed: 0,last contact date,age,job,marital,education,default,balance,housing,loan,contact,duration,campaign,pdays,previous,poutcome


In [8]:
X_train['last contact date'] = pd.to_datetime(X_train['last contact date']).apply(lambda x: x.timestamp())

In [26]:
nan_counts = train.isnull().sum()

# Display the result
print(nan_counts)

last contact date        0
age                      0
job                    229
marital                  0
education             1467
default                  0
balance                  0
housing                  0
loan                     0
contact              10336
duration                 0
campaign                 0
pdays                    0
previous                 0
poutcome             29451
target                   0
dtype: int64
