In [1]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.compose import make_column_transformer
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, RandomForestClassifier
from sklearn.impute import KNNImputer
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, train_test_split, GridSearchCV
from sklearn.naive_bayes import CategoricalNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelBinarizer, MinMaxScaler, KBinsDiscretizer
from sklearn.tree import DecisionTreeClassifier

In [2]:
def preprocess_data(df, columns):
    df.drop(columns, axis=1, inplace=True)
    return df.replace('?', np.nan)

columns_to_drop = ['RowID', 'race', 'sex', 'native-country']
categorical_columns = ['workclass', 'education', 'marital-status', 'occupation', 'relationship']

labeled_data = pd.read_excel('./data/existing-customers.xlsx')
labeled_data = preprocess_data(labeled_data, columns_to_drop)

print(labeled_data.info())
print(labeled_data.columns)
print(labeled_data.head())

unlabeled_data = pd.read_excel('./data/potential-customers.xlsx')
unlabeled_data = preprocess_data(unlabeled_data, columns_to_drop)

  warn("Workbook contains no default style, apply openpyxl's default")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       30725 non-null  object
 2   education       32561 non-null  object
 3   education-num   32561 non-null  int64 
 4   marital-status  32561 non-null  object
 5   occupation      30718 non-null  object
 6   relationship    32561 non-null  object
 7   capital-gain    32561 non-null  int64 
 8   capital-loss    32561 non-null  int64 
 9   hours-per-week  32561 non-null  int64 
 10  class           32561 non-null  object
dtypes: int64(5), object(6)
memory usage: 2.7+ MB
None
Index(['age', 'workclass', 'education', 'education-num', 'marital-status',
       'occupation', 'relationship', 'capital-gain', 'capital-loss',
       'hours-per-week', 'class'],
      dtype='object')
   age         workclass  education  education-num      marital

  warn("Workbook contains no default style, apply openpyxl's default")


In [3]:
all_data = pd.concat([labeled_data, unlabeled_data], axis=0)
categorical_unique_list = [all_data[c].unique().tolist() for c in categorical_columns]

In [4]:
def missing_data(df):
    missing_values = df.isnull().sum().reset_index()
    missing_values = missing_values.rename(columns={'index': 'columns', 0: 'missing values'})
    missing_values['Percentage missing'] = missing_values['missing values'] / len(df) * 100
    return missing_values

missing_data(labeled_data)

Unnamed: 0,columns,missing values,Percentage missing
0,age,0,0.0
1,workclass,1836,5.638647
2,education,0,0.0
3,education-num,0,0.0
4,marital-status,0,0.0
5,occupation,1843,5.660146
6,relationship,0,0.0
7,capital-gain,0,0.0
8,capital-loss,0,0.0
9,hours-per-week,0,0.0


In [5]:
X = labeled_data.drop("class", axis=1)
y = labeled_data["class"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [6]:
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore', categories=categorical_unique_list)
oe = OrdinalEncoder(categories=categorical_unique_list)
lb = LabelBinarizer()

ohe_pipeline = make_column_transformer(
    (ohe, categorical_columns),
    remainder='passthrough'
)
oe_pipeline = make_column_transformer(
    (oe, categorical_columns),
    remainder='passthrough'
)
y_train = lb.fit_transform(y_train)

In [7]:
classifiers = [
    DecisionTreeClassifier(max_depth=3),
    KNeighborsClassifier(),
    CategoricalNB(),
    BaggingClassifier(estimator=DecisionTreeClassifier(max_depth=3), n_estimators=10),
    AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=3), n_estimators=50),
    RandomForestClassifier(n_estimators=100, max_depth=3)
]

pipelines = dict()
tree_based_classifiers = ['DecisionTreeClassifier', 'BaggingClassifier', 'AdaBoostClassifier', 'RandomForestClassifier']

for classifier in classifiers:
    if classifier.__class__.__name__ in tree_based_classifiers:
        encoder = ohe_pipeline
    else:
        encoder = oe_pipeline
    steps = [('encoder', encoder), ('scaler', MinMaxScaler()), ('imputer', KNNImputer()), ('sample', SMOTE()), ('model', classifier)]
    if classifier.__class__.__name__ not in tree_based_classifiers:
        steps.insert(3, ('discretizer', KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')))
    pipeline = Pipeline(steps=steps)
    pipelines[classifier.__class__.__name__] = pipeline

In [8]:
for name, pipeline in pipelines.items():
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3)
    scores = cross_val_score(pipeline, X_train, y_train, scoring='recall_weighted', cv=cv, n_jobs=-1)
    score = np.mean(scores)
    print('Recall Score: %.3f' % score)
    print(name, scores)
    pipeline.fit(X_train, y_train)

Recall Score: 0.763
DecisionTreeClassifier [0.76008772 0.77631579 0.75208425 0.77226854 0.76831944 0.76393155
 0.75120667 0.74857394 0.75208425 0.7577885  0.75482456 0.75219298
 0.78367705 0.77621764 0.77139096 0.76700307 0.738043   0.75340061
 0.77051338 0.7538394  0.76798246 0.75131579 0.76480913 0.75910487
 0.76480913 0.82053532 0.75734971 0.7656867  0.76261518 0.75954366]
Recall Score: 0.794
KNeighborsClassifier [0.80657895 0.78289474 0.78806494 0.80473892 0.79464677 0.78762615
 0.79157525 0.77841158 0.79201404 0.79991224 0.79254386 0.79912281
 0.80912681 0.7898201  0.77446248 0.79113646 0.82755595 0.79113646
 0.78499342 0.78806494 0.80219298 0.78464912 0.80254498 0.78543221
 0.80693287 0.78279947 0.8016674  0.785871   0.81526986 0.78543221]
Recall Score: 0.769
CategoricalNB [0.76842105 0.76578947 0.75559456 0.76437034 0.77928916 0.76437034
 0.77358491 0.77314612 0.76700307 0.7819219  0.77324561 0.77192982
 0.7617376  0.76788065 0.77753401 0.75734971 0.76656428 0.75515577
 0.769635

In [9]:
for name, pipeline in pipelines.items():
    y_pred_test = pipeline.predict(X_test)
    print('--------------------------------')
    print(name)
    print(classification_report(lb.transform(y_test), y_pred_test, target_names=lb.classes_))
    print('--------------------------------')

--------------------------------
DecisionTreeClassifier
              precision    recall  f1-score   support

       <=50K       0.94      0.73      0.82      7438
        >50K       0.50      0.84      0.62      2331

    accuracy                           0.76      9769
   macro avg       0.72      0.79      0.72      9769
weighted avg       0.83      0.76      0.77      9769

--------------------------------
--------------------------------
KNeighborsClassifier
              precision    recall  f1-score   support

       <=50K       0.88      0.86      0.87      7438
        >50K       0.59      0.63      0.61      2331

    accuracy                           0.81      9769
   macro avg       0.73      0.74      0.74      9769
weighted avg       0.81      0.81      0.81      9769

--------------------------------
--------------------------------
CategoricalNB
              precision    recall  f1-score   support

       <=50K       0.94      0.75      0.83      7438
        >50K  

In [10]:
parameters_for_testing = {
    "model__estimator__max_features": [2, 5, 10],
    "model__estimator__max_depth": [3, 5, None],
    "model__estimator__min_samples_split": [2, 5, 10],
    "model__estimator__criterion": ['gini', 'entropy'],
}

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=1)
grid_cv = GridSearchCV(estimator=pipelines['BaggingClassifier'], param_grid=parameters_for_testing, scoring='recall_weighted', cv=cv, n_jobs=-1)
result = grid_cv.fit(X_train, y_train)

print("Best: %f using %s" % (result.best_score_, result.best_params_))
print(classification_report(lb.transform(y_test), result.predict(X_test), target_names=lb.classes_))

Best: 0.833450 using {'model__estimator__criterion': 'gini', 'model__estimator__max_depth': None, 'model__estimator__max_features': 10, 'model__estimator__min_samples_split': 10}
              precision    recall  f1-score   support

       <=50K       0.91      0.86      0.89      7438
        >50K       0.62      0.73      0.67      2331

    accuracy                           0.83      9769
   macro avg       0.77      0.80      0.78      9769
weighted avg       0.84      0.83      0.84      9769



In [11]:
y_pred = result.predict_proba(unlabeled_data)

print(y_pred)
print(lb.inverse_transform(y_pred))

[[1.         0.        ]
 [0.80641414 0.19358586]
 [0.48398629 0.51601371]
 ...
 [0.0852381  0.9147619 ]
 [0.38373016 0.61626984]
 [0.04041667 0.95958333]]
['<=50K' '<=50K' '>50K' ... '>50K' '>50K' '>50K']


In [12]:
# x * 0.1 * 980 + (1 - x) * 0.05 * -310 > 10
# => x > 51/227
customer_id = []
expected_profit = 0
for i in range(len(y_pred)):
    estimated_profit = (y_pred[i][1] * 0.1 * 980 + y_pred[i][0] * 0.05 * (-310)) - 10
    if estimated_profit > 0:
        customer_id.append(i)
        expected_profit += estimated_profit

print(expected_profit)
print(len(customer_id))

317984.43212060904
6914


In [14]:
np.savetxt("promotion.txt", customer_id, fmt="%d")