In [None]:
%pip install pandas
%pip install numpy
%pip install sklearn

In [None]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import classification_report, log_loss
from sklearn.model_selection import train_test_split

In [None]:
# Load the training table; the `id` column becomes the row index.
data = pd.read_csv(
    'train.csv',
    index_col='id',
    low_memory=False,
)
data.head()

In [None]:
# Columns kept for modelling; the last entry, 'class', is the target label.
features = [
    'current_assets_over_short_term_liabilities',
    'EBIT_over_total_assets',
    'net_profit_over_total_assets',
    'short_term_liabilities_over_total_assets',
    'working_capital_over_total_assets',
    'class',
]

In [None]:
# Work on an independent copy of the selected columns so that later in-place
# edits neither reach back into `data` nor trigger SettingWithCopyWarning.
datasets = data[features].copy()
datasets.shape

# Next step: drop every row that has a missing value (encoded as '?') in any selected column.

In [None]:
# Drop every row that holds the '?' placeholder in any of the selected columns.
# One boolean mask replaces the per-column in-place drops: it avoids
# SettingWithCopyWarning on the `data[features]` slice, removes only the
# offending rows even when index labels repeat, and is safe to re-run.
has_missing = datasets[features].eq('?').any(axis=1)
datasets = datasets.loc[~has_missing].copy()

In [None]:
# Show the (rows, columns) of `datasets` at this point, i.e. after the
# preceding cell dropped the rows containing '?'.
datasets.shape

In [None]:
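# The AdaBoost implementation below requires labels y in {-1, +1} (enforced by check), so class 0 is re-encoded as -1.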

In [None]:
# Encode the target for AdaBoost: class 0 becomes -1, other values are kept.
# np.where builds a fresh array, so the `class` column of `datasets` is not
# modified through the array returned by pandas (which may be a view, and is
# read-only under pandas copy-on-write).
class_values = datasets['class'].to_numpy()
y = np.where(class_values == 0, -1, class_values)

# Keep only the predictor columns. Filtering by name (instead of
# `features.pop()`) leaves `features` in the same state after the first run
# but does not strip one more column each time the cell is re-executed.
features = [col for col in features if col != 'class']
X = datasets[features].values

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=102,
)
print('Training set:', X_train.shape, y_train.shape)
print('Val set:', X_val.shape, y_val.shape)

In [None]:
def check(y):
    """Validate that the labels are exactly the two classes -1 and 1.

    Parameters
    ----------
    y : iterable of hashable labels (e.g. a list or 1-D numpy array).

    Returns
    -------
    The same object `y`, unchanged.

    Raises
    ------
    AssertionError
        If the set of distinct labels is not exactly {-1, 1}; this includes
        the case where only one of the two classes is present.
    """
    distinct_labels = set(y)
    assert distinct_labels == {-1, 1}
    return y

In [None]:
def init_model(iters, X):
    """Allocate zero-filled containers for an AdaBoost run.

    Parameters
    ----------
    iters : int
        Number of boosting rounds.
    X : array with a `.shape` attribute
        Training matrix; only its number of rows, `X.shape[0]`, is used.

    Returns
    -------
    tuple
        `(stumps, stump_weights, sample_weights, errors)` where
        `stumps` is an object array of length `iters`,
        `stump_weights` and `errors` are float arrays of length `iters`,
        and `sample_weights` is a float array of shape `(iters, n_rows)`.
    """
    num_rows = X.shape[0]
    stumps = np.zeros(iters, dtype=object)
    stump_weights = np.zeros(iters)
    errors = np.zeros(iters)
    sample_weights = np.zeros((iters, num_rows))
    return stumps, stump_weights, sample_weights, errors

In [None]:
def AdaBoostClf(X, y, iters=10, eps=1e-10):
    """Train a discrete AdaBoost ensemble of depth-1 decision trees (stumps).

    Parameters
    ----------
    X : array of shape (n, d)
        Training features.
    y : array of length n
        Labels; must contain exactly the values -1 and 1 (validated by
        `check`, which raises AssertionError otherwise).
    iters : int, default 10
        Number of boosting rounds, i.e. number of stumps fitted.
    eps : float, default 1e-10
        The weighted error of each stump is clipped to `[eps, 1 - eps]`
        before its weight is computed. Without the clip, a stump that
        classifies every sample correctly has error 0, its weight becomes
        `log(1/0) = inf`, and the re-weighting step produces NaN sample
        weights for every following round.

    Returns
    -------
    tuple
        `(stumps, stump_weights, sample_weights)`:
        `stumps` is an object array holding the fitted classifiers,
        `stump_weights` holds the weight of each stump, and
        `sample_weights[i]` holds the sample distribution used in round `i`.
    """
    n = X.shape[0]
    y = check(y)

    stumps, stump_weights, sample_weights, errors = init_model(iters=iters, X=X)

    # Round 0 starts from the uniform distribution over the samples.
    sample_weights[0] = np.ones(shape=n) / n

    for i in range(iters):
        # Fit the weak learner on the current sample distribution.
        current_sew = sample_weights[i]
        stump = DecisionTreeClassifier(max_depth=1, max_leaf_nodes=2)
        stump = stump.fit(X, y, sample_weight=current_sew)

        # Weighted training error: total weight of the misclassified samples.
        stump_pred = stump.predict(X)
        error = current_sew[stump_pred != y].sum()

        # Keep the error strictly inside (0, 1) so the log-odds stay finite.
        error = np.clip(error, eps, 1 - eps)
        stump_weight = np.log((1 - error) / error) / 2

        # Up-weight misclassified samples (y * pred == -1), down-weight the
        # correctly classified ones (y * pred == +1), then renormalise so the
        # weights sum to one again.
        new_sew = current_sew * np.exp(-1 * stump_weight * y * stump_pred)
        new_sew = new_sew / new_sew.sum()

        # The last round has no successor to hand its weights to.
        if (i + 1) < iters:
            sample_weights[i + 1] = new_sew

        # Record this round. `errors` is filled for inspection while
        # debugging but is not part of the return value.
        errors[i] = error
        stumps[i] = stump
        stump_weights[i] = stump_weight

    return stumps, stump_weights, sample_weights


In [None]:
def predict(X, stumps, stump_weights):
    """Label samples by the sign of the weighted vote of all stumps.

    Parameters
    ----------
    X : array of samples, passed unchanged to each stump's `predict`.
    stumps : iterable of fitted classifiers exposing `predict(X)`.
    stump_weights : sequence of floats, one weight per stump.

    Returns
    -------
    numpy array with one entry per sample: the result of `np.sign` applied
    to the weighted sum of the stump predictions. A weighted sum of exactly
    zero therefore yields 0 rather than -1 or 1.
    """
    votes = np.array([weak_learner.predict(X) for weak_learner in stumps])
    weighted_vote = np.dot(stump_weights, votes)
    return np.sign(weighted_vote)

In [None]:
# Training
# Fit a 10-round AdaBoost ensemble on the training split, then label the
# validation split with it.
stumps1, stump_weights1, sample_weights1 = AdaBoostClf(
    X=X_train,
    y=y_train.reshape(-1),
    iters=10,
)
predt = predict(X_val, stumps1, stump_weights1)


In [None]:
# Show metrics
print("Accuracy score: %f" % accuracy_score(y_val, predt))
print("Confusion Matrix:")
print(confusion_matrix(y_val, predt))
print(classification_report(y_val, predt))

# log_loss expects the predicted probability of the positive class, not hard
# -1/+1 labels, and it already returns the mean over samples, so it must not
# be divided by len(y_val) again.
# The ensemble margin F(x) = sum_i alpha_i * h_i(x) is turned into a
# probability with the logistic link P(y = +1 | x) = 1 / (1 + exp(-2 F(x))),
# the usual probabilistic reading of AdaBoost's exponential loss.
val_margin = np.dot(
    stump_weights1,
    np.array([stump.predict(X_val) for stump in stumps1]),
)
val_proba = 1.0 / (1.0 + np.exp(-2.0 * val_margin))
print('Log loss:', log_loss(y_val, val_proba, labels=[-1, 1]))