In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Load training and test datasets
df_train = pd.read_csv("buy_comp_train_data.csv")
df_test = pd.read_csv("buy_comp_test_data.csv")

In [2]:
# Convert categorical variables into numerical format using one-hot encoding
X_train = pd.get_dummies(df_train.drop('Buy Comp', axis=1))
y_train = df_train['Buy Comp']  # Keeping labels as 'Yes' and 'No'

X_test = pd.get_dummies(df_test.drop('Buy Comp', axis=1))
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

m = len(df_train)
# Initialize sample weights
weights = np.ones(m) / m

In [3]:
# Number of weak learners
n_learners = 5
learners = []
alphas = []

# AdaBoost Algorithm
for i in range(n_learners):
    # Train a weak learner (Decision Tree with max_depth=1)
    clf = DecisionTreeClassifier(max_depth=1, random_state=42)
    clf.fit(X_train, y_train, sample_weight=weights)
    
    # Predict on the training set
    y_pred_train = clf.predict(X_train)
    
    # Calculate weighted error based on 'Yes' and 'No'
    incorrect = (y_pred_train != y_train)
    weighted_error = np.dot(weights, incorrect) / np.sum(weights)
    
    # Calculate alpha (learner's weight)
    alpha = 0.5 * np.log((1 - weighted_error) / (weighted_error + 1e-10))
    alphas.append(alpha)
    
    # Update weights: We multiply by exp(+alpha) for misclassified and exp(-alpha) for correctly classified
    weights *= np.exp(alpha * incorrect * 1.0)  # increase weight for misclassified
    weights /= np.sum(weights)  # Normalize weights
    
    # Save the learner
    learners.append(clf)

    # Print progress
    print(f"Trained weak learner {i+1} with weighted error: {weighted_error:.4f}, alpha: {alpha:.4f}")

Trained weak learner 1 with weighted error: 0.3571, alpha: 0.2939
Trained weak learner 2 with weighted error: 0.2764, alpha: 0.4812
Trained weak learner 3 with weighted error: 0.3055, alpha: 0.4107
Trained weak learner 4 with weighted error: 0.3548, alpha: 0.2990
Trained weak learner 5 with weighted error: 0.3549, alpha: 0.2989


In [8]:
incorrect

0      True
1      True
2     False
3     False
4     False
5      True
6     False
7      True
8     False
9     False
10    False
11    False
12    False
13     True
Name: Buy Comp, dtype: bool

In [7]:
alphas

[0.2938933323110594,
 0.4812118248672834,
 0.4106935688369132,
 0.29902652537420915,
 0.298864646328832]

In [6]:
learners

[DecisionTreeClassifier(max_depth=1, random_state=42),
 DecisionTreeClassifier(max_depth=1, random_state=42),
 DecisionTreeClassifier(max_depth=1, random_state=42),
 DecisionTreeClassifier(max_depth=1, random_state=42),
 DecisionTreeClassifier(max_depth=1, random_state=42)]

In [9]:
y_pred_train

array(['Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes',
       'Yes', 'Yes', 'Yes', 'Yes', 'Yes'], dtype=object)

In [4]:
# Predict on test data
test_predictions = np.zeros(X_test.shape[0])

for alpha, clf in zip(alphas, learners):
    # Convert predictions to 1 for 'Yes' and 0 for 'No' for weighted voting
    pred = np.where(clf.predict(X_test) == 'Yes', 1, 0)
    test_predictions += alpha * pred

# Final prediction based on weighted sum (convert back to 'Yes'/'No')
final_predictions = np.where(test_predictions > (0.5 * np.sum(alphas)), 'Yes', 'No')

# Print final predictions for test data
print(f"Predictions for test data: {final_predictions}")


Predictions for test data: ['Yes' 'No']
