In [1]:
import numpy as np
import pandas as pd

dataset = pd.read_csv('multi_classification_train.csv')
split = 0.8
s = int((dataset.values.shape[0])*split)
x_data = dataset.values[:s,1:21]
y_data = dataset.values[:s,21]
x_cv = dataset.values[s:,1:21]
y_cv = dataset.values[s:,21]

dataset = pd.read_csv('multi_classification_test.csv')
x_test = dataset.values[:,1:21]

dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 21 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ID          12000 non-null  int64  
 1   Feature_1   12000 non-null  float64
 2   Feature_2   12000 non-null  float64
 3   Feature_3   12000 non-null  float64
 4   Feature_4   12000 non-null  float64
 5   Feature_5   12000 non-null  float64
 6   Feature_6   12000 non-null  float64
 7   Feature_7   12000 non-null  float64
 8   Feature_8   12000 non-null  float64
 9   Feature_9   12000 non-null  float64
 10  Feature_10  12000 non-null  float64
 11  Feature_11  12000 non-null  float64
 12  Feature_12  12000 non-null  float64
 13  Feature_13  12000 non-null  float64
 14  Feature_14  12000 non-null  float64
 15  Feature_15  12000 non-null  float64
 16  Feature_16  12000 non-null  float64
 17  Feature_17  12000 non-null  float64
 18  Feature_18  12000 non-null  float64
 19  Feature_19  12000 non-nul

In [2]:
"""
#feature scaling method 1
x_data/= x_data.max(axis=0)
print(x_data)
"""

'\n#feature scaling method 1\nx_data/= x_data.max(axis=0)\nprint(x_data)\n'

In [3]:
"""
#feature scaling method 2
x_data = (x_data - np.mean(x_data,axis = 0))/(x_data.max(axis=0)-x_data.min(axis=0))
print(x_data)
"""

'\n#feature scaling method 2\nx_data = (x_data - np.mean(x_data,axis = 0))/(x_data.max(axis=0)-x_data.min(axis=0))\nprint(x_data)\n'

In [4]:
#feature scaling method 3 (Z-Score)
m = np.mean(x_data, axis = 0)
s = np.std(x_data ,axis = 0)
def f_scale3(x,m,s):
    std=s
    x_f=(x - m)/(std+1e-8)
    return x_f
x_data = f_scale3(x_data,m,s)
x_cv = f_scale3(x_cv,m,s)
x_test = f_scale3(x_test,m,s)
print(x_data)

[[-0.56571807 -0.57317719 -2.15388543 ...  0.67685993  1.7249436
   1.45285191]
 [-0.92033606  0.22120171  0.54538883 ... -0.0352828  -0.99102716
   0.44137434]
 [ 2.66073284 -0.96573101 -1.21500083 ... -3.35956782  1.41955506
   0.70164703]
 ...
 [-0.62960131  0.31057538 -0.45244609 ... -1.76717744  0.90875409
  -0.3668362 ]
 [ 0.3852122  -0.40214937  0.52345662 ...  1.9526018   0.35162801
   1.62795315]
 [ 0.62637405  1.70790627  0.2404448  ...  0.48733446 -0.37363803
  -0.02859623]]


In [5]:
def compute_entropy(y):
    if len(y) == 0:
        return 0
    _, counts = np.unique(y, return_counts=True)
    probabilities = counts / len(y)
    return -np.sum(probabilities * np.log2(probabilities))

In [6]:
def split_dataset(x, node_indices, feature, threshold):
    node_indices = np.array(node_indices)
    left_ = x[node_indices, feature] <= threshold
    right_ = ~left_
    return node_indices[left_], node_indices[right_]

In [7]:
def compute_information_gain(x, y, node_indices, feature, threshold):
    left_indices, right_indices = split_dataset(x, node_indices, feature, threshold)
    if len(left_indices) == 0 or len(right_indices) == 0:
        return 0
    y_node = y[node_indices]
    y_left = y[left_indices]
    y_right = y[right_indices]
    l_weight = len(y_left) / len(y_node)
    r_weight = len(y_right) / len(y_node)
    return compute_entropy(y_node) - (l_weight * compute_entropy(y_left) + r_weight * compute_entropy(y_right))

In [8]:
def find_best_threshold(x, y, node_indices, feature, num_intervals=50):
    feature_values = x[node_indices, feature]
    thresholds = np.linspace(feature_values.min(), feature_values.max(), num_intervals + 1)[:-1]
    information_gains = np.array([compute_information_gain(x, y, node_indices, feature, threshold) for threshold in thresholds])
    max_idx = np.argmax(information_gains)
    return thresholds[max_idx], information_gains[max_idx]

In [9]:
def get_best_split(x, y, node_indices, num_intervals=10):
    num_features = x.shape[1]
    best_feature = -1
    best_threshold = None
    best_info_gain = -np.inf
    for feature in range(num_features):
        threshold, info_gain = find_best_threshold(x, y, node_indices, feature, num_intervals)
        if info_gain > best_info_gain:
            best_info_gain = info_gain
            best_feature = feature
            best_threshold = threshold
    return best_feature, best_threshold

In [10]:
def predict_class(y, node_indices):
    if len(node_indices) == 0:
        return None
    values, counts = np.unique(y[node_indices], return_counts=True)
    return values[np.argmax(counts)]

In [11]:
def show_tree(x, y, node_indices, branch_name, max_depth, current_depth):
    if current_depth == max_depth or len(np.unique(y[node_indices])) == 1:
        predicted_class = predict_class(y, node_indices)
        print(f"{'   ' * current_depth}- {branch_name} leaf node with indices {node_indices}, predicted class: {predicted_class}")
        return
    best_feature, best_threshold = get_best_split(x, y, node_indices, num_intervals=10)
    if best_feature == -1 or best_threshold is None:
        predicted_class = predict_class(y, node_indices)
        print(f"{'   ' * current_depth}- {branch_name} leaf node with indices {node_indices}, predicted class: {predicted_class}")
        return
    print(f"{'   ' * current_depth}- {branch_name} Depth {current_depth}: Split on feature {best_feature} with threshold {best_threshold}")
    left_indices, right_indices = split_dataset(x, node_indices, best_feature, best_threshold)
    show_tree(x, y, left_indices, "Left", max_depth, current_depth + 1)
    show_tree(x, y, right_indices, "Right", max_depth, current_depth + 1)

In [12]:
show_tree(x_data, y_data, np.arange(x_data.shape[0]), "Root", max_depth= 6, current_depth=0)

- Root Depth 0: Split on feature 10 with threshold -0.035359949095335796
   - Left Depth 1: Split on feature 16 with threshold 0.033347013224559685
      - Left Depth 2: Split on feature 1 with threshold 0.9132982694158831
         - Left Depth 3: Split on feature 15 with threshold 0.8224767664216932
            - Left Depth 4: Split on feature 9 with threshold -1.0797372595673171
               - Left Depth 5: Split on feature 7 with threshold -0.21358400625674756
                  - Left leaf node with indices [   60    81   144   203   239   393   748   784   787  1013  1056  1363
  1453  1477  1527  1588  1653  1978  2033  2387  2923  3024  3166  3172
  3212  3264  3306  3338  3568  3603  3661  3857  3863  4057  4246  4403
  4689  4837  5076  5127  5261  5421  5478  5756  5925  5958  6194  6287
  6358  6408  6432  6523  6661  6708  6727  6946  7204  7212  7317  7374
  7637  7724  7742  7862  7905  7958  8015  8270  8322  8327  8392  8518
  8683  9205  9208  9298  9373  9669  9707  

In [13]:
def predict(x, y, samples, max_depth):
    num_samples = samples.shape[0]
    predictions = np.full(num_samples, None)
    sample_indices = np.arange(num_samples)

    def traverse_tree(node_indices, current_depth, sample_indices):
        if current_depth == max_depth or len(np.unique(y[node_indices])) == 1:
            predicted_class = predict_class(y, node_indices)
            predictions[sample_indices] = predicted_class
            return

        best_feature, best_threshold = get_best_split(x, y, node_indices, num_intervals=10)
        if best_feature == -1 or best_threshold is None:
            predicted_class = predict_class(y, node_indices)
            predictions[sample_indices] = predicted_class
            return

        left_indices, right_indices = split_dataset(x, node_indices, best_feature, best_threshold)
        left_ = samples[sample_indices, best_feature] <= best_threshold
        right_ = ~left_

        if np.any(left_):
            traverse_tree(left_indices, current_depth + 1, sample_indices[left_])
        if np.any(right_):
            traverse_tree(right_indices, current_depth + 1, sample_indices[right_])

    traverse_tree(np.arange(x.shape[0]), 0, sample_indices)
    return predictions

In [14]:
def f1_score(y_predic, y_true):
    tp = np.sum((y_true == 1) & (y_predic == 1))
    fp = np.sum((y_true == 0) & (y_predic == 1))
    fn = np.sum((y_true == 1) & (y_predic == 0))

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0

    if precision + recall == 0:
        return 0
    f1 = 2 * (precision * recall) / (precision + recall)
    return f1

In [15]:
predictions_cv = np.array(predict(x_data, y_data, x_cv, max_depth=10)).astype(int)
print("Predictions on cv set:", predictions_cv)
print("F1 score:", f1_score(predictions_cv,y_cv))

predictions_test = np.array(predict(x_data, y_data, x_test, max_depth=10)).astype(int)
print("Predictions on test set:", predictions_test)

Predictions on cv set: [1 3 3 ... 1 3 2]
F1 score: 0.989387361312108
Predictions on test set: [3 1 1 ... 3 4 1]


In [16]:
y_test_pred_df = pd.DataFrame(predictions_test, columns=["Predicted"])
y_test_pred_df.to_csv("y_test_pred.csv", index=False)

print("Predictions saved to 'y_test_pred.csv'.")

Predictions saved to 'y_test_pred.csv'.
