In [4]:
import numpy as np
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt

In [5]:
x_train = np.genfromtxt("data/x_train.csv", delimiter=",", skip_header=1)
best_features = np.genfromtxt("data/x_train.csv", delimiter=",", dtype=str, max_rows=1)
y_train = np.genfromtxt("data/y_train.csv", delimiter=",", skip_header=1)
y_features = np.genfromtxt("data/y_train.csv", delimiter=",", dtype=str, max_rows=1)

In [6]:
x_test = np.genfromtxt("data/x_test.csv",delimiter=",", skip_header=1)

In [7]:
def to_categorical(array, range_min, range_max, n_bins):
    # Filter array to include only values within the specified range
    filtered_values = array[(array >= range_min) & (array <= range_max)]
    
    # Calculate the bin edges using quantiles
    bin_edges = np.quantile(filtered_values, np.linspace(0, 1, n_bins + 1))
    
    def assign_bin(value):
        # Check if the value is NaN
        if np.isnan(value):
            return -1
        
        # If the value is outside the range, return it as is
        if value < range_min or value > range_max:
            return value
        
        # Assign bin based on which range the value falls into
        # We use right=True to ensure that values exactly equal to range_max are included in the last bin
        return np.digitize(value, bin_edges, right=True)
    
    return assign_bin

mapping_dict = {
    "GENHLTH": lambda value: value if value <= 9 else -1,
    "PHYSHLTH": to_categorical(array=x_train[:, best_features=="PHYSHLTH"].flatten(), range_min=0, range_max=30, n_bins=4),
    "MENTHLTH": to_categorical(array=x_train[:, best_features=="MENTHLTH"].flatten(), range_min=0, range_max=30, n_bins=4),
    "POORHLTH": to_categorical(array=x_train[:, best_features=="POORHLTH"].flatten(), range_min=0, range_max=30, n_bins=4),
    "HLTHPLN1": lambda value: value if not np.isnan(value) else -1,
    "MEDCOST": lambda value: value if value <= 7 else -1,
    "CHECKUP1": lambda value: value if value <= 8 else -1,
    "BPHIGH4": lambda value: value if value <= 7 else -1,
    "BPMEDS": lambda value: value if not np.isnan(value) else -1,
    "BLOODCHO": lambda value: value if not np.isnan(value) else -1,
    "CHOLCHK": lambda value: value if not np.isnan(value) else -1,
    # "CVDINFR4": lambda value: 1 if value == 1 else 0,
    # "CVDCRHD4": lambda value: 1 if value == 1 else 0,
    "TOLDHI2": lambda value: value if not np.isnan(value) else -1,
    "CVDSTRK3": lambda value: value if not np.isnan(value) else -1,
    "ASTHMA3": lambda value: value if not np.isnan(value) else -1,
    "ASTHNOW": lambda value: value if not np.isnan(value) else -1,
    "CHCSCNCR": lambda value: value if not np.isnan(value) else -1,
    "CHCOCNCR": lambda value: value if not np.isnan(value) else -1,
    "CHCCOPD1": lambda value: value if not np.isnan(value) else -1,
    "HAVARTH3": lambda value: value if not np.isnan(value) else -1,
    "ADDEPEV2": lambda value: value if not np.isnan(value) else -1,
    "CHCKIDNY": lambda value: value if not np.isnan(value) else -1,
    "DIABETE3": lambda value: value if not np.isnan(value) else -1,
    "SEX": lambda value: value if not np.isnan(value) else -1,
    "MARITAL": lambda value: value if not np.isnan(value) else -1,
    "EDUCA": lambda value: value if not np.isnan(value) else -1,
    "VETERAN3": lambda value: value if not np.isnan(value) else -1,
    "INCOME2": lambda value: value if not np.isnan(value) else -1,
    "INTERNET": lambda value : value if not np.isnan(value) else -1,
    "WTKG3": to_categorical(array=x_train[:, best_features=="WTKG3"].flatten(), range_min=23, range_max=295, n_bins=6),
    "QLACTLM2": lambda value : value if not np.isnan(value) else -1,
    "USEEQUIP": lambda value : value if not np.isnan(value) else -1,
    "BLIND": lambda value : value if not np.isnan(value) else -1,
    "DECIDE": lambda value : value if not np.isnan(value) else -1,
    "DIFFWALK": lambda value : value if not np.isnan(value) else -1,
    "DIFFDRES": lambda value : value if not np.isnan(value) else -1,
    "DIFFALON": lambda value : value if not np.isnan(value) else -1,
    "SMOKE100": lambda value : value if not np.isnan(value) else -1,
    "SMOKDAY2": lambda value : value if not np.isnan(value) else -1,
    "LASTSMK2": lambda value : value if not np.isnan(value) else -1,
    "USENOW3": lambda value : value if not np.isnan(value) else -1,
    "AVEDRNK2": to_categorical(array=x_train[:, best_features=="AVEDRNK2"].flatten(), range_min=1, range_max=76, n_bins=5),
    "DRNK3GE5": to_categorical(array=x_train[:, best_features=="DRNK3GE5"].flatten(), range_min=1, range_max=76, n_bins=5),
    "EXERANY2": lambda value : value if not np.isnan(value) else -1,
    # "EXERHMM1": lambda value: str(value//200) if value <= 959 and value not in [777,999] else -1,
    "LMTJOIN3": lambda value : value if not np.isnan(value) else -1,
    "FLUSHOT6": lambda value : value if not np.isnan(value) else -1,
    "PDIABTST": lambda value : value if not np.isnan(value) else -1,
    "PREDIAB1": lambda value : value if not np.isnan(value) else -1,
    "INSULIN": lambda value : value if not np.isnan(value) else -1,
    "CIMEMLOS": lambda value : value if not np.isnan(value) else -1,
    "_RFHLTH": lambda value : value if not np.isnan(value) else -1,
    "_HCVU651": lambda value : value if not np.isnan(value) else -1,
    "_RFHYPE5": lambda value : value if not np.isnan(value) else -1,
    "_CHOLCHK": lambda value : value if not np.isnan(value) else -1,
    "_RFCHOL": lambda value : value if not np.isnan(value) else -1,
    # "_MICHD": lambda value: value if value <= 2 else -1,
    "_LTASTH1": lambda value : value if not np.isnan(value) else -1,
    "_CASTHM1": lambda value : value if not np.isnan(value) else -1,
    "_DRDXAR1": lambda value : value if not np.isnan(value) else -1,
    "_AGEG5YR": lambda value : value if not np.isnan(value) else -1,
    "_AGE_G": lambda value : value if not np.isnan(value) else -1,
    "HTM4": to_categorical(array=x_train[:, best_features=="HTM4"].flatten(), range_min=0.91, range_max=2.44, n_bins=6),
    "_RFBMI5": lambda value : value if not np.isnan(value) else -1,
    "_EDUCAG": lambda value : value if not np.isnan(value) else -1,
    "_SMOKER3": lambda value : value if not np.isnan(value) else -1,
    "_RFBING5": lambda value : value if not np.isnan(value) else -1,
    "_BMI5CAT": lambda value : value if not np.isnan(value) else -1,
    "_RFDRHV5": lambda value : value if not np.isnan(value) else -1,
    "FTJUDA1_": to_categorical(array=x_train[:, best_features=="FTJUDA1_"].flatten(), range_min=0, range_max=99.99, n_bins=4),
    "MAXVO2_": to_categorical(array=x_train[:, best_features=="MAXVO2_"].flatten(), range_min=0, range_max=50.1, n_bins=6),
    "ACTIN11_": lambda value : value if not np.isnan(value) else -1,
    "ACTIN21_": lambda value : value if not np.isnan(value) else -1,
    "_PACAT1": lambda value : value if not np.isnan(value) else -1,
    "_PA150R2": lambda value : value if not np.isnan(value) else -1,
    "_PA300R2": lambda value : value if not np.isnan(value) else -1,
    "_PASTRNG":  lambda value : value if not np.isnan(value) else -1,
    "_PASTAE1": lambda value : value if not np.isnan(value) else -1,
    "_LMTACT1": lambda value : value if not np.isnan(value) else -1,
    "_LMTWRK1": lambda value : value if not np.isnan(value) else -1,
    "_LMTSCL1": lambda value : value if not np.isnan(value) else -1,
    "_INCOMG": lambda value : value if not np.isnan(value) else -1,
}

In [46]:
from sklearn.naive_bayes import CategoricalNB
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score

def select_features_with_low_nan_ratio(x_train, features_to_check, threshold=0.1):
    nan_ratios = {}
    for feature in features_to_check:
        nan_ratios[feature] = np.sum(np.isnan(x_train[:, best_features == feature])) / len(x_train)

    selected_features = [feature for feature in nan_ratios if nan_ratios[feature] < threshold]

    # print(f"Selected {len(selected_features)} features over {len(features_to_check)}")
    # print(nan_ratios)
    return selected_features

def apply_mapping(x_train, selected_features, mapping_dict):
    x_train_filtered = np.zeros((x_train.shape[0], len(selected_features)))
    for feature in selected_features:
        feature_values = x_train[:, best_features == feature].flatten()
        if feature_values.size > 0:
            x_train_filtered[:, selected_features.index(feature)] = np.array([mapping_dict[feature](value) for value in feature_values])
    return x_train_filtered

def fix_class_imbalance(X, y, target_value=1, dont_balance=False):
    """
    Fix class imbalance by oversampling the minority class or undersampling the majority class.
    
    Parameters:
    X (numpy.ndarray): Feature matrix of shape (n_samples, n_features)
    y (numpy.ndarray): Target vector of shape (n_samples,), containing values -1 and 1
    target_value (int): Class value to balance to (default is 1)
    
    Returns:
    X_balanced (numpy.ndarray): Feature matrix with balanced classes
    y_balanced (numpy.ndarray): Balanced target vector
    """
    if dont_balance:
        return X, y

    # Separate samples by class
    class_1_indices = np.where(y == target_value)[0]
    class_minus_1_indices = np.where(y != target_value)[0]
    
    # Find class counts
    class_1_count = len(class_1_indices)
    class_minus_1_count = len(class_minus_1_indices)
    
    if class_1_count == class_minus_1_count:
        # If classes are already balanced, return the original data
        return X, y
    
    elif class_1_count < class_minus_1_count:
        # If class 1 is the minority, oversample class 1
        oversample_size = class_minus_1_count - class_1_count
        oversampled_indices = np.random.choice(class_1_indices, oversample_size, replace=True)
        new_indices = np.concatenate([np.arange(len(y)), oversampled_indices])
    else:
        # If class -1 is the minority, oversample class -1
        oversample_size = class_1_count - class_minus_1_count
        oversampled_indices = np.random.choice(class_minus_1_indices, oversample_size, replace=True)
        new_indices = np.concatenate([np.arange(len(y)), oversampled_indices])
    
    # Create the balanced dataset
    X_balanced = X[new_indices]
    y_balanced = y[new_indices]
    
    return X_balanced, y_balanced

# calculate the accuracy, precision, recall and F1 score
def accuracy_precision_recall_f1(y_true, y_pred):
    tp = np.sum((y_true == 1) & (y_pred == 1))
    fp = np.sum((y_true == 0) & (y_pred == 1))
    tn = np.sum((y_true == 0) & (y_pred == 0))
    fn = np.sum((y_true == 1) & (y_pred == 0))
    accuracy = (tp + tn) / len(y_true)
    precision = tp / (tp + fp) if tp + fp > 0 else 0
    recall = tp / (tp + fn) if tp + fn > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0
    return accuracy, precision, recall, f1

def split_data(x, y, ratio=0.8):
    indices = np.random.permutation(x.shape[0])
    train_indices = indices[:int(ratio * x.shape[0])]
    test_indices = indices[int(ratio * x.shape[0]):]
    return x[train_indices], y[train_indices], x[test_indices], y[test_indices]

def split_data_k_folds(x, y, n_folds=5):
    # Shuffle the data
    indices = np.random.permutation(x.shape[0])
    
    # Split indices into n equal-sized parts
    fold_sizes = np.full(n_folds, x.shape[0] // n_folds, dtype=int)  # Base size of each fold
    fold_sizes[:x.shape[0] % n_folds] += 1  # Distribute the remainder

    current = 0
    folds = []
    for fold_size in fold_sizes:
        start, stop = current, current + fold_size
        test_indices = indices[start:stop]  # Select current fold as test set
        train_indices = np.concatenate([indices[:start], indices[stop:]])  # Rest are training
        
        x_train, y_train = x[train_indices], y[train_indices]
        x_test, y_test = x[test_indices], y[test_indices]
        folds.append((x_train, y_train, x_test, y_test))
        
        current = stop

    return folds

def cleaning_x_pipeline(x_train, y_train, x_test, features, n_folds=5, dont_balance=False):
    np.random.seed(41)
    # keep only features with less than 10% nan values
    # selected_features = select_features_with_low_nan_ratio(x_train, features, threshold=0.1)
    # keep all features
    selected_features = select_features_with_low_nan_ratio(x_train, features, threshold=1)
    # cleaning
    x_train_filtered_mapped = apply_mapping(x_train, selected_features, mapping_dict)
    x_test_filtered_mapped = apply_mapping(x_test, selected_features, mapping_dict)

    # Label encoding

    le = LabelEncoder()
    combined = np.vstack((x_train_filtered_mapped, x_test_filtered_mapped))
    combined_encoded = np.apply_along_axis(le.fit_transform, 0, combined)
    x_train_encoded = combined_encoded[:x_train_filtered_mapped.shape[0], :]
    x_test_encoded = combined_encoded[x_train_filtered_mapped.shape[0]:, :]

    if n_folds==0:
        # fix class imbalance in the training set
        x_train_encoded_fixed, y_train_fixed = fix_class_imbalance(x_train_encoded, y_train, target_value=1, dont_balance=dont_balance)

        return x_train_encoded, x_train_encoded_fixed, y_train_fixed, x_test_encoded
    else:
        # split the data into k folds
        folds = split_data_k_folds(x_train_encoded, y_train, n_folds=n_folds)
        balanced_folds = []

        for x_train_fold, y_train_fold, x_test_fold, y_test_fold in folds:
            # fix class imbalance in the training set
            x_train_fold_fixed, y_train_fold_fixed = fix_class_imbalance(x_train_fold, y_train_fold, target_value=1, dont_balance=dont_balance)
            balanced_folds.append((x_train_fold, x_train_fold_fixed, y_train_fold, y_train_fold_fixed, x_test_fold, y_test_fold))

        return balanced_folds, x_test_encoded
    
def evaluate_model(x_train, y_train, x_test, final_features, dont_balance=False, n_folds=5, model=CategoricalNB()):
    y_train_mapped = (1 + y_train[:, 1]) / 2
    balanced_folds, x_test_encoded = cleaning_x_pipeline(x_train, y_train_mapped, x_test, final_features, n_folds=n_folds, dont_balance=dont_balance)

    # Initialize array metrics of size n_folds*4
    metrics_train = np.zeros((n_folds, 4))
    metrics_train_fixed = np.zeros((n_folds, 4))
    metrics_test = np.zeros((n_folds, 4))

    for i in range(len(balanced_folds)):
        x_train_fold, x_train_fold_fixed, y_train_fold, y_train_fold_fixed, x_test_fold, y_test_fold = balanced_folds[i]

        print("x", x_train_fold_fixed.shape)
        print("y", y_train_fold_fixed.shape)
        print("x_test", x_test.shape)
        model.fit(x_train_fold_fixed, y_train_fold_fixed)

        # Predict on the train, train_fixed and test set
        y_train_pred = model.predict(x_train_fold)
        y_train_fixed_pred = model.predict(x_train_fold_fixed)
        y_test_pred = model.predict(x_test_fold)

        # Calculate the accuracy, precision, recall and F1 score
        metrics_train[i] = accuracy_precision_recall_f1(y_train_fold, y_train_pred)
        metrics_train_fixed[i] = accuracy_precision_recall_f1(y_train_fold_fixed, y_train_fixed_pred)
        metrics_test[i] = accuracy_precision_recall_f1(y_test_fold, y_test_pred)

    # take the average
    metrics_train = np.mean(metrics_train, axis=0)
    metrics_train_fixed = np.mean(metrics_train_fixed, axis=0)
    metrics_test = np.mean(metrics_test, axis=0)

    return metrics_train, metrics_train_fixed, metrics_test

def fit_predict_model(x_train, y_train, x_test, final_features, dont_balance=False, model=CategoricalNB()):
    y_train_mapped = (1 + y_train[:, 1]) / 2
    x_train_encoded, x_train_encoded_fixed, y_train_fixed, x_test_encoded = cleaning_x_pipeline(x_train, y_train_mapped, x_test, final_features, dont_balance=dont_balance, n_folds=0)

    model.fit(x_train_encoded_fixed, y_train_fixed)
    y_pred = model.predict(x_test_encoded)
    y_train_pred = model.predict(x_train_encoded)

    # print metrics for the training set
    accuracy, precision, recall, f1 = accuracy_precision_recall_f1(y_train_mapped, y_train_pred)
    print(f"Training set: accuracy={accuracy:.2f}, precision={precision:.2f}, recall={recall:.2f}, F1={f1:.5f}")

    return y_pred, y_train_pred

In [47]:
import numpy as np
class RandomForest:

    class Tree:

        def __init__(self, max_depth=3, min_sample_split=2):
            self.max_depth = max_depth
            self.min_sample_split = min_sample_split
            self.root = None

        def fit(self, x, y, depth=0):

            if len(y) == 0:
                return None

            
            if x.shape[0] < self.min_sample_split or len(set(y)) == 1 or depth >= self.max_depth:
                leaf_value = self._calculate_leaf_value(y)
                self.root = RandomForest.TreeNode(value=leaf_value)
                return self.root
                
            
            best_features, best_threshold = self._best_split(x, y)

            left_x = x[:, best_features] <= best_threshold
            right_x = x[:, best_features] > best_threshold

            left_subtree = self.fit(x[left_x], y[left_x], depth + 1)
            right_subtree = self.fit(x[right_x], y[right_x], depth + 1)

            self.root = RandomForest.TreeNode(best_features, best_threshold, left_subtree, right_subtree)
            return self.root


        def predict(self, x):
            return self._traverse_tree(x, self.root)

        def _traverse_tree(self, x, treeNode):
            
            if treeNode.value is not None:
                return np.full((x.shape[0],), treeNode.value)  

            # Go right or left
            left_indices = x[:, treeNode.feature_index] <= treeNode.threshold
            right_indices = x[:, treeNode.feature_index] > treeNode.threshold

            # empty array for now
            predictions = np.empty(x.shape[0])

            # predict for left and right subtree
            if np.any(left_indices):
                predictions[left_indices] = self._traverse_tree(x[left_indices], treeNode.left)
            if np.any(right_indices):
                predictions[right_indices] = self._traverse_tree(x[right_indices], treeNode.right)

            return predictions
            

        def _calculate_leaf_value(self, y):
            value, count = np.unique(y, return_counts=True)
            return value[np.argmax(count)]


        def _gini_impurity(self, y):
            D = len(y)
            #print("N", D)
            _, count = np.unique(y, return_counts=True, axis=0)
            #print("Count", count.shape)
            gini = 1 - np.sum((count / D) ** 2)
            #print("gini", gini.shape)
            return gini
    
        def _entropy(self, y):
            N = y.shape
            _, count = np.unique(y, return_counts=True)
            p = count/N
            
            entropy = np.where(p > 0, -p* np.log2(p), 0)
            
            return entropy.sum()
        
        def _split(self, x, feature, threshold):
            #print(threshold[:,np.newaxis].shape)
            #print(x[np.newaxis, :,feature].shape)
            left_branch = np.where(x[:,feature] <= threshold)[0]
            right_branch = np.where(x[:, feature] > threshold)[0]
            #print("left_b", left_branch)
            return left_branch, right_branch
        

        
        
        def _best_split(self, x, y):

            best_features = 0
            best_threshold = 0

            best_gini_impurity = 1

            N, D = x.shape

            for i in range(D):
                unique = np.unique(x[:, i])
                unique = np.sort(unique)
                
                # Calculate midpoint between possible value
                threshold = (unique[:-1] + unique[1:])/2
                #print("threshold", threshold)

                """
                l, r = self._split(x, i, threshold)
                print("l", l)
                print("r", r)
                y_prime = y[np.newaxis,:] * np.ones(l.shape)
                print("y'", y_prime)
                print("y'", y_prime)

                gini_left = self._gini_impurity(y_prime[l])
                gini_right = self._gini_impurity(y_prime[r])
                print(gini_left)
                print(gini_right)
                

                break
                """
                
                for t in threshold:
                    l, r = self._split(x, i, t)
                    gini_left = self._gini_impurity(y[l])
                    gini_right = self._gini_impurity(y[r])
                    gini = (gini_left * len(l) + gini_right * len(r))/N


                    if gini < best_gini_impurity:
                        best_gini_impurity = gini
                        best_features = i
                        best_threshold = t
                

            
            return best_features, best_threshold


    def __init__(self, n_trees=20, max_depth=10, min_samples_split=2, seed=42):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.rng = np.random.default_rng(seed)

    def fit(self, x, y):
        # Create the trees
        self.list_tree = []
        for i in range(self.n_trees):

            bootstrap_x, bootstrap_y = self._bootstrap_sample(x, y)

            tree = RandomForest.Tree(self.max_depth, self.min_samples_split)

            tree.fit(bootstrap_x, bootstrap_y)


            self.list_tree.append(tree)
        

    def predict(self, x):
        predictions = np.zeros((len(x), self.n_trees))
        for i in range(self.n_trees):
            pred = self.list_tree[i].predict(x)
            #Convert -1 in 0
            predictions[:, i] = np.where(pred == -1, 0, 1)

        dominant_prediction = np.apply_along_axis(lambda p: np.bincount(p.astype(int)).argmax(), axis=1, arr=predictions)
        # Convert back 0 in -1
        dominant_prediction = np.where(dominant_prediction == 0, -1, 1)
        return dominant_prediction

    
    
    class TreeNode:

        def __init__(self, feature_index=None, threshold=None, left=None, right=None, value=None):
            self.feature_index = feature_index
            self.threshold = threshold
            self.left = left
            self.right = right
            self.value = value

    def _bootstrap_sample(self, x, y):

        N, _ = x.shape

        bootstrap_indices = self.rng.choice(N, N, replace=True)
        
        x_bootstrap = x[bootstrap_indices]

        y_bootstrap = y[bootstrap_indices]
        return x_bootstrap, y_bootstrap
    
    


"""
x_r = np.random.randint(20, size=(1000, 3))
y_r = np.random.randint(2, size=(1000))
x = np.array([[1, 2, 0],
              [4, 5, 10],
              [7, 8, 0],
              [10, 11, 12],
              [13, 14, 3]])

y = np.array([0, 1, 0, 1, 0])
y2 = np.array([1, 1, 1, 1, 1])


rf = RandomForest(n_trees=20, max_depth=5, min_samples_split=2)
a = rf.fit(x_r, y_r)
pred = rf.predict(x_r)
print(np.mean(np.abs(pred-y_r)))

"""
        

'\nx_r = np.random.randint(20, size=(1000, 3))\ny_r = np.random.randint(2, size=(1000))\nx = np.array([[1, 2, 0],\n              [4, 5, 10],\n              [7, 8, 0],\n              [10, 11, 12],\n              [13, 14, 3]])\n\ny = np.array([0, 1, 0, 1, 0])\ny2 = np.array([1, 1, 1, 1, 1])\n\n\nrf = RandomForest(n_trees=20, max_depth=5, min_samples_split=2)\na = rf.fit(x_r, y_r)\npred = rf.predict(x_r)\nprint(np.mean(np.abs(pred-y_r)))\n\n'

# Testing by splitting the data into training and testing sets

In [45]:
size_subset = 1000

# Get indices where y_train is -1 and 1
indices_minus1 = np.where(y_train[:, 1] == -1)[0][:500]  # First 500 indices where y_train is -1
indices_plus1 = np.where(y_train[:, 1] == 1)[0][:500]   # First 500 indices where y_train is 1

x_train_subset = np.vstack((x_train[indices_minus1, :], x_train[indices_plus1, :]))
y_train_subset = np.hstack((y_train[indices_minus1, 1], y_train[indices_plus1, 1]))

x_test_subset = x_test[:size_subset]

print(x_train_subset.shape)
print(y_train_subset.shape)
print(x_test_subset.shape)
print(np.unique(y_train_subset, return_counts=True))

rf = RandomForest(n_trees=50, max_depth=12, min_samples_split=5)
a = rf.fit(x_train_subset, y_train_subset)
pred_train = rf.predict(x_train_subset)
pred_test = rf.predict(x_test_subset)

print(np.mean((abs(pred_train - y_train_subset)/2)))
print(np.unique(y_train_subset, return_counts=True))


(1000, 322)
(1000,)
(1000, 322)
(array([-1.,  1.]), array([500, 500]))
0.5
(array([-1.,  1.]), array([500, 500]))


In [44]:
print(np.mean(abs(pred_train - y_train_subset)/2))

0.908


In [49]:
n_folds = 1
final_features = ['_RFHLTH', 'MAXVO2_', 'GENHLTH', 'CVDSTRK3', 'BLOODCHO', 'SEX', 'HLTHPLN1']
metrics_train, metrics_train_fixed, metrics_test = evaluate_model(x_train, y_train, x_test, final_features, dont_balance=False, n_folds=n_folds, model=RandomForest(5))

print(f"Train set: Accuracy={metrics_train[0]:.2f}, Precision={metrics_train[1]:.2f}, Recall={metrics_train[2]:.2f}, F1={metrics_train[3]:.5f}")
print(f"Train set fixed: Accuracy={metrics_train_fixed[0]:.2f}, Precision={metrics_train_fixed[1]:.2f}, Recall={metrics_train_fixed[2]:.2f}, F1={metrics_train_fixed[3]:.5f}")
print(f"Test set: Accuracy={metrics_test[0]:.2f}, Precision={metrics_test[1]:.2f}, Recall={metrics_test[2]:.2f}, F1={metrics_test[3]:.5f}")

(0,)


AttributeError: 'NoneType' object has no attribute 'value'

In [None]:
# # greedy algorithm to find the best set of features that maximizes f1_test
# # add features greedily one by one until the f1_test stops increasing
# from tqdm import tqdm  # Import tqdm for the progress bar

# def greedy_feature_selection(x_train, y_train, x_test, features, dont_balance=False):
#     features = select_features_with_low_nan_ratio(x_train, features, threshold=0.1)
#     n_features = len(features)
#     selected_features = []
#     remaining_features = features.copy()

#     best_f1 = 0
#     progress_bar = tqdm(total=n_features, desc="Selecting Features")

#     while remaining_features:
#         # Track the best feature and F1 score in the current iteration
#         best_feature = None
#         best_f1_iteration = 0
#         # print(remaining_features)
#         # Try adding each remaining feature and evaluate F1 score
#         for feature in remaining_features:
#             current_features = selected_features + [feature]  # Add feature to the selected set
#             # print(current_features)

#             metrics_train, metrics_train_fixed, metrics_test = evaluate_model(x_train, y_train, x_test, current_features, dont_balance=dont_balance, n_folds=5, model=CategoricalNB())
#             f1_test = metrics_test[3]

#             # Check if the current F1 score is the best so far
#             if f1_test > best_f1_iteration:
#                 best_f1_iteration = f1_test
#                 best_feature = feature
#         # Stop if no improvement is made
#         if best_f1_iteration <= best_f1:
#             break
        
#         # Update selected features and remaining features
#         selected_features.append(best_feature)
#         remaining_features.remove(best_feature)
#         best_f1 = best_f1_iteration
#         print(f"Best F1 score: {best_f1}")
#         print(f"Selected features: {selected_features}")
        
#         progress_bar.update(1)  # Update the progress bar
        
#     progress_bar.close()  # Close the progress bar when done
#     return selected_features, best_f1

# selected_features, best_f1 = greedy_feature_selection(x_train, y_train, x_test, list(mapping_dict.keys()), dont_balance=False)
# print(f"Selected features: {selected_features}")

# # found ['_RFHLTH', 'MAXVO2_', 'GENHLTH', 'CVDSTRK3', 'BLOODCHO', 'SEX', 'HLTHPLN1']

# Prediction for the real test set

In [None]:
final_features = ['_RFHLTH', 'MAXVO2_', 'GENHLTH', 'CVDSTRK3', 'BLOODCHO', 'SEX', 'HLTHPLN1']
y_pred_test, y_train_pred = fit_predict_model(x_train, y_train, x_test, final_features, dont_balance=False, model=RandomForest())

In [124]:
Ids = x_test[:,0]
y_pred_test_final = 2*y_pred_test-1

np.savetxt("data/submission_RandomForest_4.csv", np.array([Ids, y_pred_test_final]).T, delimiter=",", fmt="%d", header="Id,Prediction", comments="")