In [213]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from collections import Counter

## Load training data

In [238]:
# Load data
df = pd.read_csv('./train.csv', encoding = 'ISO-8859-1')
df.head()

Unnamed: 0,stag,event,gender,age,industry,profession,traffic,coach,head_gender,greywage,way,extraversion,independ,selfcontrol,anxiety,novator
0,79.540041,0,m,23.0,IT,IT,youjs,my head,m,white,car,4.6,5.5,8.7,4.8,4.4
1,7.129363,1,f,21.0,Consult,HR,youjs,my head,m,white,bus,5.4,6.9,7.2,2.5,4.4
2,6.669405,0,f,27.0,Banks,HR,youjs,no,f,white,bus,9.4,5.5,3.3,4.0,7.5
3,20.041068,1,f,25.0,manufacture,HR,advert,yes,f,white,car,9.4,3.4,1.8,8.7,7.5
4,3.811088,1,f,32.0,etc,Marketing,youjs,no,m,white,bus,3.8,6.2,6.4,4.0,6.0


In [239]:
df.select_dtypes(exclude= 'object' ).columns

Index(['stag', 'event', 'age', 'extraversion', 'independ', 'selfcontrol',
       'anxiety', 'novator'],
      dtype='object')

## Load test data

In [240]:
test_df = pd.read_csv("./test.csv", encoding = 'ISO-8859-1')
test_df.head()


Unnamed: 0,stag,event,gender,age,industry,profession,traffic,coach,head_gender,greywage,way,extraversion,independ,selfcontrol,anxiety,novator
0,22.965092,1,m,33.0,Banks,HR,empjs,no,m,white,bus,6.2,4.1,5.7,7.1,8.3
1,8.936345,1,f,24.0,State,etc,advert,yes,f,white,foot,7.0,6.2,4.1,7.1,9.0
2,33.577002,1,f,25.0,etc,HR,KA,my head,f,white,foot,9.4,1.2,4.1,5.6,6.7
3,45.371663,1,f,29.0,etc,HR,referal,no,m,white,bus,3.8,3.4,5.7,8.7,7.5
4,36.960986,1,f,48.0,State,BusinessDevelopment,empjs,no,f,white,bus,4.6,6.2,4.9,5.6,6.7


## Data Preprocessing

In [241]:
# get unique value in every columns for categorical data (combined from test and train dataset)
train_unique_values = {}
for column in df.select_dtypes(include=['object']).columns:
    train_unique_values[column] = df[column].unique()

test_unique_values = {}
for column in test_df.select_dtypes(include=['object']).columns:
    test_unique_values[column] = df[column].unique()

unique_values = {}
for col in train_unique_values:
    unique_values[col] = np.unique(np.concatenate((train_unique_values[col], test_unique_values[col])))

unique_values


{'gender': array(['f', 'm'], dtype=object),
 'industry': array([' HoReCa', 'Agriculture', 'Banks', 'Building', 'Consult', 'IT',
        'Mining', 'Pharma', 'PowerGeneration', 'RealEstate', 'Retail',
        'State', 'Telecom', 'etc', 'manufacture', 'transport'],
       dtype=object),
 'profession': array(['Accounting', 'BusinessDevelopment', 'Commercial', 'Consult',
        'Engineer', 'Finanñe', 'HR', 'IT', 'Law', 'Marketing', 'PR',
        'Sales', 'Teaching', 'etc', 'manage'], dtype=object),
 'traffic': array(['KA', 'advert', 'empjs', 'friends', 'rabrecNErab', 'recNErab',
        'referal', 'youjs'], dtype=object),
 'coach': array(['my head', 'no', 'yes'], dtype=object),
 'head_gender': array(['f', 'm'], dtype=object),
 'greywage': array(['grey', 'white'], dtype=object),
 'way': array(['bus', 'car', 'foot'], dtype=object)}

In [242]:
# create label encoders for categorical data

label_encoders = {}
for column in unique_values:
    label_encoders[column] = LabelEncoder()
    label_encoders[column].fit_transform(unique_values[column])

label_encoders

{'gender': LabelEncoder(),
 'industry': LabelEncoder(),
 'profession': LabelEncoder(),
 'traffic': LabelEncoder(),
 'coach': LabelEncoder(),
 'head_gender': LabelEncoder(),
 'greywage': LabelEncoder(),
 'way': LabelEncoder()}

In [243]:
import pickle

pickle.dump(label_encoders, open("./label_encoders.pickle", "wb"))

In [219]:
# check duplicate values
df.duplicated().sum()

8

In [220]:
# drop duplicates
df = df.drop_duplicates()

In [221]:
print(df.duplicated().sum())
print(df.isna().sum())
df.head()

0
stag            0
event           0
gender          0
age             0
industry        0
profession      0
traffic         0
coach           0
head_gender     0
greywage        0
way             0
extraversion    0
independ        0
selfcontrol     0
anxiety         0
novator         0
dtype: int64


Unnamed: 0,stag,event,gender,age,industry,profession,traffic,coach,head_gender,greywage,way,extraversion,independ,selfcontrol,anxiety,novator
0,79.540041,0,m,23.0,IT,IT,youjs,my head,m,white,car,4.6,5.5,8.7,4.8,4.4
1,7.129363,1,f,21.0,Consult,HR,youjs,my head,m,white,bus,5.4,6.9,7.2,2.5,4.4
2,6.669405,0,f,27.0,Banks,HR,youjs,no,f,white,bus,9.4,5.5,3.3,4.0,7.5
3,20.041068,1,f,25.0,manufacture,HR,advert,yes,f,white,car,9.4,3.4,1.8,8.7,7.5
4,3.811088,1,f,32.0,etc,Marketing,youjs,no,m,white,bus,3.8,6.2,6.4,4.0,6.0


In [222]:
### FOR TRAIN DATA
# Handle missing values if any (simple strategy: fill with median for numerical, most common for categorical)
std_scalers = {}
svm_df = df.copy()

for column in df.columns:
    if df[column].dtype == 'object':  # Categorical
        df[column] = df[column].fillna(df[column].mode()[0])
        svm_df[column] = df[column]
    else:  # Numerical
        std_scalers[column] = StandardScaler()
        df[column] = df[column].fillna(df[column].median())
        # dont standardized for random forest
        svm_df[column] = std_scalers[column].fit_transform(df[[column]])

# Encode categorical variables
for column in unique_values:
    df[column] = label_encoders[column].transform(df[column])
    svm_df[column] = label_encoders[column].transform(svm_df[column])

X = df.drop('event', axis=1).values
y = df['event'].values

In [223]:
svm_df_test = test_df.copy()

### FOR TEST DATA
# Handle missing values and encode categorical variables just like for the training data
for column in test_df.columns:
    if test_df[column].dtype == 'object':  # Categorical
        test_df[column] = test_df[column].fillna(test_df[column].mode()[0])
        test_df[column] = label_encoders[column].transform(test_df[column])  # Use the same encoders
        svm_df_test[column] = test_df[column]
    else:  # Numerical
        test_df[column] = test_df[column].fillna(test_df[column].median())
        # dont standardized for random forest
        svm_df_test[column] = std_scalers[column].transform(test_df[[column]])

X_test = test_df.drop('event', axis=1).values
y_test = test_df['event'].values

X_svm_test = svm_df_test.drop('event', axis=1).values
y_svm_test = svm_df_test['event'].values


In [224]:
svm_df_test

Unnamed: 0,stag,event,gender,age,industry,profession,traffic,coach,head_gender,greywage,way,extraversion,independ,selfcontrol,anxiety,novator
0,-0.391202,0.994429,1,0.269258,2,6,2,1,1,1,0,0.332707,-0.804395,0.043093,0.814203,1.290906
1,-0.804290,0.994429,0,-1.033842,11,13,1,2,0,1,2,0.770608,0.424360,-0.775334,0.814203,1.662439
2,-0.078725,0.994429,0,-0.889053,13,6,0,0,0,1,2,2.084312,-2.501249,-0.775334,-0.052348,0.441688
3,0.268578,0.994429,0,-0.309897,13,6,6,1,1,1,0,-0.980997,-1.213981,0.043093,1.738525,0.866297
4,0.020919,0.994429,0,2.441091,11,1,2,1,0,1,0,-0.543095,0.424360,-0.366120,-0.052348,0.441688
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
221,-1.045177,0.994429,0,1.717147,10,6,2,1,0,1,0,0.770608,0.014775,-0.775334,-0.976670,-0.354454
222,1.723577,-1.005602,0,-0.020319,10,6,6,2,1,1,2,0.332707,0.424360,-1.184547,-0.976670,1.662439
223,1.723577,-1.005602,0,-0.020319,10,6,6,2,1,1,2,-0.105194,0.424360,-0.366120,0.352043,0.866297
224,-0.698841,-1.005602,1,2.441091,14,9,3,1,1,1,0,-2.294701,0.833946,0.401155,1.738525,0.070155


# SVM

In [225]:
class SVM:
    def __init__(self, learning_rate=0.001, lambda_param=0.01, n_iters=1000):
        self.lr = learning_rate
        self.lambda_param = lambda_param
        self.n_iters = n_iters
        self.w = None
        self.b = None

    def fit(self, X, y):
        y_ = np.where(y <= 0, -1, 1)
        n_samples, n_features = X.shape

        self.w = np.zeros(n_features)
        self.b = 0

        for _ in range(self.n_iters):
            for idx, x_i in enumerate(X):
                condition = y_[idx] * (np.dot(x_i, self.w) - self.b) >= 1
                if condition:
                    self.w -= self.lr * (2 * self.lambda_param * self.w)
                else:
                    self.w -= self.lr * (2 * self.lambda_param * self.w - np.dot(x_i, y_[idx]))
                    self.b -= self.lr * y_[idx]

    def predict(self, X):
        approx = np.dot(X, self.w) - self.b
        return np.sign(approx)

In [226]:
# Transform y to -1 and 1
X_svm = svm_df.drop('event', axis=1).values
y_svm = svm_df['event'].values
y_svm = np.where(y_svm <= 0, -1, 1)

In [227]:
# Training the model
svm_model = SVM(n_iters=3000)
svm_model.fit(X_svm, y_svm)

In [228]:
svm_model.w
X.shape

(895, 15)

In [229]:
# Making svm predictions
y_svm_pred = svm_model.predict(X_svm_test)

# Check accuracy
y_svm_test = np.where(y_svm_test <= 0, -1, 1)
accuracy = np.mean(y_svm_pred == y_svm_test)
print(f"SVM Accuracy: {accuracy}")

SVM Accuracy: 0.5265486725663717


# Random Forest

In [230]:

# Decision Tree Node
class DecisionNode:
    def __init__(self, feature_index=None, threshold=None, left=None, right=None, info_gain=None, value=None):
        # for decision node
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right
        self.info_gain = info_gain

        # for leaf node
        self.value = value

# Tree building functions
def entropy(y):
    class_labels = np.unique(y)
    entropy = 0
    for cls in class_labels:
        p_cls = len(y[y == cls]) / len(y)
        entropy -= p_cls * np.log2(p_cls)
    return entropy

def split_data(X, y, feature_index, threshold):
    left_indices = np.argwhere(X[:, feature_index] <= threshold).flatten()
    right_indices = np.argwhere(X[:, feature_index] > threshold).flatten()
    return (X[left_indices], y[left_indices]), (X[right_indices], y[right_indices])

def calculate_info_gain(parent, l_child, r_child):
    weight_l = len(l_child[1]) / len(parent[1])
    weight_r = len(r_child[1]) / len(parent[1])
    gain = entropy(parent[1]) - (weight_l * entropy(l_child[1]) + weight_r * entropy(r_child[1]))
    return gain

def best_split(X, y, features_indices):
    best_split = {}
    max_info_gain = -float("inf")

    # loop over all the features
    for feature_index in features_indices:
        feature_values = X[:, feature_index]
        possible_thresholds = np.unique(feature_values)
        # loop over all the feature values present in the data
        for threshold in possible_thresholds:
            # get current split
            (X_left, y_left), (X_right, y_right) = split_data(X, y, feature_index, threshold)
            # check if childs are not null
            if len(X_left) > 0 and len(X_right) > 0:
                current_info_gain = calculate_info_gain((X, y), (X_left, y_left), (X_right, y_right))
                if current_info_gain > max_info_gain:
                    best_split["feature_index"] = feature_index
                    best_split["threshold"] = threshold
                    best_split["left"] = (X_left, y_left)
                    best_split["right"] = (X_right, y_right)
                    best_split["info_gain"] = current_info_gain
                    max_info_gain = current_info_gain

    return best_split

def build_tree(X, y, min_samples_split, max_depth, current_depth=0, features_indices=None):
    num_samples, num_features = np.shape(X)
    # init best split
    best_split_result = {}
    # if dataset is pure
    if len(np.unique(y)) == 1:
        leaf_value = np.unique(y)[0]
        return DecisionNode(value=leaf_value)

    # check if subsample size is reached
    if num_samples >= min_samples_split and current_depth <= max_depth:
        if not features_indices:
            features_indices = range(num_features)

        best_split_result = best_split(X, y, features_indices)
        # check if information gain is positive
        if best_split_result["info_gain"] > 0:
            left_subtree = build_tree(best_split_result["left"][0], best_split_result["left"][1], min_samples_split, max_depth, current_depth + 1, features_indices)
            right_subtree = build_tree(best_split_result["right"][0], best_split_result["right"][1], min_samples_split, max_depth, current_depth + 1, features_indices)
            return DecisionNode(best_split_result["feature_index"], best_split_result["threshold"], left_subtree, right_subtree, best_split_result["info_gain"])

    # compute leaf node
    leaf_value = max(y, key=list(y).count)
    return DecisionNode(value=leaf_value)

# Decision Tree Classifier
class DecisionTreeClassifier:
    def __init__(self, min_samples_split=2, max_depth=2):
        self.root = None
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth

    def fit(self, X, y):
        self.root = build_tree(X, y, self.min_samples_split, self.max_depth)

    def predict(self, X):
        preditions = [self.make_prediction(x, self.root) for x in X]
        return preditions

    def make_prediction(self, x, tree):
        if tree.value != None: return tree.value
        feature_val = x[tree.feature_index]
        if feature_val <= tree.threshold:
            return self.make_prediction(x, tree.left)
        else:
            return self.make_prediction(x, tree.right)

# Random Forest Classifier
class RandomForestClassifier:
    def __init__(self, n_trees=100, min_samples_split=2, max_depth=2, n_features=None):
        self.n_trees = n_trees
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.n_features = n_features
        self.trees = []

    def fit(self, X, y):
        self.trees = []
        for _ in range(self.n_trees):
            tree = DecisionTreeClassifier(self.min_samples_split, self.max_depth)
            X_sample, y_sample = self.bootstrap_sample(X, y)
            tree.fit(X_sample, y_sample)
            self.trees.append(tree)

    def bootstrap_sample(self, X, y):
        n_samples = X.shape[0]
        indices = np.random.choice(n_samples, size=n_samples, replace=True)
        return X[indices], y[indices]

    def predict(self, X):
        tree_predictions = np.array([tree.predict(X) for tree in self.trees])
        tree_predictions = np.swapaxes(tree_predictions, 0, 1)
        y_pred = [self.most_common_label(tree_prediction) for tree_prediction in tree_predictions]
        return np.array(y_pred)

    def most_common_label(self, y):
        counter = Counter(y)
        most_common = counter.most_common(1)[0][0]
        return most_common

In [231]:
# Train the model
random_forest_model = RandomForestClassifier(n_trees=100, max_depth=10)
random_forest_model.fit(X, y)

In [232]:
y_rf_pred = random_forest_model.predict(X_test)

# Evaluate the model
accuracy = np.mean(y_rf_pred == y_test)
print(f"Random Forest Accuracy: {accuracy}")

Random Forest Accuracy: 0.6592920353982301


In [233]:
# save models
import pickle
pickle.dump(svm_model, open("./svm_model.pickle", "wb"))
pickle.dump(random_forest_model, open("./random_forest_model.pickle", "wb"))


In [234]:
pickle.dump({
    "svm": svm_model,
    "rf": random_forest_model,
    "label_encoders": label_encoders,
    "std_scalers": std_scalers,
}, open("./models.pickle", "wb"))