In [48]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier



In [49]:
file = pd.read_csv('df2_encoded.csv')
df = pd.DataFrame(file)
df.head()

Unnamed: 0,stag,event,gender,age,industry,profession,traffic,coach,head_gender,greywage,way,extraversion,independ,selfcontrol,anxiety,novator
0,7.030801,1,1,35.0,0,0,1,0,0,1,1,6.2,4.1,5.7,7.1,8.3
1,22.965092,1,1,33.0,0,0,1,0,1,1,1,6.2,4.1,5.7,7.1,8.3
2,15.934292,1,0,35.0,1,0,1,0,1,1,1,6.2,6.2,2.6,4.8,8.3
3,15.934292,1,0,35.0,1,0,1,0,1,1,1,5.4,7.6,4.9,2.5,6.7
4,8.410678,1,1,32.0,0,0,0,1,0,1,1,3.0,4.1,8.0,7.1,3.7


In [58]:


target = df['event']

# Drop the target variable from the features
features = df.drop('event', axis=1)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42, stratify=target)

clf1 = RandomForestClassifier(bootstrap=False, criterion='entropy', max_features=6, min_samples_leaf=2, min_samples_split=5, n_estimators=30,
random_state=42)
clf2 = GaussianNB(var_smoothing=1e-13)
clf3 = LogisticRegression(max_iter = 1000, random_state = 42, C=1, solver='sag', penalty='l2')
clf4 = DecisionTreeClassifier(min_samples_split=2, min_samples_leaf=1)
clf5 = MLPClassifier(max_iter=10000, learning_rate_init=0.01, random_state=42 ,tol=1e-3, early_stopping=True, activation='tanh', alpha= 0.0001, hidden_layer_sizes= (30,), learning_rate= 'constant', solver= 'adam')
clf6 = KNeighborsClassifier(algorithm= 'auto', leaf_size= 10, metric= 'manhattan', n_neighbors= 5, weights= 'distance')
clf7 = SVC(decision_function_shape= 'ovo', kernel= 'linear', probability=True)

voting_clf = VotingClassifier(
    estimators=[('RandomForestClassifier', clf1), ('GaussianNB', clf2), ('LogisticRegression', clf3), ('DecisionTreeClassifier', clf4), ('MLPClassifier', clf5), ('KNeighborsClassifier', clf6), ('SVC', clf7)],
    voting='soft'  # Use 'soft' for probabilities, 'hard' for predictions
)


In [63]:
def myvoting(X_train, X_test, y_train, y_test):
    # Define the classifiers to be included in the ensemble

    # Fit the ensemble classifier to the data
    voting_clf.fit(X_train, y_train)
    voting_clf.score(X_train, y_train)
    voting_clf.score(X_test, y_test)
    print(f"Accuracy Training: {voting_clf.score(X_train, y_train).round(2)}")
    print(f"Accuracy Testing: {voting_clf.score(X_test, y_test).round(2)}")
    print("*" * 50)

        

# Call the function for the training and testing data no Scaling

In [64]:
myvoting(X_train, X_test, y_train, y_test)


Accuracy Training: 1.0
Accuracy Testing: 0.64
**************************************************


# Call the function for the training and testing data with Standard Scaling

In [65]:
scalerS = StandardScaler()
X_train2 = scalerS.fit_transform(X_train)
X_test2 = scalerS.fit_transform(X_test)

myvoting(X_train2, X_test2, y_train, y_test)

Accuracy Training: 1.0
Accuracy Testing: 0.62
**************************************************


# Call the function for the training and testing data with MinMax Scaling


In [66]:

scalerM = MinMaxScaler()
X_train3 = scalerM.fit_transform(X_train)
X_test3 = scalerM.fit_transform(X_test)

myvoting(X_train3, X_test3, y_train, y_test)

Accuracy Training: 1.0
Accuracy Testing: 0.61
**************************************************
