In [None]:
# Libraries
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import seaborn as sb
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
import warnings

warnings.filterwarnings("ignore")

In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

In [None]:
# Import datasets
train = pd.read_csv("train_data.csv")
X_test = pd.read_csv("test_data.csv")
samples = pd.read_csv("sampleSubmission.csv")

# Data Cleaning

In [None]:
# Concatenate Train & Test for Data Cleaning
combined_data = pd.concat([train, X_test], axis=0)

# Calculate the mean for filling missing values
mean_values_except_last = combined_data.iloc[:, :-1].mean()

# Fill missing values for all columns except the last column
combined_data.iloc[:, :-1] = combined_data.iloc[:, :-1].fillna(mean_values_except_last)

combined_data

Unnamed: 0,Name,Revenue,Revenue Growth,Cost of Revenue,Gross Profit,R&D Expenses,SG&A Expense,Operating Expenses,Operating Income,Interest Expense,...,3Y Dividend per Share Growth (per Share),Receivables growth,Inventory Growth,Asset Growth,Book Value per Share Growth,Debt Growth,R&D Expense Growth,SG&A Expenses Growth,Sector,Class
0,SBFG,4.520300e+07,0.051400,0.000000e+00,4.520300e+07,0.000000e+00,2.754700e+07,3.157800e+07,1.362500e+07,0.000000e+00,...,0.205100,0.000000,0.00000,0.07430,0.10080,-0.072900,0.000000,0.009600,Financial Services,0.0
1,FOMX,3.669000e+06,-0.336200,1.300000e+04,3.656000e+06,5.777900e+07,1.149100e+07,6.927000e+07,-6.561400e+07,0.000000e+00,...,0.000000,-0.692200,0.00000,-0.40830,-0.54460,-1.000000,1.231100,0.246200,Healthcare,0.0
2,VIA,1.326300e+10,0.953124,7.436000e+09,5.827000e+09,1.061590e+08,3.005000e+09,3.228000e+09,2.599000e+09,9.963788e+07,...,0.005444,9.660352,0.11746,1.08525,0.22719,1.457853,0.383984,0.719938,Consumer Cyclical,0.0
3,ABM,5.453600e+09,0.060000,4.881200e+09,5.724000e+08,0.000000e+00,4.366000e+08,4.706000e+08,1.018000e+08,1.920000e+07,...,0.031300,0.291700,0.00000,0.67310,0.37820,3.391400,0.000000,0.064600,Industrials,0.0
4,THS,6.307100e+09,0.021400,5.226700e+09,1.080400e+09,0.000000e+00,7.007000e+08,1.493200e+09,-4.128000e+08,1.268000e+08,...,0.000000,-0.231200,-0.06100,-0.11710,-0.11810,-0.087900,0.000000,-0.059200,Consumer Defensive,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1483,SUNW,7.744800e+07,-0.103800,6.378500e+07,1.366300e+07,0.000000e+00,1.840800e+07,1.997700e+07,-6.314000e+06,9.240000e+05,...,0.000000,-0.189100,0.31110,-0.01280,-0.32770,-0.528500,0.000000,-0.225400,Technology,
1484,TACT,5.631100e+07,-0.016100,2.964900e+07,2.666200e+07,4.303000e+06,1.554500e+07,1.984800e+07,6.814000e+06,3.300000e+04,...,0.041300,0.034300,-0.08570,0.05950,0.10640,0.000000,-0.027600,0.096700,Technology,
1485,TCCO,3.725006e+06,0.476500,1.917890e+06,1.807116e+06,1.584210e+06,2.144532e+06,3.728742e+06,-1.921626e+06,0.000000e+00,...,0.000000,5.528200,-0.17370,-0.28080,-0.38640,0.000000,0.913300,-0.197000,Technology,
1486,USATP,1.040000e+08,0.953124,7.700000e+07,2.700000e+07,1.061590e+08,2.500000e+07,2.700000e+07,0.000000e+00,1.000000e+06,...,0.005444,9.660352,0.11746,1.08525,0.22719,1.457853,0.383984,0.719938,Technology,


In [None]:
# Drop discrete columns
combined_data.drop(columns=['Name', 'Sector'], inplace=True)

In [None]:
combined_data

Unnamed: 0,Revenue,Revenue Growth,Cost of Revenue,Gross Profit,R&D Expenses,SG&A Expense,Operating Expenses,Operating Income,Interest Expense,Earnings before Tax,...,5Y Dividend per Share Growth (per Share),3Y Dividend per Share Growth (per Share),Receivables growth,Inventory Growth,Asset Growth,Book Value per Share Growth,Debt Growth,R&D Expense Growth,SG&A Expenses Growth,Class
0,4.520300e+07,0.051400,0.000000e+00,4.520300e+07,0.000000e+00,2.754700e+07,3.157800e+07,1.362500e+07,0.000000e+00,1.362500e+07,...,0.000000,0.205100,0.000000,0.00000,0.07430,0.10080,-0.072900,0.000000,0.009600,0.0
1,3.669000e+06,-0.336200,1.300000e+04,3.656000e+06,5.777900e+07,1.149100e+07,6.927000e+07,-6.561400e+07,0.000000e+00,-6.455100e+07,...,0.000000,0.000000,-0.692200,0.00000,-0.40830,-0.54460,-1.000000,1.231100,0.246200,0.0
2,1.326300e+10,0.953124,7.436000e+09,5.827000e+09,1.061590e+08,3.005000e+09,3.228000e+09,2.599000e+09,9.963788e+07,2.212000e+09,...,-0.012527,0.005444,9.660352,0.11746,1.08525,0.22719,1.457853,0.383984,0.719938,0.0
3,5.453600e+09,0.060000,4.881200e+09,5.724000e+08,0.000000e+00,4.366000e+08,4.706000e+08,1.018000e+08,1.920000e+07,1.260000e+07,...,0.032300,0.031300,0.291700,0.00000,0.67310,0.37820,3.391400,0.000000,0.064600,0.0
4,6.307100e+09,0.021400,5.226700e+09,1.080400e+09,0.000000e+00,7.007000e+08,1.493200e+09,-4.128000e+08,1.268000e+08,-5.246000e+08,...,0.000000,0.000000,-0.231200,-0.06100,-0.11710,-0.11810,-0.087900,0.000000,-0.059200,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1483,7.744800e+07,-0.103800,6.378500e+07,1.366300e+07,0.000000e+00,1.840800e+07,1.997700e+07,-6.314000e+06,9.240000e+05,-7.222000e+06,...,0.000000,0.000000,-0.189100,0.31110,-0.01280,-0.32770,-0.528500,0.000000,-0.225400,
1484,5.631100e+07,-0.016100,2.964900e+07,2.666200e+07,4.303000e+06,1.554500e+07,1.984800e+07,6.814000e+06,3.300000e+04,6.772000e+06,...,0.422900,0.041300,0.034300,-0.08570,0.05950,0.10640,0.000000,-0.027600,0.096700,
1485,3.725006e+06,0.476500,1.917890e+06,1.807116e+06,1.584210e+06,2.144532e+06,3.728742e+06,-1.921626e+06,0.000000e+00,-1.913127e+06,...,-1.000000,0.000000,5.528200,-0.17370,-0.28080,-0.38640,0.000000,0.913300,-0.197000,
1486,1.040000e+08,0.953124,7.700000e+07,2.700000e+07,1.061590e+08,2.500000e+07,2.700000e+07,0.000000e+00,1.000000e+06,-2.000000e+06,...,-0.012527,0.005444,9.660352,0.11746,1.08525,0.22719,1.457853,0.383984,0.719938,


In [None]:
# Now split the train and test again into original form

# Check the lengths of train and X_test
train_length = len(train)
X_test_length = len(X_test)

# Split the combined data back into train and test sets
train = combined_data.iloc[:train_length]
X_test = combined_data.iloc[train_length:train_length + X_test_length]

print("Train set shape:", train.shape)
print("Test set shape:", X_test.shape)

Train set shape: (3459, 222)
Test set shape: (1488, 222)


In [None]:
# Data Imbalance Issue
train['Class'].value_counts()

0.0    2498
1.0     961
Name: Class, dtype: int64

In [None]:
# Separate data into two classes
class_0 = train[train['Class'] == 0]
class_1 = train[train['Class'] == 1]

# Undersample class 0 to match the size of class 1
class_0_sampled = class_0.sample(n=len(class_1), random_state=42)

# Concatenate the sampled class 0 and class 1
train = pd.concat([class_0_sampled, class_1])

In [None]:
train['Class'].value_counts()

0.0    961
1.0    961
Name: Class, dtype: int64

In [None]:
# Check for duplicate rows
duplicate_rows = train[train.duplicated()]
print("Number of duplicate rows:", len(duplicate_rows))

Number of duplicate rows: 160


In [None]:
# Checking which column is category
category_columns = train.select_dtypes(include=['category']).columns
print(category_columns)

Index([], dtype='object')


In [None]:
# Splitting train into X_train & y_train
X_train = train.iloc[:, :-1] # Dropped Class here thats why there is only Class in X_test
y_train = train.iloc[:, -1] # keeping only the target column

In [None]:
X_test.drop(columns=['Class'], inplace=True)

# Split

In [None]:
# Train, Validation, Test
X_new_train, X_val, y_new_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42) # Not sure if we have to use this, so it will depend

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

log = LogisticRegression()
log.fit(X_train, y_train)
y_pred = log.predict(X_test)

samples['Class'] = y_pred
samples_subset = samples[['Name', 'Class']]
samples_subset.to_csv('samplesLogReg.csv', index=False)
# Score: 0.53945

In [None]:
# Validation
log_val = LogisticRegression()
log_val.fit(X_new_train, y_new_train)
y_pred = log_val.predict(X_val)

#from sklearn.metrics import confusion_matrix
#confusion_matrix = confusion_matrix(y_val, y_pred)
#confusion_matrix

#clf.score(X_val, y_val)
#print(classification_report(y_val, y_pred))

roc_auc_score(y_val, y_pred)

0.5

# Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)


samples['Class'] = y_pred
samples_subset = samples[['Name', 'Class']]
samples_subset.to_csv('samplesBayesian.csv', index=False)
# Score: 0.51995

In [None]:
# Validation
from sklearn.naive_bayes import GaussianNB

nb_val = GaussianNB()
nb_val.fit(X_new_train, y_new_train)
y_pred = nb_val.predict(X_val)

roc_auc_score(y_val, y_pred)

0.5187250243006803

# KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Finding the best parameter
knn = KNeighborsClassifier()

#create a dictionary of all values we want to test for n_neighbors
params_knn = {'n_neighbors': np.arange(1, 25)}

#use gridsearch to test all values for n_neighbors
knn_gs = GridSearchCV(knn, params_knn, cv=5)

#fit model to training data
knn_gs.fit(X_train, y_train)

#save best model
knn_best = knn_gs.best_estimator_

#check best n_neigbors value
print(knn_gs.best_params_)


knn = KNeighborsClassifier(n_neighbors = 22)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

# set k from 1 to 3
#or k in range (1,4):
 # knn = KNeighborsClassifier(n_neighbors = k)
  #knn.fit(X_train, y_train)
  #y_pred = knn.predict(X_test)


samples['Class'] = y_pred
samples_subset = samples[['Name', 'Class']]
samples_subset.to_csv('samplesKNN.csv', index=False)
# Score:  0.61734 (neightbor 20가 13보다 높음. 13 = 0.60464)
# neighbor = 22  =  0.63254

{'n_neighbors': 13}


In [None]:
# Validation

# Finding the best parameter
knn_val = KNeighborsClassifier()

#create a dictionary of all values we want to test for n_neighbors
params_knn = {'n_neighbors': np.arange(1, 25)}

#use gridsearch to test all values for n_neighbors
knn_gs = GridSearchCV(knn_val, params_knn, cv=5)

#fit model to training data
knn_gs.fit(X_new_train, y_new_train)

#save best model
knn_best = knn_gs.best_estimator_

#check best n_neigbors value
print(knn_gs.best_params_)

from sklearn.neighbors import KNeighborsClassifier

knn_val = KNeighborsClassifier(n_neighbors = 20)
knn_val.fit(X_new_train, y_new_train)
y_pred = knn_val.predict(X_val)
roc_auc_score(y_val, y_pred)

{'n_neighbors': 3}


0.5730775461712928

In [None]:
# Cross Validation
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
import pandas as pd

# Create your model
model = LinearRegression()

# Define the number of splits for cross-validation
num_splits = 5

# Create a KFold object to define how you want to split your data
kf = KFold(n_splits=num_splits, shuffle=True, random_state=42)

# Perform cross-validation
y_pred_cv = cross_val_predict(model, X_train, y_train, cv=kf)

if len(y_pred_cv) == len(samples):
    samples['Class'] = y_pred_cv
    samples_subset = samples[['Name', 'Class']]
    samples_subset.to_csv('crossVal.csv', index=False)
#Score: 0.46832

# SVM

In [None]:
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline

pca = PCA(n_components=150, whiten=True, random_state=42)
svc = SVC(kernel='rbf', class_weight='balanced')
model = make_pipeline(pca, svc)

param_grid = {'svc__C': [1, 5, 10, 50],
              'svc__gamma': [0.0001, 0.0005, 0.001, 0.005]}
grid = GridSearchCV(model, param_grid)

%time grid.fit(X_train, y_train)
print(grid.best_params_)

svm = grid.best_estimator_
y_pred = svm.predict(X_test)

samples['Class'] = y_pred
samples_subset = samples[['Name', 'Class']]
samples_subset.to_csv('samplesSVM.csv', index=False)
# Score: 0.59172

CPU times: user 53.8 s, sys: 29.5 s, total: 1min 23s
Wall time: 1min 2s
{'svc__C': 50, 'svc__gamma': 0.001}


In [None]:
# Validation
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV


pca = PCA(n_components=150, whiten=True, random_state=42)
svc = SVC(kernel='rbf', class_weight='balanced')
model = make_pipeline(pca, svc)

param_grid = {'svc__C': [1, 5, 10, 50],
              'svc__gamma': [0.0001, 0.0005, 0.001, 0.005]}
grid = GridSearchCV(model, param_grid)

%time grid.fit(X_new_train, y_new_train)
print(grid.best_params_)

svm_val = grid.best_estimator_
yfit = svm_val.predict(X_val)
roc_auc_score(y_val, yfit)

CPU times: user 42.4 s, sys: 28.3 s, total: 1min 10s
Wall time: 46.1 s
{'svc__C': 10, 'svc__gamma': 0.005}


0.6078680203045685

In [None]:
svc = SVC(kernel='rbf', class_weight='balanced')
pca = PCA(whiten=True)
svm_model = SVC(kernel='rbf', class_weight='balanced')
svm_model = make_pipeline(pca, svc)
svm_model.fit(X_train, y_train)
y_pred = svm_model.predict(X_test)

samples['Class'] = y_pred
samples_subset = samples[['Name', 'Class']]
samples_subset.to_csv('samplesSVM_2.csv', index=False)
# Score: 0.59008

In [None]:
# Validation
svc = SVC(kernel='rbf', class_weight='balanced')
pca = PCA(whiten=True)
svm_model_val = SVC(kernel='rbf', class_weight='balanced')
svm_model_val = make_pipeline(pca, svc)
svm_model_val.fit(X_new_train, y_new_train)
y_pred = svm_model_val.predict(X_val)

roc_auc_score(y_val, y_pred)

0.632047197321525

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

dt = tree.DecisionTreeClassifier()
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)

samples['Class'] = y_pred
samples_subset = samples[['Name', 'Class']]
samples_subset.to_csv('samplesDT.csv', index=False)
# Score: 0.54957

In [None]:
# Validation
dt_val = tree.DecisionTreeClassifier()
dt_val.fit(X_new_train, y_new_train)
y_predict = dt_val.predict(X_val)

roc_auc_score(y_val, y_predict)

0.5644913057565613

In [None]:
# Entropy
from sklearn.tree import DecisionTreeClassifier

dt_entropy = DecisionTreeClassifier(criterion  = "entropy", random_state = 0)
dt_entropy.fit(X_train, y_train)
y_pred = dt_entropy.predict(X_test)


samples['Class'] = y_pred
samples_subset = samples[['Name', 'Class']]
samples_subset.to_csv('samplesDTEntropy.csv', index=False)
# Score: 0.56210

In [None]:
# Validation
dt_entropy_val = DecisionTreeClassifier(criterion  = "entropy", random_state = 0)
dt_entropy_val.fit(X_new_train, y_new_train)
y_pred = dt_entropy_val.predict(X_val)

roc_auc_score(y_val, y_pred)

0.6042634193757425

In [None]:
# Gini
dt_gini = DecisionTreeClassifier(criterion  = "gini", random_state = 0)
dt_gini.fit(X_train, y_train)
y_pred = dt_gini.predict(X_test)

samples['Class'] = y_pred
samples_subset = samples[['Name', 'Class']]
samples_subset.to_csv('samplesDTGini.csv', index=False)
# Score: 0.54796

In [None]:
# Validation
dt_gini_val = DecisionTreeClassifier(criterion  = "gini", random_state = 0)
dt_gini_val.fit(X_new_train, y_new_train)
y_pred = dt_gini_val.predict(X_val)

roc_auc_score(y_val, y_pred)

0.5625607517010476

# Random Forest


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Finding the best estimator
rf = RandomForestClassifier()

#create a dictionary of all values we want to test for n_estimators
params_rf = {'n_estimators': [50, 100, 200]}

#use gridsearch to test all values for n_estimators
rf_gs = GridSearchCV(rf, params_rf, cv=5)

#fit model to training data
rf_gs.fit(X_train, y_train)

#save best model
rf_best = rf_gs.best_estimator_

#check best n_estimators value
print(rf_gs.best_params_)

{'n_estimators': 100}


In [None]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators = 50)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

samples['Class'] = y_pred
samples_subset = samples[['Name', 'Class']]
samples_subset.to_csv('samplesRF.csv', index=False)
# Score:  0.65023 (200) /  0.65140 (50)

In [None]:
# Validation
from sklearn.ensemble import RandomForestClassifier

# Finding the best estimator
rf = RandomForestClassifier()

#create a dictionary of all values we want to test for n_estimators
params_rf = {'n_estimators': [50, 100, 200]}

#use gridsearch to test all values for n_estimators
rf_gs = GridSearchCV(rf, params_rf, cv=5)

#fit model to training data
rf_gs.fit(X_new_train, y_new_train)

#save best model
rf_best = rf_gs.best_estimator_

#check best n_estimators value
print(rf_gs.best_params_)


{'n_estimators': 200}


In [None]:
rf_val = RandomForestClassifier(n_estimators = 200)
rf_val.fit(X_new_train, y_new_train)
y_pred = rf_val.predict(X_val)

roc_auc_score(y_val, y_pred)

0.6445080462252942

# Ensemble Learning

In [None]:
from sklearn.ensemble import VotingClassifier

estimators = [('nb', nb), ('Log Regression', log), ('svm', svm), ('rf', rf), ('svm2', svm_model), ('knn', knn), ('dt', dt),
              ('dt_gini', dt_gini), ('dt_entropy', dt_entropy)]
ensemble = VotingClassifier(estimators, voting = 'hard')
ensemble.fit(X_train, y_train)
y_pred = ensemble.predict(X_test)

samples['Class'] = y_pred
samples_subset = samples[['Name', 'Class']]
samples_subset.to_csv('samplesEnsemble.csv', index=False)
# Score: 0.615

In [None]:
# Validation

from sklearn.ensemble import VotingClassifier

estimators = [('nb', nb_val), ('Log Regression', log_val), ('svm', svm_val), ('svm2', svm_model_val), ('knn', knn_val), ('dt', dt_val),
              ('dt_gini', dt_gini_val), ('dt_entropy', dt_entropy_val)]
ensemble = VotingClassifier(estimators, voting = 'hard')
ensemble.fit(X_new_train, y_new_train)
y_pred = ensemble.predict(X_val)

roc_auc_score(y_val, y_pred)

0.5588346473701263

# Gradient Boosting


In [None]:
# Validation
from sklearn.ensemble import GradientBoostingClassifier

# Define the hyperparameters
params = {
    'learning_rate': 0.05,
    'n_estimators': 200,
    'subsample': 0.8,
    'max_depth': 5,
    'min_samples_split': 5,
    'min_samples_leaf': 2,
    'max_features': 'sqrt',
    'random_state': 42
}

# Initialize the GBM classifier with the defined hyperparameters
gbm_classifier = GradientBoostingClassifier(**params)

# Train the classifier
gbm_classifier.fit(X_new_train, y_new_train)

# Predict on the test set
y_pred = gbm_classifier.predict(X_val)

roc_auc_score(y_val, y_pred)

0.6356787990063723

In [None]:

from sklearn.ensemble import GradientBoostingClassifier

# Define the hyperparameters
params = {
    'learning_rate': 0.05,
    'n_estimators': 200,
    'subsample': 0.8,
    'max_depth': 5,
    'min_samples_split': 5,
    'min_samples_leaf': 2,
    'max_features': 'sqrt',
    'random_state': 42
}

# Initialize the GBM classifier with the defined hyperparameters
gbm_classifier = GradientBoostingClassifier(**params)

# Train the classifier
gbm_classifier.fit(X_train, y_train)

# Predict on the test set
y_pred = gbm_classifier.predict(X_test)

# Save the predictions to a CSV file
samples['Class'] = y_pred
samples_subset = samples[['Name', 'Class']]
samples_subset.to_csv('samplesGBM.csv', index=False)


# XGBoost

In [None]:
import xgboost as xgb

# Define the hyperparameters
params = {
    'learning_rate': 0.2,
    'max_depth': 3,
    'min_child_weight': 1,
    'gamma': 0,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'n_estimators': 200
}

# Initialize the XGBoost classifier with the defined hyperparameters
xgb_classifier = xgb.XGBClassifier(**params)

# Train the classifier
xgb_classifier.fit(X_train, y_train)

# Predict on the test set
y_pred = xgb_classifier.predict(X_test)

# Save the predictions to a CSV file
samples['Class'] = y_pred
samples_subset = samples[['Name', 'Class']]
samples_subset.to_csv('samplesXG3.csv', index=False)
# 0.66647 beeeeest

In [None]:
#validation
import xgboost as xgb

# Define the hyperparameters
params = {
    'learning_rate': 0.2,
    'max_depth': 3,
    'min_child_weight': 1,
    'gamma': 0,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'n_estimators': 200
}

# Initialize the XGBoost classifier with the defined hyperparameters
xgb_classifier = xgb.XGBClassifier(**params)

# Train the classifier
xgb_classifier.fit(X_new_train, y_new_train)

# Predict on the test set
y_pred = xgb_classifier.predict(X_val)

roc_auc_score(y_val, y_pred)

0.6156172372826438

In [None]:
import xgboost as xgb

# Define the hyperparameters
params = {
    'learning_rate': 0.2,
    'max_depth': 5,
    'min_child_weight': 3,
    'gamma': 0,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'n_estimators': 200
}

# Initialize the XGBoost classifier with the defined hyperparameters
xgb_classifier2 = xgb.XGBClassifier(**params)

# Train the classifier
xgb_classifier2.fit(X_train, y_train)

# Predict on the test set
y_pred = xgb_classifier2.predict(X_test)

# Save the predictions to a CSV file
samples['Class'] = y_pred
samples_subset = samples[['Name', 'Class']]
samples_subset.to_csv('samplesXG6.csv', index=False)
# 0.65989

In [None]:
# validation 2
import xgboost as xgb

# Define the hyperparameters
params = {
    'learning_rate': 0.2,
    'max_depth': 5,
    'min_child_weight': 3,
    'gamma': 0,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'n_estimators': 200
}

# Initialize the XGBoost classifier with the defined hyperparameters
xgb_classifier2 = xgb.XGBClassifier(**params)

# Train the classifier
xgb_classifier2.fit(X_new_train, y_new_train)

# Predict on the test set
y_pred = xgb_classifier2.predict(X_val)


roc_auc_score(y_val, y_pred)

0.6292796198293552