In [7]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier

train_data = pd.read_csv('https://raw.githubusercontent.com/thanh-nghi/Gradient-Boosting-/refs/heads/main/train.csv')
test_data = pd.read_csv('https://raw.githubusercontent.com/thanh-nghi/Gradient-Boosting-/refs/heads/main/test.csv')

In [8]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [9]:
y_train = train_data['Survived']
train_data_features = train_data.drop(labels='Survived', axis=1)

full_data = pd.concat([train_data_features, test_data], ignore_index=True)

drop_columns = ['Name', 'Age', 'SibSp', 'Ticket', 'Cabin', 'Parch', 'Embarked']
full_data.drop(labels=drop_columns, axis=1, inplace=True)

full_data = pd.get_dummies(full_data, columns=['Sex'])
full_data.fillna(value=0.0, inplace=True)

X_train = full_data.values[0:891]
X_test = full_data.values[891:]

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.3, random_state=12)

In [11]:
lr_list = [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]

for learning_rate in lr_list:
  gb_clf = GradientBoostingClassifier(n_estimators=20, learning_rate=learning_rate, max_features=2, max_depth=2, random_state=0)
  gb_clf.fit(X_train, y_train)

  print("Learning rate: ", learning_rate)
  print("Accuracy score (training): {0:.3f}".format(gb_clf.score(X_train, y_train)))
  print("Accuracy score (validation): {0:.3f}".format(gb_clf.score(X_val, y_val)))


Learning rate:  0.05
Accuracy score (training): 0.801
Accuracy score (validation): 0.731
Learning rate:  0.075
Accuracy score (training): 0.814
Accuracy score (validation): 0.731
Learning rate:  0.1
Accuracy score (training): 0.812
Accuracy score (validation): 0.724
Learning rate:  0.25
Accuracy score (training): 0.835
Accuracy score (validation): 0.750
Learning rate:  0.5
Accuracy score (training): 0.864
Accuracy score (validation): 0.772
Learning rate:  0.75
Accuracy score (training): 0.875
Accuracy score (validation): 0.754
Learning rate:  1
Accuracy score (training): 0.875
Accuracy score (validation): 0.739


In [12]:
#use grb model where learning_rate=0.5 to generate predictions
gb_clf2 = GradientBoostingClassifier(n_estimators=20, learning_rate=0.5, max_features=2, max_depth=2, random_state=0)
gb_clf2.fit(X_train, y_train)
predictions = gb_clf2.predict(X_val)

print("Confusion Matrix:")
print(confusion_matrix(y_val, predictions))

print("Classification Report")
print(classification_report(y_val, predictions))

Confusion Matrix:
[[142  19]
 [ 42  65]]
Classification Report
              precision    recall  f1-score   support

           0       0.77      0.88      0.82       161
           1       0.77      0.61      0.68       107

    accuracy                           0.77       268
   macro avg       0.77      0.74      0.75       268
weighted avg       0.77      0.77      0.77       268



In [13]:
!pip install xgboost



In [14]:
from xgboost import XGBClassifier

xgb_clf = XGBClassifier()
xgb_clf.fit(X_train, y_train)

score = xgb_clf.score(X_val, y_val)
print(score)

0.7313432835820896


In [17]:
from numpy import loadtxt
from sklearn.model_selection import KFold, cross_val_score

dataset = loadtxt('https://raw.githubusercontent.com/thanh-nghi/Gradient-Boosting-/refs/heads/main/diabetes.csv', delimiter=',')
X = dataset[:, 0:8]
y = dataset[:,8]

#CV model
model =  XGBClassifier()
kfold = KFold(n_splits=10, random_state=7, shuffle=True)
results = cross_val_score(model, X, y, cv=kfold)
print('Accuracy: %.2f%% (%.2f%%)' % (results.mean()*100, results.std()))

Accuracy: 73.97% (0.06%)


In [19]:
from sklearn.model_selection import StratifiedKFold
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

dataset = loadtxt('https://raw.githubusercontent.com/thanh-nghi/Gradient-Boosting-/refs/heads/main/diabetes.csv', delimiter=',')

X = dataset[:,0:8]
y = dataset[:,8]

model = XGBClassifier()
kfold = StratifiedKFold(n_splits=10, random_state=7, shuffle=True)
results = cross_val_score(model, X, y, cv=kfold)
print('Accuracy: %.2f%% (%.2f%%)' % (results.mean()*100, results.std()*100))

Accuracy: 73.43% (4.47%)
