In [None]:
import pandas as pd
import math
import numpy as np
from scipy.stats import randint
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV


In [None]:
# Load your dataset
# Assuming your dataset is named 'df'
# Replace 'your_data.csv' with the actual file path if your data is stored in a CSV file
# Upload your dataset in Google Colab
from google.colab import files

uploaded = files.upload()

# Load the dataset
file_name = next(iter(uploaded))
df = pd.read_csv(file_name)

Saving Life Expectancy Data.csv to Life Expectancy Data (1).csv


In [None]:
# Data Preparation

# 1. Dealing with duplicates
df = df.drop_duplicates().copy()

# 4. Encoding of categorical variables
# Assuming 'Country' and 'Status' are categorical variables
label_encoder = LabelEncoder()
df['Country'] = label_encoder.fit_transform(df['Country'])
df['Status'] = label_encoder.fit_transform(df['Status'])

# 3. Dealing with missing values
# Identify numeric and non-numeric columns
numeric_cols = df.select_dtypes(include=['float64']).columns
non_numeric_cols = df.select_dtypes(exclude=['float64']).columns

# Impute missing values using mean strategy for numeric columns
imputer_numeric = SimpleImputer(strategy='mean')
df.loc[:, numeric_cols] = imputer_numeric.fit_transform(df.loc[:, numeric_cols])

# 2. Outlier Handling
# You can use various methods to handle outliers, like Z-score, IQR, or a robust scaler
# Here, we use Z-score to identify and remove outliers
z_scores = (df[numeric_cols] - df[numeric_cols].mean()) / df[numeric_cols].std()
df = df[(z_scores < 3).all(axis=1)].copy()

# Assuming 'Life expectancy' is the column you want to convert
median_life_expectancy = df['Life expectancy '].median()

# Create a new binary column 'Above_Median_Life_Expectancy'
df['Above_Median_Life_Expectancy'] = np.where(df['Life expectancy '] > median_life_expectancy, 1, 0)

# Drop the original 'Life expectancy' column
df = df.drop('Life expectancy ', axis=1)

# 5. Feature Selection
X = df.drop('Above_Median_Life_Expectancy', axis=1)
y = df['Above_Median_Life_Expectancy']

# Using GradientBoostingClassifier for feature selection
gb = GradientBoostingClassifier()
gb.fit(X, y)

# Selecting features based on importance
sfm = SelectFromModel(gb, threshold=0.1)
sfm.fit(X, y)
X_selected = sfm.transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

# Assuming y_train is a NumPy array or can be converted to one
y_train = np.array(y_train)

# Check unique values in y_train
unique_values = np.unique(y_train)

# If there are more than two unique values, you might need to adjust your data
if len(unique_values) > 2:
    print("Warning: y_train has more than two unique values. Check your data.")

# If there are exactly two unique values, they are likely binary
elif len(unique_values) == 2:
    print("y_train is binary.")

# If there is only one unique value, it might not be suitable for binary classification
else:
    print("Warning: y_train has only one unique value. Check your data.")



y_train is binary.


In [None]:
# Define the hyperparameter grid
param_grid = {
    'n_estimators': [55, 60 ,65, 70, 75],
    'learning_rate': [0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5],
    'max_depth': [1, 2, 3, 4 , 5, 6 , 7, 8, 9, 10],
    'min_samples_split': [10, 12, 14, 16, 18, 20],
    'min_samples_leaf': [1, 2, 3, 4 ,5 ,6, 7, 8, 9, 10],
    'subsample': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
    'max_features': ['sqrt', 'log2', None]
}

# Instantiate the GradientBoostingClassifier
gb_model = GradientBoostingClassifier()

# Instantiate RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=gb_model, param_distributions=param_grid, scoring='accuracy', cv=5, n_iter=10, random_state=42)

# Fit the RandomizedSearchCV
random_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = random_search.best_params_
print("Best Hyperparameters:", best_params)

# Get the best model
best_model = random_search.best_estimator_

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Calculate the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Extract TP, TN, FP, FN values
tn, fp, fn, tp = conf_matrix.ravel()
print("True Positive (TP):", tp)
print("True Negative (TN):", tn)
print("False Positive (FP):", fp)
print("False Negative (FN):", fn)
# Print evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, best_model.predict_proba(X_test)[:, 1])
conf_matrix = confusion_matrix(y_test, y_pred)

print("Accuracy: {:.2f}".format(accuracy))
print("Precision: {:.2f}".format(precision))
print("Recall: {:.2f}".format(recall))
print("F1 Score: {:.2f}".format(f1))
print("ROC-AUC Score: {:.2f}".format(roc_auc))
print("Confusion Matrix:\n", conf_matrix)

Best Hyperparameters: {'subsample': 0.9, 'n_estimators': 60, 'min_samples_split': 16, 'min_samples_leaf': 8, 'max_features': 'sqrt', 'max_depth': 3, 'learning_rate': 0.15}
True Positive (TP): 232
True Negative (TN): 256
False Positive (FP): 20
False Negative (FN): 21
Accuracy: 0.92
Precision: 0.92
Recall: 0.92
F1 Score: 0.92
ROC-AUC Score: 0.98
Confusion Matrix:
 [[256  20]
 [ 21 232]]
