In [41]:
import pandas as pd
from sklearn.datasets import load_diabetes
import seaborn as sns

df = sns.load_dataset("titanic")
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [137]:
from sklearn.model_selection import train_test_split
X=df.drop("alive",axis=1)
y=df["alive"]
# Assuming you have your data in X (features) and y (target variable)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# X_train, X_test contain the training and testing features respectively
# y_train, y_test contain the training and testing target variables respectively


## Data Cleaning


In [None]:
def remove_characters_from_column(df, column_name, characters_to_remove):
    # Iterate through the characters to remove and replace them with an empty string
    for char in characters_to_remove:
        df[column_name] = df[column_name].str.replace(char, '')

## Handle Missing Values

### Mean Imputer

In [None]:
def fill_with_mean(df,column):
  mean = df[column].mean()
  df[column] =df[column].fillna(mean)

### Median Imputer

In [None]:
def fill_with_median(df,column):
  median = df[column].median()
  df[column] =df[column].fillna(median)

### Mode Imputer

In [None]:
def fill_with_mode(df,column):
  mode = df[column].mode()
  df[column] =df[column].fillna(mode)

In [None]:
import random
def fill_null_with_sampling(data, column_name):
  # Get category counts
  category_counts = data[column_name].value_counts().to_dict()
  total_count = len(data)

  # Iterate through null values
  for index, row in data.iterrows():
    if pd.isna(row[column_name]):
      # Sample a category based on its probability (count / total)
      sampled_category = random.choices(list(category_counts.keys()), weights=list(category_counts.values()))[0]
      data.loc[index, column_name] = sampled_category

  return data

### Iterative Imputer

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import IterativeImputer

def get_iterative_imputer(X_train,X_test):
  imputer = IterativeImputer(estimator=RandomForestRegressor(n_estimators=10, random_state=0), max_iter=10, random_state=0)
  X_train_imputed = imputer.fit_transform(X_train)
  X_test_imputed = imputer.transform(X_test)
  return X_train_imputed,X_test_imputed

### KNN-Imputer

In [None]:
from sklearn.impute import KNNImputer

def knn_imputer(X_train,X_test,y_test, model):
  n_value=-1
  for i in range(10):
    knn = KNNImputer(n_neighbors=i)
    X_train_trf = knn.fit_transform(X_train)
    X_test_trf = knn.transform(X_test)
    y_pred=model.predict(X_test_trf)
    t_score=accuracy_score(y_test, y_pred)
  if t_score>score:
    n_value=i
  knn = KNNImputer(n_neighbors=n_value)
  X_train_trf = knn.fit_transform(X_train)
  X_test_trf = knn.transform(X_test)
  return X_train_trf,X_test_trf

## Variance & Bias Tradeoff

In [None]:
from mlxtend.evaluate import bias_variance_decomp
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from mlxtend.data import boston_housing_data
from sklearn.model_selection import train_test_split

In [None]:
def get_bias_variance_tradeoff(model, X_train, y_train, X_test, y_test,alphas=[1,2,3,4,5]):

  loss = []
  bias = []
  variance = []

  for i in alphas:
      reg = model(alpha=i)
      avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
          model, X_train, y_train, X_test, y_test,
          loss='mse',
          random_seed=123)
      loss.append(avg_expected_loss)
      bias.append(avg_bias)
      variance.append(avg_var)
  plt.plot(alphas,loss,label='loss')
  plt.plot(alphas,bias,label='Bias')
  plt.plot(alphas,variance,label='Variance')
  plt.ylim(0,5)
  plt.xlabel('Alpha')
  plt.legend()

# Feature Selection

## wrapper-method

### ExhaustiveFeatureSelector

In [None]:
!pip install --upgrade scikit-learn mlxtend

In [None]:
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import cross_val_score
import pandas as pd
from sklearn.model_selection import cross_val_score
def ExhaustiveFeatureSelector_classification(X,y,max_features=4,cv=5):
  lr = LogisticRegression()
  sel = EFS(lr, max_features=max_features, scoring='accuracy', cv=cv)
  model = sel.fit(X,y)

  print("best_score ".model.best_score_)
  print("best_feature_names",model.best_feature_names_)

In [None]:
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import cross_val_score
import pandas as pd
from sklearn.model_selection import cross_val_score
def ExhaustiveFeatureSelector_regression(X,y,max_features=4,cv=5):
  lr = LinearRegression()
  sel = EFS(lr, max_features=max_features, scoring='r2', cv=cv)
  model = sel.fit(X,y)

  print("best_score ".model.best_score_)
  print("best_feature_names",model.best_feature_names_)

### Sequential Backward Elimination

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

In [None]:
def squuential_backward_elimination(X,y):
  lr = LinearRegression()
  sfs = SFS(lr, k_features='best', forward=False, floating=False, scoring='r2',cv=5)

  sfs.fit(X, y)
  print("K-best features ",sfs.k_feature_idx_)
  X_train_sel = sfs.transform(X)
  return X_train_sel

### Sequential Forward Elimination

In [None]:
def squuential_backward_elimination(X,y):
  lr = LinearRegression()
  sfs = SFS(lr, k_features='best', forward=True, floating=False, scoring='r2',cv=5)

  sfs.fit(X, y)
  print("K-best features ",sfs.k_feature_idx_)
  X_train_sel = sfs.transform(X)
  return X_train_sel

### Mututal Info

In [None]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.datasets import load_iris

def get_mutual_info_classification(df,X,y,k=2):
# Create SelectKBest feature selector
  selector = SelectKBest(mutual_info_classif, k=k)

  # Fit and transform
  X_new = selector.fit_transform(X, y)

  # Get columns to keep and create new dataframe with those only
  cols = selector.get_support(indices=True)

  print(df.feature_names)
  print(cols)


In [None]:
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.datasets import load_iris

def get_mutual_info_classification(df,X,y,k=2):
# Create SelectKBest feature selector
  selector = SelectKBest(mutual_info_regression, k=k)

  # Fit and transform
  X_new = selector.fit_transform(X, y)

  # Get columns to keep and create new dataframe with those only
  cols = selector.get_support(indices=True)

  print(df.feature_names)
  print(cols)


## Filter Methods

### Removing Duplicate Columns

In [None]:
def get_duplicate_columns(df):

    duplicate_columns = {}
    seen_columns = {}

    for column in df.columns:
        current_column = df[column]

        # Convert column data to bytes
        try:
            current_column_hash = current_column.values.tobytes()
        except AttributeError:
            current_column_hash = current_column.to_string().encode()

        if current_column_hash in seen_columns:
            if seen_columns[current_column_hash] in duplicate_columns:
                duplicate_columns[seen_columns[current_column_hash]].append(column)
            else:
                duplicate_columns[seen_columns[current_column_hash]] = [column]
        else:
            seen_columns[current_column_hash] = column
    for one_list in duplicate_columns.values():
        df.drop(columns=one_list,inplace=True)

    return duplicate_columns

### Variance Threshold

In [None]:
from sklearn.feature_selection import VarianceThreshold
def get_variance_threshold(X,threshold=0.05):
  sel = VarianceThreshold(threshold=threshold)
  sel.fit(X)
  X = sel.transform(X)
  columns = X.columns[sel.get_support()]
  print(columns)
  return X

### Correlation

In [None]:
import seaborn as sns
def  get_correlation(X,threshold=0.5):
  corr_matrix = X.corr()
  columns = corr_matrix.columns

  # Create an empty list to keep track of columns to drop
  columns_to_drop = []
  for i in range(len(columns)):
    for j in range(i + 1, len(columns)):
        # Access the cell of the DataFrame
        if corr_matrix.loc[columns[i], columns[j]] > threshold:
            columns_to_drop.append(columns[j])
  columns_to_drop = set(columns_to_drop)
  X.drop(columns = columns_to_drop, axis = 1, inplace=True)
  return X

### ANOVA

In [None]:
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import SelectKBest
import pandas as pd
def anova_classification(X,y,k=100):
  sel = SelectKBest(f_classif, k).fit(X, y)

# display selected feature names
  print(X.columns[sel.get_support()])
  columns = X.columns[sel.get_support()]
  X = sel.transform(X)

  x = pd.DataFrame(X, columns=columns)
  return x

In [None]:
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import SelectKBest
import pandas as pd
def anova_classification(X,y,k=100):
  sel = SelectKBest(f_regression,k).fit(X, y)

# display selected feature names
  print(X.columns[sel.get_support()])
  columns = X.columns[sel.get_support()]
  X = sel.transform(X)

  x = pd.DataFrame(X, columns=columns)
  return x

### Chi-Square

In [None]:
from scipy.stats import chi2_contingency
import pandas as pd
score = []
def get_chi_square(X,target):
  for feature in X.columns[:-1]:

      # create contingency table
      ct = pd.crosstab(X[target], X[feature])

      # chi_test
      p_value = chi2_contingency(ct)[1]
      score.append(p_value)
      pd.Series(score, index=X.columns[:-1]).sort_values(ascending=True).plot(kind='bar')


In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import chi2
import matplotlib.pyplot as plt

# assuming titanic is your DataFrame and 'Survived' is the target column
def get_chi_square(X,target):
# Encode categorical variables
  le = LabelEncoder()
  X_encoded = X.apply(le.fit_transform)

  X = X_encoded.drop(target, axis=1)
  y = X_encoded[target]

  # Calculate chi-squared stats
  chi_scores = chi2(X, y)

  # chi_scores[1] are the p-values of each feature.
  p_values = pd.Series(chi_scores[1], index = X.columns)
  p_values.sort_values(inplace = True)

  # Plotting the p-values
  p_values.plot.bar()

  plt.title('Chi-square test - P-values')
  plt.xlabel('Feature')
  plt.ylabel('P-value')


## Embedded Methods

### Reverse Feature Elimination

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Initialize RandomForestClassifier
def get_RFE(X,y):
  model = RandomForestClassifier()

  # Initialize RFE
  rfe = RFE(estimator=model, n_features_to_select=1)

  # Fit RFE
  rfe.fit(X, y)

  # Print the ranking
  ranking = rfe.ranking_
  print("Feature ranking:")

  for i, feature in enumerate(X.columns):
      print(f"{feature}: {ranking[i]}")


### LASSO

In [None]:
from sklearn.linear_model import Lasso
import matplotlib.pyplot as plt
import numpy as np
def get_features_lasso(X,y,alpha=0.01):

  lasso = Lasso(alpha=alpha)
  cols = X.columns
  lasso.fit(X, y)
  x = pd.Series(np.abs(lasso.coef_),index=cols)

  x.sort_values(ascending=False).plot(kind='bar')


# Dimensionality Reduction

In [None]:
from sklearn.manifold import TSNE
def get_tsne(n_iter_values,perplexity_values):
  fig, axes = plt.subplots(2, 3, figsize=(15, 10))
  axes = axes.flatten()

  for n_iter,perplexity in zip(n_iter_values,perplexity_values):
    tsne = TSNE(n_components=2, perplexity=perplexity, n_iter=n_iter, random_state=42)
    X_tsne = tsne.fit_transform(X)

    ax = axes[i]
    scatter = ax.scatter(X_tsne[:, 0], X_tsne[:, 1], c=color, cmap='jet', s=10)
    ax.set_title(f't-SNE with n_iter = {n_iter}')
    ax.axis('off')

# Remove the empty subplot (if any)
    for i in range(len(n_iter_values), len(axes)):
        fig.delaxes(axes[i])

    plt.tight_layout()
    plt.show()

SyntaxError: incomplete input (<ipython-input-1-b91bc9a178f0>, line 1)

# Preprocessing

### Standard_Scaler

In [15]:
# pip install target-encoding
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder # future work
from sklearn.preprocessing import OneHotEncoder
!pip install category_encoders
import category_encoders as ce
from category_encoders.target_encoder import TargetEncoder
def scale_and_split(X_train,X_test, features_to_scale,  features_to_convert,scale="StandardScaler",encoder="OneHotEncoder",categories=None,y_test=None, y_train=None):
    encode=None
    scaler=None
    if scale == "StandardScaler":
      scaler = StandardScaler()
      X_train[features_to_scale] = scaler.fit_transform(X_train[features_to_scale])
      X_test[features_to_scale] = scaler.transform(X_test[features_to_scale])
    else:
      scaler=MinMaxScaler()
      X_train[features_to_scale] = scaler.fit_transform(X_train[features_to_scale])
      X_test[features_to_scale] = scaler.transform(X_test[features_to_scale])
      print(X_test)

    if encoder=="OrdinalEncoder":
      encode= OrdinalEncoder(categories,
                    handle_unknown='use_encoded_value',unknown_value=-1,
                    max_categories=6,encoded_missing_value=-1)

      cat_train = encode.fit_transform(X_train[features_to_convert])

      cat_test = encode.transform(X_test[features_to_convert])

    elif encoder=="OneHotEncoder":

      ohe = OneHotEncoder(drop='first', sparse_output=False,handle_unknown='ignore', max_categories=10)
      cat_train = ohe.fit_transform(X_train[features_to_convert])
      cat_test = ohe.transform(X_test[features_to_convert])
    # Combine scaled numerical features and encoded categorical features
    elif encoder=='Weight of Evidence':
      encoder = ce.WOEEncoder(cols=features_to_convert,
                    handle_unknown='use_encoded_value',unknown_value=-1,
                    max_categories=6,encoded_missing_value=-1)

      cat_train  = encoder.fit_transform(X_train, y_train)
      cat_test  = encoder.fit_transform(X_test, y_test)
    elif encoder=="Target Encoder":
      encoder = TargetEncoder(smooth=0.0)

      cat_train  = encoder.fit_transform(X_train, y_train)
      cat_test  = encoder.fit_transform(X_test, y_test)

    elif encoder=="Binary Encoder":
      ce.BinaryEncoder(cols=features_to_convert, return_df=True,
                  handle_unknown='use_encoded_value',unknown_value=-1)

      cat_train  = encoder.fit_transform(X_train, y_train)
      cat_test  = encoder.fit_transform(X_test, y_test)
    X_train = pd.concat([X_train[~X_train.columns.isin(features_to_convert)], cat_train], axis=1)
    X_test = pd.concat([X_test[~X_test.columns.isin(features_to_convert)], cat_test], axis=1)
    return X_train, X_test



In [4]:
df1=df.copy()

In [6]:
df1.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [16]:
X_train, X_test=scale_and_split(X_train,X_test, features_to_scale=["fare","age"],  features_to_convert=["adult_male"])

ValueError: Item wrong length 14 instead of 712.

## Binning

In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA, KernelPCA
from sklearn.datasets import make_circles

# Generate the dataset
def get_kernel_pca(X)

# Apply PCA
  pca = PCA()
  X_pca = pca.fit_transform(X)

# Apply Kernel PCA
  kpca = KernelPCA(kernel="rbf", gamma=10)
  X_kpca = kpca.fit_transform(X)

  # Original data plot
  plt.figure(figsize=(16, 4))
  plt.subplot(1, 4, 1)
  plt.scatter(X[:, 0], X[:, 1], c=y)
  plt.title('Original data')

  # Transformed data with PCA in 1D
  plt.subplot(1, 4, 2)
  plt.scatter(X_pca[:, 0], np.zeros((400,)), c=y)
  plt.title('Data after PCA in 1D')

  # Transformed data with Kernel PCA in 2D
  plt.subplot(1, 4, 3)
  plt.scatter(X_kpca[:, 0], X_kpca[:, 1], c=y)
  plt.title('Data after Kernel PCA in 2D')

  # Transformed data with Kernel PCA in 1D
  plt.subplot(1, 4, 4)
  plt.scatter(X_kpca[:, 1], np.zeros((400,)), c=y)
  plt.title('Data after Kernel PCA in 1D')

  plt.tight_layout()
  plt.show()


# HyperParameter Tuning

### GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score,KFold
def get_best_parameter_GridSearch_CV(grid,MODEL,X,y):
  model=MODEL
  kfold = KFold(n_splits=5, shuffle=True, random_state=1)
  scores = cross_val_score(model, X, y, cv=kfold, scoring='r2')
  gcv = GridSearchCV(model, grid, scoring='r2', refit=True, cv=kfold, verbose=2)
  gcv.fit(X,y)
  return gcv.best_params_

### RandomSearchCV

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score,KFold
def get_best_parameter_GridSearch_CV(grid,MODEL,X,y):
  model=MODEL
  kfold = KFold(n_splits=5, shuffle=True, random_state=1)
  scores = cross_val_score(model, X, y, cv=kfold, scoring='r2')
  rcv = RandomizedSearchCV(model, grid, scoring='r2', refit=True, cv=kfold, verbose=2)
  rcv.fit(X,y)
  return rcv.best_params_


# Machine Learning


### Logsitic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
def get_binary_logistic_regression(X,y):
  clf = LogisticRegression()
  clf.fit(X,y)
  return clf
  # clf = LogisticRegression(multi_class='multinomial')

In [None]:
from sklearn.linear_model import LogisticRegression
def multinomial_logistic_regression(multi_class_type='multinomial',X,y):
  clf = LogisticRegression(multi_class=multi_class_type)
  clf.fit(X,y)
  return clf
  # clf = LogisticRegression(multi_class='multinomial')

## K-NN Neighbours

In [None]:
def get_k_value(X_train,y_train,X_test,y_test):
  scores = []
  for i in range(1,16):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train,y_train)
    y_pred = knn.predict(X_test)
    scores.append(accuracy_score(y_test, y_pred))
  plt.plot(range(1,16),scores)
  plt.show()


## Cross Validation

### Leave One Out Cross Validation (LOOCV)

In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import LeaveOneOut, cross_val_score
def get_LOOCV(MODEL,X,y) :
  model = MODEL()

  # Create a LeaveOneOut cross-validator
  loo = LeaveOneOut()

  # Use cross_val_score for the dataset with the model and LOOCV
  # This will return the scores for each iteration of LOOCV
  scores = cross_val_score(model, X, y, cv=loo, scoring='neg_mean_squared_error')

  mse_scores = -scores  # Invert the sign of the scores

  # Print the mean MSE over all LOOCV iterations
  print("Mean MSE:", mse_scores.mean())

### K-Fold validation

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
import pandas as pd

# Load the Boston Housing dataset
def get_kfold_model(MODEL,X,y,k):

  # Initialize a Linear Regression model
  model = MODEL()

  # Initialize the KFold parameters
  kfold = KFold(n_splits=k, shuffle=True, random_state=42)

  # Use cross_val_score on the model and dataset
  scores = cross_val_score(model, X, y, cv=kfold, scoring='r2')

  print("R2 scores for each fold:", scores)
  print("Mean R2 score across all folds:", scores.mean())


### Stratified K-Folds

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
# Load the Boston Housing dataset
def get_stratified_kfold_model(MODEL,X,y,k):

  # Initialize a Linear Regression model
  model = MODEL()

  # Initialize the KFold parameters
  skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

  # Use cross_val_score on the model and dataset
  scores = cross_val_score(model, X, y, cv=skf, scoring='r2')

  print("R2 scores for each fold:", scores)
  print("Mean R2 score across all folds:", scores.mean())

## Decision Tree

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import numpy as np
from sklearn.tree import export_text
def get_decision_tree(X_train,y_train,max_depth,min_samples_split):
  clf = DecisionTreeClassifier(max_depth=max_depth,min_samples_split=min_samples_split)
  clf.fit(X_train, y_train)
  return clf

In [None]:
def get_decision_tree_visualize(dataset):
  plt.figure(figsize=(12, 8))
  plot_tree(clf, filled=True, feature_names=daraset.feature_names[:2], class_names=dataset.target_names)
  plt.show()


## XgBoost

In [None]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor  # For regression task, use XGBClassifier for classification

def tune_xgboost(X_train, y_train, base_params, param_grid, cv=5):
    """
    Tunes XGBoost hyperparameters using GridSearchCV.

    Args:
        X_train (pandas.DataFrame): Training data features.
        y_train (pandas.Series): Training data target variable.
        base_params (dict): Base hyperparameters for XGBoost model.
        param_grid (dict): Grid of hyperparameters to be tuned.
        cv (int, optional): Number of folds for cross-validation. Defaults to 5.

    Returns:
        dict: Best hyperparameters found by GridSearchCV.
    """

    # Create the XGBoost model
    model = XGBRegressor(**base_params)

    # Update param_grid to include lambda and gamma
    param_grid.update({
        "reg_lambda": [0.1, 1.0, 10.0],  # L1 regularization parameter
        "gamma": [0.01, 0.1, 1.0]       # Learning rate shrinkage
    })

    # Create the GridSearchCV object
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=cv)

    # Fit the grid search to training data
    grid_search.fit(X_train, y_train)

    # Get the best model and best parameters
    best_params = grid_search.best_params_

    # Print the best score (can be accessed from grid_search.best_score_)
    print("Best Score:", grid_search.best_score_)

    return best_params

# Example usage
base_params = {
    "learning_rate": 0.1,
    "n_estimators": 100,
    "objective": "reg:squarederror"  # Change to "binary:logistic" for classification
}

param_grid = {
    "max_depth": [3, 5, 8],
    "min_child_weight": [1, 3, 5],
    "subsample": [0.8, 1.0],
    "colsample_bytree": [0.7, 1.0]
}

best_params = tune_xgboost(X_train, y_train, base_params, param_grid)

print("Best Parameters:", best_params)


In [None]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

def tune_xgboost(X_train, y_train, base_params, param_grid, cv=5):
    """
    Tunes XGBoost hyperparameters using GridSearchCV for classification.

    Args:
        X_train (pandas.DataFrame): Training data features.
        y_train (pandas.Series): Training data target variable.
        base_params (dict): Base hyperparameters for XGBoost model.
        param_grid (dict): Grid of hyperparameters to be tuned.
        cv (int, optional): Number of folds for cross-validation. Defaults to 5.

    Returns:
        dict: Best hyperparameters found by GridSearchCV.
    """

    # Create the XGBoost model for classification
    model = XGBClassifier(**base_params)

    # Update objective for classification (change accordingly for other tasks)
    base_params["objective"] = "binary:logistic"  # Example for binary classification

    # Update param_grid to include relevant parameters for classification
    param_grid.update({
        "max_depth": [3, 5, 8],
        "min_child_weight": [1, 3, 5],
        "subsample": [0.8, 1.0],
        "colsample_bytree": [0.7, 1.0],
        "reg_lambda": [0.1, 1.0, 10.0],  # L1 regularization parameter
        "gamma": [0.01, 0.1, 1.0]       # Learning rate shrinkage
    })

    # Create the GridSearchCV object
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=cv)

    # Fit the grid search to training data
    grid_search.fit(X_train, y_train)

    # Get the best model and best parameters
    best_params = grid_search.best_params_

    # Print the best score (can be accessed from grid_search.best_score_)
    print("Best Score:", grid_search.best_score_)

    return best_params

### CatBoost

In [51]:
!pip install catboost
!pip install ipywidgets
!pip install shap
!pip install sklearn
# !jupyter nbextension enable --py widgetsnbextension

Collecting sklearn
  Using cached sklearn-0.0.post12.tar.gz (2.6 kB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.


In [53]:
!python --version

Python 3.10.12


In [18]:
import os
import pandas as pd
import numpy as np

import catboost
print(catboost.__version__)

1.2.5


In [36]:
categorical_columns = df.select_dtypes(include=['object', 'category']).columns
categorical_column_numbers = [int(df.columns.get_loc(col)) for col in categorical_columns]
print(categorical_column_numbers)

[2, 7, 8, 9, 11, 12, 13]


In [61]:
X_train.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alone
331,0,1,male,45.5,0,0,28.5,S,First,man,True,C,Southampton,True
733,0,2,male,23.0,0,0,13.0,S,Second,man,True,,Southampton,True
382,0,3,male,32.0,0,0,7.925,S,Third,man,True,,Southampton,True
704,0,3,male,26.0,1,0,7.8542,S,Third,man,True,,Southampton,False
813,0,3,female,6.0,4,2,31.275,S,Third,child,False,,Southampton,False


In [73]:
X_train1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 712 entries, 331 to 102
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     712 non-null    int64   
 1   pclass       712 non-null    int64   
 2   sex          712 non-null    object  
 3   age          572 non-null    float64 
 4   sibsp        712 non-null    int64   
 5   parch        712 non-null    int64   
 6   fare         712 non-null    float64 
 7   class        712 non-null    category
 8   who          712 non-null    object  
 9   adult_male   712 non-null    bool    
 10  embark_town  710 non-null    object  
 11  alone        712 non-null    bool    
dtypes: bool(2), category(1), float64(2), int64(4), object(3)
memory usage: 57.8+ KB


In [94]:
X_test1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 179 entries, 709 to 10
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     179 non-null    int64   
 1   pclass       179 non-null    int64   
 2   sex          179 non-null    object  
 3   age          142 non-null    float64 
 4   sibsp        179 non-null    int64   
 5   parch        179 non-null    int64   
 6   fare         179 non-null    float64 
 7   class        179 non-null    category
 8   who          179 non-null    object  
 9   adult_male   179 non-null    bool    
 10  embark_town  179 non-null    object  
 11  alone        179 non-null    bool    
dtypes: bool(2), category(1), float64(2), int64(4), object(3)
memory usage: 14.6+ KB


In [65]:
X_train1=X_train.drop("embarked",axis=1)

In [74]:
X_train1=X_train1.drop("embark_town",axis=1)

In [114]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 712 entries, 140 to 684
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     712 non-null    int64   
 1   pclass       712 non-null    int64   
 2   sex          712 non-null    object  
 3   age          571 non-null    float64 
 4   sibsp        712 non-null    int64   
 5   parch        712 non-null    int64   
 6   fare         712 non-null    float64 
 7   embarked     710 non-null    object  
 8   class        712 non-null    category
 9   who          712 non-null    object  
 10  adult_male   712 non-null    bool    
 11  deck         162 non-null    category
 12  embark_town  710 non-null    object  
 13  alone        712 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(4)
memory usage: 64.4+ KB


In [139]:
X_train2=X_train.copy()
df_object_columns = X_train.select_dtypes(['object'])
X_train2[df_object_columns.columns] = df_object_columns.astype('category')

In [160]:
X_test2=X_train.copy()
df_object_columns = X_test2.select_dtypes(['object'])
X_test2[df_object_columns.columns] = df_object_columns.astype('category')

In [162]:
X_test2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 712 entries, 331 to 102
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     712 non-null    int64   
 1   pclass       712 non-null    int64   
 2   sex          712 non-null    category
 3   age          572 non-null    float64 
 4   sibsp        712 non-null    int64   
 5   parch        712 non-null    int64   
 6   fare         712 non-null    float64 
 7   embarked     710 non-null    category
 8   class        712 non-null    category
 9   who          712 non-null    category
 10  adult_male   712 non-null    bool    
 11  deck         158 non-null    category
 12  embark_town  710 non-null    category
 13  alone        712 non-null    bool    
dtypes: bool(2), category(6), float64(2), int64(4)
memory usage: 45.5 KB


In [121]:
X_train2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 712 entries, 140 to 684
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     712 non-null    int64   
 1   pclass       712 non-null    int64   
 2   sex          712 non-null    category
 3   age          571 non-null    float64 
 4   sibsp        712 non-null    int64   
 5   parch        712 non-null    int64   
 6   fare         712 non-null    float64 
 7   embarked     710 non-null    category
 8   class        712 non-null    category
 9   who          712 non-null    category
 10  adult_male   712 non-null    bool    
 11  deck         162 non-null    category
 12  embark_town  710 non-null    category
 13  alone        712 non-null    bool    
dtypes: bool(2), category(6), float64(2), int64(4)
memory usage: 45.5 KB


In [123]:
from catboost import CatBoostClassifier

model = CatBoostClassifier(iterations=30)
cat=[2,8,7,10]
model.fit(X_train1, y_train, cat_features= cat, verbose=5)

Learning rate set to 0.222036
0:	learn: 0.6775303	total: 3.03ms	remaining: 87.8ms
5:	learn: 0.6446524	total: 20.5ms	remaining: 82.1ms
10:	learn: 0.6292284	total: 39.9ms	remaining: 68.9ms
15:	learn: 0.6230963	total: 69.6ms	remaining: 60.9ms
20:	learn: 0.6201971	total: 92.2ms	remaining: 39.5ms
25:	learn: 0.6163128	total: 108ms	remaining: 16.5ms
29:	learn: 0.6114779	total: 127ms	remaining: 0us


<catboost.core.CatBoostClassifier at 0x79069eb7ca60>

In [76]:
X_test.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alone
709,1,3,male,,1,1,15.2458,C,Third,man,True,,Cherbourg,False
439,0,2,male,31.0,0,0,10.5,S,Second,man,True,,Southampton,True
840,0,3,male,20.0,0,0,7.925,S,Third,man,True,,Southampton,True
720,1,2,female,6.0,0,1,33.0,S,Second,child,False,,Southampton,False
39,1,3,female,14.0,1,0,11.2417,C,Third,child,False,,Cherbourg,False


In [97]:
X_test1=X_test.drop(["embarked","embark_town","deck"],axis=1)
# X_

331     no
733     no
382     no
704     no
813     no
      ... 
106    yes
270     no
860     no
435    yes
102     no
Name: alive, Length: 712, dtype: object

In [124]:
model.predict(X_test1)

array(['yes', 'no', 'no', 'no', 'no', 'yes', 'no', 'no', 'no', 'no',
       'yes', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no',
       'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no',
       'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'yes', 'no',
       'no', 'no', 'no', 'no', 'no', 'no', 'yes', 'yes', 'no', 'no',
       'yes', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no',
       'no', 'no', 'no', 'no', 'yes', 'no', 'no', 'no', 'no', 'no', 'no',
       'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no',
       'no', 'yes', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no',
       'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no',
       'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no',
       'no', 'no', 'yes', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no',
       'no', 'yes', 'no', 'no', 'no', 'no', 'yes', 'no', 'no', 'no', 'no',
       'no', 'no', 'no', 'no', 'no', 'no', 'no', 'n

In [31]:
print(categorical_column_numbers)

[2, 7, 8, 9, 11, 12, 13]


In [30]:
X

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,True


In [125]:
from sklearn.model_selection import train_test_split
from catboost import Pool
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

train_pool = Pool(
    data=X_train1,
    label=y_train,
    cat_features=cat
)

validation_pool = Pool(
    data=X_test1,
    label=y_test,
    cat_features=cat
)

In [127]:
model = CatBoostClassifier(
    iterations=60,
    # learning_rate=0.01,
    # loss_function='RMSE',
    loss_function="Logloss"
)

model.fit(
    train_pool,
    eval_set=validation_pool,
    verbose=5,
    plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.099078
0:	learn: 0.6858041	test: 0.6893087	best: 0.6893087 (0)	total: 1.93ms	remaining: 114ms
5:	learn: 0.6635133	test: 0.6744765	best: 0.6744765 (5)	total: 20.5ms	remaining: 185ms
10:	learn: 0.6492246	test: 0.6687541	best: 0.6687541 (10)	total: 34.8ms	remaining: 155ms
15:	learn: 0.6423801	test: 0.6664487	best: 0.6664487 (15)	total: 58.5ms	remaining: 161ms
20:	learn: 0.6369486	test: 0.6655503	best: 0.6655503 (20)	total: 70.8ms	remaining: 131ms
25:	learn: 0.6326906	test: 0.6636972	best: 0.6636628 (24)	total: 85.6ms	remaining: 112ms
30:	learn: 0.6300873	test: 0.6634682	best: 0.6626156 (28)	total: 99.7ms	remaining: 93.3ms
35:	learn: 0.6253871	test: 0.6654867	best: 0.6626156 (28)	total: 112ms	remaining: 74.8ms
40:	learn: 0.6222260	test: 0.6653738	best: 0.6626156 (28)	total: 127ms	remaining: 58.7ms
45:	learn: 0.6197984	test: 0.6658377	best: 0.6626156 (28)	total: 153ms	remaining: 46.7ms
50:	learn: 0.6161007	test: 0.6665236	best: 0.6626156 (28)	total: 171ms	remaining: 3

<catboost.core.CatBoostClassifier at 0x79069eb7f790>

In [112]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    "learning_rate": [0.001, 0.01, 0.5],
}

clf = CatBoostClassifier(
    iterations=20,
    cat_features=cat,
    verbose=20
)
grid_search = GridSearchCV(clf, param_grid=param_grid, cv=3)
results = grid_search.fit(X_train1, y_train)
results.best_estimator_.get_params()

0:	learn: 0.6930596	total: 2.74ms	remaining: 52ms
19:	learn: 0.6915104	total: 59.5ms	remaining: 0us
0:	learn: 0.6930835	total: 1.74ms	remaining: 33.1ms
19:	learn: 0.6918773	total: 50.5ms	remaining: 0us
0:	learn: 0.6930744	total: 3.2ms	remaining: 60.8ms
19:	learn: 0.6918646	total: 56.5ms	remaining: 0us
0:	learn: 0.6922738	total: 5.28ms	remaining: 100ms
19:	learn: 0.6786009	total: 58.3ms	remaining: 0us
0:	learn: 0.6925121	total: 3.68ms	remaining: 69.9ms
19:	learn: 0.6819714	total: 53.2ms	remaining: 0us
0:	learn: 0.6924216	total: 1.81ms	remaining: 34.5ms
19:	learn: 0.6812496	total: 49.3ms	remaining: 0us
0:	learn: 0.6571487	total: 1.78ms	remaining: 33.8ms
19:	learn: 0.5525026	total: 52.8ms	remaining: 0us
0:	learn: 0.6667418	total: 1.1ms	remaining: 20.9ms
19:	learn: 0.5589241	total: 26.8ms	remaining: 0us
0:	learn: 0.6634240	total: 1.74ms	remaining: 33.1ms
19:	learn: 0.5768520	total: 51.8ms	remaining: 0us
0:	learn: 0.6923786	total: 2ms	remaining: 38.1ms
19:	learn: 0.6811288	total: 37.5ms	rem

{'iterations': 20,
 'learning_rate': 0.01,
 'verbose': 20,
 'cat_features': [2, 8, 7, 10]}

### LightGBM

In [133]:
import numpy as np
import lightgbm as lgb
import time
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMClassifier
# Generate synthetic data
X, y = make_regression(n_samples=10000, n_features=10, noise=0.1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Set up the parameters for LightGBM
params_level = {
    'boosting_type': 'gbdt',  # depth-wise growth
    'objective': 'regression',
    'metric': 'mse',
    'num_leaves': 31,
    'learning_rate': 0.1,
    'verbose': -1
}

params_leaf = {
    'boosting_type': 'dart',  # Leaf-wise growth
    'objective': 'classification',
    'metric': 'logloss',
    'num_leaves': 31,
    'learning_rate': 0.1,
    'verbose': -1
}

# # Training the model using level-wise growth
# start_time = time.time()
# model_level = lgb.train(params_level, lgb.Dataset(X_train, label=y_train), num_boost_round=100)
# level_time = time.time() - start_time
# y_pred_level = model_level.predict(X_test)
# mse_level = mean_squared_error(y_test, y_pred_level)

# Training the model using leaf-wise growth
start_time = time.time()
lgb=LGBMClassifier()
model_leaf = lgb.train(params_leaf, lgb.Dataset(X_train, label=y_train), num_boost_round=100)
leaf_time = time.time() - start_time
y_pred_leaf = model_leaf.predict(X_test)
# mse_leaf = mean_squared_error(y_test, y_pred_leaf)

# Output the results
# print(f"Depth-wise growth time: {level_time:.4f} seconds, MSE: {mse_level:.4f}")
print(f"Leaf-wise growth time: {leaf_time:.4f} seconds, MSE: {mse_leaf:.4f}")


AttributeError: 'LGBMClassifier' object has no attribute 'train'

In [146]:
lgb=LGBMClassifier(n_estimators=100, learning_rate=0.1, max_depth=7, random_state=42),

In [140]:
X_train2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 712 entries, 331 to 102
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     712 non-null    int64   
 1   pclass       712 non-null    int64   
 2   sex          712 non-null    category
 3   age          572 non-null    float64 
 4   sibsp        712 non-null    int64   
 5   parch        712 non-null    int64   
 6   fare         712 non-null    float64 
 7   embarked     710 non-null    category
 8   class        712 non-null    category
 9   who          712 non-null    category
 10  adult_male   712 non-null    bool    
 11  deck         158 non-null    category
 12  embark_town  710 non-null    category
 13  alone        712 non-null    bool    
dtypes: bool(2), category(6), float64(2), int64(4)
memory usage: 45.5 KB


In [148]:
model = LGBMClassifier(n_estimators=100, learning_rate=0.1, random_state=42)

In [155]:
import pandas as pd

# Assuming df is your DataFrame
categorical_columns = X_train2.select_dtypes(include=[ 'category']).columns
# categorical_column_numbers = [X_train2.columns.get_loc(col) for col in categorical_columns]
print(categorical_columns.tolist())


['sex', 'embarked', 'class', 'who', 'deck', 'embark_town']


In [158]:
model.fit(X_train2, y_train, categorical_feature=['sex', 'embarked', 'class', 'who', 'deck', 'embark_town'])

[LightGBM] [Info] Number of positive: 268, number of negative: 444
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000183 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 223
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.376404 -> initscore=-0.504838
[LightGBM] [Info] Start training from score -0.504838


In [167]:
y_pred = model.predict(X_test2)
y_pred
from sklearn.metrics import accuracy_score,confusion_matrix
# def get_accuracy(y_test,y_pred):
#   print("Accuracy of Logistic Regression",accuracy_score(y_test,y_pred))
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")



ValueError: Found input variables with inconsistent numbers of samples: [179, 712]

# Evaluation Metrics

## Regression

### Adjusted R2_Score

In [None]:
def adjust_r2(r2, num_examples, num_features):
    coef = (num_examples - 1) / (num_examples - num_features - 1)
    return 1 - (1 - r2) * coef

## Classification

### Accuracy

In [None]:
from sklearn.metrics import accuracy_score,confusion_matrix
def get_accuracy(y_test,y_pred):
  print("Accuracy of Logistic Regression",accuracy_score(y_test,y_pred))

### Recall

In [None]:
from sklearn.metrics import recall_score

In [None]:
def get_recall_simple(y_test,y_pred):
  print(recall_score(y_test,y_pred,average=None))

In [None]:
def get_recall_weighted(y_test,y_pred):
  print(recall_score(y_test,y_pred,average='weighted'))

### F1-Score

In [None]:
from sklearn.metrics import f1_score
def get_classification_scores(y_test,y_pred1):
  print("F1 score - ",f1_score(y_test,y_pred1))

### Precision

In [None]:
from sklearn.metrics import recall_score,precision_score,f1_score

In [None]:
def get_classification_scores(y_test,y_pred1):
  print("-"*50)
  cdf = pd.DataFrame(confusion_matrix(y_test,y_pred1),columns=list(range(0,2)))
  print(cdf)
  print("-"*50)
  print("Precision - ",precision_score(y_test,y_pred1))
  print("Recall - ",recall_score(y_test,y_pred1))
  print("F1 score - ",f1_score(y_test,y_pred1))

In [None]:
def get_precisoon_simple(y_test,y_pred):
  print(precision_score(y_test,y_pred,average=None))

In [None]:
def get_precisoon_weighted(y_test,y_pred):
  print(precision_score(y_test,y_pred,average='weighted'))

### Confusion Matrix

In [None]:
from sklearn.metrics import accuracy_score,confusion_matrix
def get_confusion_matrix(y_test,y_pred1,n):
  print(pd.DataFrame(confusion_matrix(y_test,y_pred1),columns=list(range(0,n))))

### Classification Report

In [None]:
from sklearn.metrics import classification_report
def get_classification_report_multiclass(y_test,y_pred1):
  print(classification_report(y_test,y_pred1))

### ROC-Curve

In [None]:
from sklearn.metrics import roc_curve
import plotly.graph_objects as go
import numpy as np



In [None]:
def get_roc_curve(y_test,y_scores):
  fpr, tpr, thresholds = roc_curve(y_test, y_scores)
  # Generate a trace for ROC curve
  trace0 = go.Scatter(
      x=fpr,
      y=tpr,
      mode='lines',
      name='ROC curve'
  )

  # Only label every nth point to avoid cluttering
  n = 10
  indices = np.arange(len(thresholds)) % n == 0  # Choose indices where index mod n is 0

  trace1 = go.Scatter(
      x=fpr[indices],
      y=tpr[indices],
      mode='markers+text',
      name='Threshold points',
      text=[f"Thr={thr:.2f}" for thr in thresholds[indices]],
      textposition='top center'
  )


  # Diagonal line
  trace2 = go.Scatter(
      x=[0, 1],
      y=[0, 1],
      mode='lines',
      name='Random (Area = 0.5)',
      line=dict(dash='dash')
  )

  data = [trace0, trace1, trace2]

  # Define layout with square aspect ratio
  layout = go.Layout(
      title='Receiver Operating Characteristic',
      xaxis=dict(title='False Positive Rate'),
      yaxis=dict(title='True Positive Rate'),
      autosize=False,
      width=800,
      height=800,
      showlegend=False
  )

  # Define figure and add data
  fig = go.Figure(data=data, layout=layout)

  # Show figure
  fig.show()
  optimal_idx = np.argmax(tpr - fpr)
  optimal_threshold = thresholds[optimal_idx]
  print("Optimal threshold is:", optimal_threshold)
  return optimal_threshold