In [None]:
import torch
import torchvision
print(torch.__version__)
print(torchvision.__version__)

In [5]:
import optuna

import logging
import sys

import sklearn.datasets
import sklearn.linear_model
import sklearn.model_selection


def objective(trial):
    iris = sklearn.datasets.load_iris()
    classes = list(set(iris.target))
    train_x, valid_x, train_y, valid_y = sklearn.model_selection.train_test_split(
        iris.data, iris.target, test_size=0.25, random_state=0
    )

    alpha = trial.suggest_float("alpha", 1e-5, 1e-1, log=True)
    clf = sklearn.linear_model.SGDClassifier(alpha=alpha)

    for step in range(100):
        clf.partial_fit(train_x, train_y, classes=classes)

        # Report intermediate objective value.
        intermediate_value = 1.0 - clf.score(valid_x, valid_y)
        trial.report(intermediate_value, step)

        # Handle pruning based on the intermediate value.
        if trial.should_prune():
            raise optuna.TrialPruned()

    return 1.0 - clf.score(valid_x, valid_y)

# Add stream handler of stdout to show the messages
optuna.logging.get_logger("optuna").addHandler(logging.StreamHandler(sys.stdout))
study = optuna.create_study(pruner=optuna.pruners.MedianPruner())
study.optimize(objective, n_trials=20)

del study

[I 2024-09-26 10:47:59,864] A new study created in memory with name: no-name-5b53daa7-b99a-4456-bd3b-418695a5d2e8


A new study created in memory with name: no-name-5b53daa7-b99a-4456-bd3b-418695a5d2e8
A new study created in memory with name: no-name-5b53daa7-b99a-4456-bd3b-418695a5d2e8
A new study created in memory with name: no-name-5b53daa7-b99a-4456-bd3b-418695a5d2e8
A new study created in memory with name: no-name-5b53daa7-b99a-4456-bd3b-418695a5d2e8
A new study created in memory with name: no-name-5b53daa7-b99a-4456-bd3b-418695a5d2e8


[I 2024-09-26 10:47:59,948] Trial 0 finished with value: 0.21052631578947367 and parameters: {'alpha': 0.034841676083516224}. Best is trial 0 with value: 0.21052631578947367.


Trial 0 finished with value: 0.21052631578947367 and parameters: {'alpha': 0.034841676083516224}. Best is trial 0 with value: 0.21052631578947367.
Trial 0 finished with value: 0.21052631578947367 and parameters: {'alpha': 0.034841676083516224}. Best is trial 0 with value: 0.21052631578947367.
Trial 0 finished with value: 0.21052631578947367 and parameters: {'alpha': 0.034841676083516224}. Best is trial 0 with value: 0.21052631578947367.
Trial 0 finished with value: 0.21052631578947367 and parameters: {'alpha': 0.034841676083516224}. Best is trial 0 with value: 0.21052631578947367.
Trial 0 finished with value: 0.21052631578947367 and parameters: {'alpha': 0.034841676083516224}. Best is trial 0 with value: 0.21052631578947367.


[I 2024-09-26 10:48:00,030] Trial 1 finished with value: 0.07894736842105265 and parameters: {'alpha': 5.024988407245832e-05}. Best is trial 1 with value: 0.07894736842105265.


Trial 1 finished with value: 0.07894736842105265 and parameters: {'alpha': 5.024988407245832e-05}. Best is trial 1 with value: 0.07894736842105265.
Trial 1 finished with value: 0.07894736842105265 and parameters: {'alpha': 5.024988407245832e-05}. Best is trial 1 with value: 0.07894736842105265.
Trial 1 finished with value: 0.07894736842105265 and parameters: {'alpha': 5.024988407245832e-05}. Best is trial 1 with value: 0.07894736842105265.
Trial 1 finished with value: 0.07894736842105265 and parameters: {'alpha': 5.024988407245832e-05}. Best is trial 1 with value: 0.07894736842105265.
Trial 1 finished with value: 0.07894736842105265 and parameters: {'alpha': 5.024988407245832e-05}. Best is trial 1 with value: 0.07894736842105265.


[I 2024-09-26 10:48:00,112] Trial 2 finished with value: 0.052631578947368474 and parameters: {'alpha': 0.008605347915629629}. Best is trial 2 with value: 0.052631578947368474.


Trial 2 finished with value: 0.052631578947368474 and parameters: {'alpha': 0.008605347915629629}. Best is trial 2 with value: 0.052631578947368474.
Trial 2 finished with value: 0.052631578947368474 and parameters: {'alpha': 0.008605347915629629}. Best is trial 2 with value: 0.052631578947368474.
Trial 2 finished with value: 0.052631578947368474 and parameters: {'alpha': 0.008605347915629629}. Best is trial 2 with value: 0.052631578947368474.
Trial 2 finished with value: 0.052631578947368474 and parameters: {'alpha': 0.008605347915629629}. Best is trial 2 with value: 0.052631578947368474.
Trial 2 finished with value: 0.052631578947368474 and parameters: {'alpha': 0.008605347915629629}. Best is trial 2 with value: 0.052631578947368474.


[I 2024-09-26 10:48:00,194] Trial 3 finished with value: 0.052631578947368474 and parameters: {'alpha': 0.0002930657202688273}. Best is trial 2 with value: 0.052631578947368474.


Trial 3 finished with value: 0.052631578947368474 and parameters: {'alpha': 0.0002930657202688273}. Best is trial 2 with value: 0.052631578947368474.
Trial 3 finished with value: 0.052631578947368474 and parameters: {'alpha': 0.0002930657202688273}. Best is trial 2 with value: 0.052631578947368474.
Trial 3 finished with value: 0.052631578947368474 and parameters: {'alpha': 0.0002930657202688273}. Best is trial 2 with value: 0.052631578947368474.
Trial 3 finished with value: 0.052631578947368474 and parameters: {'alpha': 0.0002930657202688273}. Best is trial 2 with value: 0.052631578947368474.
Trial 3 finished with value: 0.052631578947368474 and parameters: {'alpha': 0.0002930657202688273}. Best is trial 2 with value: 0.052631578947368474.


[I 2024-09-26 10:48:00,301] Trial 4 finished with value: 0.052631578947368474 and parameters: {'alpha': 1.6514214919262076e-05}. Best is trial 2 with value: 0.052631578947368474.


Trial 4 finished with value: 0.052631578947368474 and parameters: {'alpha': 1.6514214919262076e-05}. Best is trial 2 with value: 0.052631578947368474.
Trial 4 finished with value: 0.052631578947368474 and parameters: {'alpha': 1.6514214919262076e-05}. Best is trial 2 with value: 0.052631578947368474.
Trial 4 finished with value: 0.052631578947368474 and parameters: {'alpha': 1.6514214919262076e-05}. Best is trial 2 with value: 0.052631578947368474.
Trial 4 finished with value: 0.052631578947368474 and parameters: {'alpha': 1.6514214919262076e-05}. Best is trial 2 with value: 0.052631578947368474.
Trial 4 finished with value: 0.052631578947368474 and parameters: {'alpha': 1.6514214919262076e-05}. Best is trial 2 with value: 0.052631578947368474.


[I 2024-09-26 10:48:00,384] Trial 5 finished with value: 0.2894736842105263 and parameters: {'alpha': 0.04950249118875089}. Best is trial 2 with value: 0.052631578947368474.


Trial 5 finished with value: 0.2894736842105263 and parameters: {'alpha': 0.04950249118875089}. Best is trial 2 with value: 0.052631578947368474.
Trial 5 finished with value: 0.2894736842105263 and parameters: {'alpha': 0.04950249118875089}. Best is trial 2 with value: 0.052631578947368474.
Trial 5 finished with value: 0.2894736842105263 and parameters: {'alpha': 0.04950249118875089}. Best is trial 2 with value: 0.052631578947368474.
Trial 5 finished with value: 0.2894736842105263 and parameters: {'alpha': 0.04950249118875089}. Best is trial 2 with value: 0.052631578947368474.
Trial 5 finished with value: 0.2894736842105263 and parameters: {'alpha': 0.04950249118875089}. Best is trial 2 with value: 0.052631578947368474.


[I 2024-09-26 10:48:00,387] Trial 6 pruned. 


Trial 6 pruned. 
Trial 6 pruned. 
Trial 6 pruned. 
Trial 6 pruned. 
Trial 6 pruned. 


[I 2024-09-26 10:48:00,390] Trial 7 pruned. 


Trial 7 pruned. 
Trial 7 pruned. 
Trial 7 pruned. 
Trial 7 pruned. 
Trial 7 pruned. 


[I 2024-09-26 10:48:00,394] Trial 8 pruned. 


Trial 8 pruned. 
Trial 8 pruned. 
Trial 8 pruned. 
Trial 8 pruned. 
Trial 8 pruned. 


[I 2024-09-26 10:48:00,477] Trial 9 finished with value: 0.39473684210526316 and parameters: {'alpha': 1.2331981507790151e-05}. Best is trial 2 with value: 0.052631578947368474.


Trial 9 finished with value: 0.39473684210526316 and parameters: {'alpha': 1.2331981507790151e-05}. Best is trial 2 with value: 0.052631578947368474.
Trial 9 finished with value: 0.39473684210526316 and parameters: {'alpha': 1.2331981507790151e-05}. Best is trial 2 with value: 0.052631578947368474.
Trial 9 finished with value: 0.39473684210526316 and parameters: {'alpha': 1.2331981507790151e-05}. Best is trial 2 with value: 0.052631578947368474.
Trial 9 finished with value: 0.39473684210526316 and parameters: {'alpha': 1.2331981507790151e-05}. Best is trial 2 with value: 0.052631578947368474.
Trial 9 finished with value: 0.39473684210526316 and parameters: {'alpha': 1.2331981507790151e-05}. Best is trial 2 with value: 0.052631578947368474.


[I 2024-09-26 10:48:00,561] Trial 10 finished with value: 0.07894736842105265 and parameters: {'alpha': 0.003058178743420022}. Best is trial 2 with value: 0.052631578947368474.


Trial 10 finished with value: 0.07894736842105265 and parameters: {'alpha': 0.003058178743420022}. Best is trial 2 with value: 0.052631578947368474.
Trial 10 finished with value: 0.07894736842105265 and parameters: {'alpha': 0.003058178743420022}. Best is trial 2 with value: 0.052631578947368474.
Trial 10 finished with value: 0.07894736842105265 and parameters: {'alpha': 0.003058178743420022}. Best is trial 2 with value: 0.052631578947368474.
Trial 10 finished with value: 0.07894736842105265 and parameters: {'alpha': 0.003058178743420022}. Best is trial 2 with value: 0.052631578947368474.
Trial 10 finished with value: 0.07894736842105265 and parameters: {'alpha': 0.003058178743420022}. Best is trial 2 with value: 0.052631578947368474.


[I 2024-09-26 10:48:00,566] Trial 11 pruned. 


Trial 11 pruned. 
Trial 11 pruned. 
Trial 11 pruned. 
Trial 11 pruned. 
Trial 11 pruned. 


[I 2024-09-26 10:48:00,570] Trial 12 pruned. 


Trial 12 pruned. 
Trial 12 pruned. 
Trial 12 pruned. 
Trial 12 pruned. 
Trial 12 pruned. 


[I 2024-09-26 10:48:00,574] Trial 13 pruned. 


Trial 13 pruned. 
Trial 13 pruned. 
Trial 13 pruned. 
Trial 13 pruned. 
Trial 13 pruned. 


[I 2024-09-26 10:48:00,579] Trial 14 pruned. 


Trial 14 pruned. 
Trial 14 pruned. 
Trial 14 pruned. 
Trial 14 pruned. 
Trial 14 pruned. 


[I 2024-09-26 10:48:00,583] Trial 15 pruned. 


Trial 15 pruned. 
Trial 15 pruned. 
Trial 15 pruned. 
Trial 15 pruned. 
Trial 15 pruned. 


[I 2024-09-26 10:48:00,587] Trial 16 pruned. 


Trial 16 pruned. 
Trial 16 pruned. 
Trial 16 pruned. 
Trial 16 pruned. 
Trial 16 pruned. 


[I 2024-09-26 10:48:00,591] Trial 17 pruned. 


Trial 17 pruned. 
Trial 17 pruned. 
Trial 17 pruned. 
Trial 17 pruned. 
Trial 17 pruned. 


[I 2024-09-26 10:48:00,675] Trial 18 finished with value: 0.02631578947368418 and parameters: {'alpha': 0.0011422529345791776}. Best is trial 18 with value: 0.02631578947368418.


Trial 18 finished with value: 0.02631578947368418 and parameters: {'alpha': 0.0011422529345791776}. Best is trial 18 with value: 0.02631578947368418.
Trial 18 finished with value: 0.02631578947368418 and parameters: {'alpha': 0.0011422529345791776}. Best is trial 18 with value: 0.02631578947368418.
Trial 18 finished with value: 0.02631578947368418 and parameters: {'alpha': 0.0011422529345791776}. Best is trial 18 with value: 0.02631578947368418.
Trial 18 finished with value: 0.02631578947368418 and parameters: {'alpha': 0.0011422529345791776}. Best is trial 18 with value: 0.02631578947368418.
Trial 18 finished with value: 0.02631578947368418 and parameters: {'alpha': 0.0011422529345791776}. Best is trial 18 with value: 0.02631578947368418.


[I 2024-09-26 10:48:00,680] Trial 19 pruned. 


Trial 19 pruned. 
Trial 19 pruned. 
Trial 19 pruned. 
Trial 19 pruned. 
Trial 19 pruned. 


In [None]:
import optuna

study = optuna.create_study() # default
print(f"Sampler is {study.sampler.__class__.__name__}")

study = optuna.create_study(sampler=optuna.samplers.RandomSampler())
print(f"Sampler is {study.sampler.__class__.__name__}")

study = optuna.create_study(sampler=optuna.samplers.CmaEsSampler())
print(f"Sampler is {study.sampler.__class__.__name__}")

In [None]:
!yolo predict model=yolov8n.pt source='https://ultralytics.com/images/zidane.jpg'

In [None]:
import cv2
import numpy as np
import matplotlib.pyplot as plt

# Load image
image = cv2.imread('your_image.jpg')
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

# Convert image from RGB to HSV
hsv_image = cv2.cvtColor(image, cv2.COLOR_RGB2HSV)

# Function to adjust Hue, Saturation, or Value independently
def adjust_hsv(hsv_img, h_shift=0, s_scale=1, v_scale=1):
    hsv_copy = hsv_img.copy().astype(np.float32)
    
    # Adjust Hue
    hsv_copy[..., 0] = (hsv_copy[..., 0] + h_shift) % 180
    
    # Adjust Saturation
    hsv_copy[..., 1] = hsv_copy[..., 1] * s_scale
    hsv_copy[..., 1] = np.clip(hsv_copy[..., 1], 0, 255)
    
    # Adjust Value
    hsv_copy[..., 2] = hsv_copy[..., 2] * v_scale
    hsv_copy[..., 2] = np.clip(hsv_copy[..., 2], 0, 255)
    
    # Convert back to uint8
    hsv_copy = hsv_copy.astype(np.uint8)
    
    # Convert back to RGB for visualization
    return cv2.cvtColor(hsv_copy, cv2.COLOR_HSV2RGB)

# Create subplots for visualization
fig, ax = plt.subplots(3, 4, figsize=(15, 10))

# Original image
ax[0, 0].imshow(image)
ax[0, 0].set_title("Original")
ax[0, 0].axis('off')

# Hue adjustments (shift hue)
for i, h_shift in enumerate([30, 60, 90]):
    ax[0, i+1].imshow(adjust_hsv(hsv_image, h_shift=h_shift))
    ax[0, i+1].set_title(f'Hue Shift: {h_shift}')
    ax[0, i+1].axis('off')

# Saturation adjustments (scale saturation)
for i, s_scale in enumerate([0.5, 1.5, 2]):
    ax[1, i+1].imshow(adjust_hsv(hsv_image, s_scale=s_scale))
    ax[1, i+1].set_title(f'Saturation x {s_scale}')
    ax[1, i+1].axis('off')

# Value adjustments (scale brightness)
for i, v_scale in enumerate([0.5, 1.5, 2]):
    ax[2, i+1].imshow(adjust_hsv(hsv_image, v_scale=v_scale))
    ax[2, i+1].set_title(f'Value x {v_scale}')
    ax[2, i+1].axis('off')

# Hide any unused axes
for j in range(3):
    ax[j, 0].axis('off')

plt.tight_layout()
plt.show()


In [None]:
# One-sample t-test
import numpy as np
from scipy import stats

# Sample data: weights of 30 products
sample_weights = np.array([
    49.8, 50.2, 50.0, 49.5, 50.1, 50.3, 49.9, 50.4, 50.2, 49.7,
    50.0, 50.1, 49.6, 50.3, 50.2, 49.8, 50.0, 50.1, 49.9, 50.2,
    50.0, 49.7, 50.3, 50.1, 49.8, 50.0, 50.2, 49.9, 50.1, 50.0
])

# Population mean
mu = 50.0

# Perform one-sample t-test
t_stat, p_value = stats.ttest_1samp(sample_weights, mu)

print(f"t-statistic: {t_stat:.4f}")
print(f"p-value: {p_value:.4f}")


In [None]:
# Independent two-sample t-test
import numpy as np
from scipy import stats

# Sample data: heights of 30 males and 30 females
male_heights = np.array([
    175, 180, 178, 182, 176, 179, 181, 177, 183, 175,
    180, 178, 182, 176, 179, 181, 177, 183, 175, 180,
    178, 182, 176, 179, 181, 177, 183, 175, 180, 178
])

female_heights = np.array([
    165, 160, 162, 158, 161, 159, 163, 160, 162, 161,
    165, 160, 162, 158, 161, 159, 163, 160, 162, 161,
    165, 160, 162, 158, 161, 159, 163, 160, 162, 161
])

# Perform Levene's test for equal variances
levene_stat, levene_p = stats.levene(male_heights, female_heights)
print(f"Levene's test p-value: {levene_p:.4f}")

# Decide whether to assume equal variances
if levene_p > 0.05:
    equal_var = True
    print("Equal variances assumed.")
else:
    equal_var = False
    print("Equal variances not assumed. Using Welch's t-test.")

# Perform independent two-sample t-test
t_stat, p_value = stats.ttest_ind(male_heights, female_heights, equal_var=equal_var)

print(f"t-statistic: {t_stat:.4f}")
print(f"p-value: {p_value:.4f}")

In [None]:
# Paired t-test
import numpy as np
from scipy import stats

# Sample data: weights before and after a diet program for 30 individuals
before_weights = np.array([
    80, 82, 78, 85, 77, 79, 81, 80, 83, 78,
    80, 82, 78, 85, 77, 79, 81, 80, 83, 78,
    80, 82, 78, 85, 77, 79, 81, 80, 83, 78
])

after_weights = np.array([
    78, 80, 76, 83, 75, 77, 79, 78, 81, 76,
    78, 80, 76, 83, 75, 77, 80, 78, 81, 76,
    78, 80, 76, 83, 75, 77, 79, 78, 81, 76
])

# Perform paired t-test
t_stat, p_value = stats.ttest_rel(before_weights, after_weights)

print(f"t-statistic: {t_stat:.4f}")
print(f"p-value: {p_value:.4f}")

In [None]:

import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler

gaussian_with_outliners = False
uniform_and_expnential = True

if gaussian_with_outliners:
    # Define the domains for the data
    domain_1 = np.random.normal(1.5, 0.1, 1000)  # Normal distribution with mean 1.5
    domain_2 = np.random.normal(4.5, 0.1, 1000)  # Normal distribution with mean 2.5

    # Add 10 outliers: 8 to the first domain, and 2 to the second domain
    outliers_1 = np.full(20, 3)  # 10 outliers with value 8
    outliers_2 = np.full(20, 3)  # 10 outliers with value 2

    # Append the outliers to the domains
    domain_1_with_outliers = np.append(domain_1, outliers_1)
    domain_2_with_outliers = np.append(domain_2, outliers_2)
elif uniform_and_expnential:
    domain_1 = np.random.uniform(0.5, 1.5, 1000)  # Uniform distribution between 0.5 and 1.5
    domain_2 = np.random.exponential(1.0, 1000)   # Exponential distribution with lambda=1.0

    domain_1_with_outliers = domain_1
    domain_2_with_outliers = domain_2
else:
    domain_1 = np.random.normal(1.5, 0.1, 1000)  # Normal distribution with mean 1.5
    domain_2 = np.random.normal(4.5, 0.1, 1000)  # Normal distribution with mean 2.5
    

# Apply normalization (Min-Max Scaling)
scaler = MinMaxScaler()
normalized_1_with_outliers = scaler.fit_transform(domain_1_with_outliers.reshape(-1, 1)).flatten()
normalized_2_with_outliers = scaler.fit_transform(domain_2_with_outliers.reshape(-1, 1)).flatten()

# Apply standardization (Z-score normalization)
scaler = StandardScaler()
standardized_1_with_outliers = scaler.fit_transform(domain_1_with_outliers.reshape(-1, 1)).flatten()
standardized_2_with_outliers = scaler.fit_transform(domain_2_with_outliers.reshape(-1, 1)).flatten()

# Plot the original, normalized, and standardized data with 10 outliers
fig, ax = plt.subplots(2, 2, figsize=(12, 8))

# Original data with 10 outliers
ax[0, 0].hist(domain_1_with_outliers, bins=30, color='blue', alpha=0.7, label='Domain 1 (Original + 10 Outliers)')
ax[0, 0].hist(domain_2_with_outliers, bins=30, color='green', alpha=0.7, label='Domain 2 (Original + 10 Outliers)')
ax[0, 0].set_title("Original Data with 10 Outliers")
ax[0, 0].legend()

# Normalized data with 10 outliers
ax[0, 1].hist(normalized_1_with_outliers, bins=30, color='blue', alpha=0.7, label='Domain 1 (Normalized + 10 Outliers)')
ax[0, 1].hist(normalized_2_with_outliers, bins=30, color='green', alpha=0.7, label='Domain 2 (Normalized + 10 Outliers)')
ax[0, 1].set_title("Normalized Data with 10 Outliers")
ax[0, 1].legend()

# Standardized data with 10 outliers
ax[1, 0].hist(standardized_1_with_outliers, bins=30, color='blue', alpha=0.7, label='Domain 1 (Standardized + 10 Outliers)')
ax[1, 0].hist(standardized_2_with_outliers, bins=30, color='green', alpha=0.7, label='Domain 2 (Standardized + 10 Outliers)')
ax[1, 0].set_title("Standardized Data with 10 Outliers")
ax[1, 0].legend()

# Adjust layout
plt.tight_layout()

# Show the plot
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Generate sample data
np.random.seed(0)
data = np.sin(np.linspace(0, 10, 100)) * 10 + np.random.normal(20, 5, 100)

# Standardization
scaler_standard = StandardScaler()
data_standardized = scaler_standard.fit_transform(data.reshape(-1, 1)).flatten()

# Normalization
scaler_minmax = MinMaxScaler()
data_normalized = scaler_minmax.fit_transform(data.reshape(-1, 1)).flatten()

# Plotting
plt.figure(figsize=(12, 6))

plt.subplot(1, 2, 1)
plt.title("Original vs Standardized vs Normalized Data")
plt.plot(data, label='Original', alpha=0.7)
plt.plot(data_standardized, label='Standardized', alpha=0.7)
plt.plot(data_normalized, label='Normalized', alpha=0.7)
plt.legend()
plt.ylabel('Value')
plt.xlabel('Index')

plt.subplot(1, 2, 2)
plt.title("Data Distribution")
plt.hist(data, bins=20, alpha=0.5, label='Original')
plt.hist(data_standardized, bins=20, alpha=0.5, label='Standardized')
plt.hist(data_normalized, bins=20, alpha=0.5, label='Normalized')
plt.legend()
plt.xlabel('Value')
plt.ylabel('Frequency')

plt.tight_layout()
plt.show()

# Print some statistics
print("Original Data - Mean: {:.2f}, Std: {:.2f}, Min: {:.2f}, Max: {:.2f}".format(
    np.mean(data), np.std(data), np.min(data), np.max(data)))
print("Standardized Data - Mean: {:.2f}, Std: {:.2f}, Min: {:.2f}, Max: {:.2f}".format(
    np.mean(data_standardized), np.std(data_standardized), np.min(data_standardized), np.max(data_standardized)))
print("Normalized Data - Mean: {:.2f}, Std: {:.2f}, Min: {:.2f}, Max: {:.2f}".format(
    np.mean(data_normalized), np.std(data_normalized), np.min(data_normalized), np.max(data_normalized)))

In [None]:
import timm


all_densenet_models = timm.list_models('*resnet*', pretrained=True)
display(all_densenet_models)

In [None]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

data = {
    'Color': ['Red', 'Blue', 'Green', 'Red', None, 'Blue', 'Red', None],
    'Size': [10, 15, None, 14, 13, 10, None, 12],
    'Price': [100, None, 150, 125, None, 110, 130, None]
}
df = pd.DataFrame(data)

print("Original DataFrame:")
print(df)

numerical_features = ['Size', 'Price']
categorical_features = ['Color']

# Imputer for nume
numerical_transformer = SimpleImputer(strategy='median')
# Imputer for cat
categorical_transformer = SimpleImputer(strategy='most_frequent', missing_values=None)

# apply different transformations to different columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Apply
df_transformed = preprocessor.fit_transform(df)

# Convert the transformed array back to a DataFrame
df_transformed = pd.DataFrame(df_transformed, columns=numerical_features + categorical_features)

print("\nDataFrame after imputation:")
print(df_transformed)

In [None]:
import pandas as pd
from scipy.stats import mode

# Sample data
data = {'A': [1, 2, 2, None, 2, 3, 4, 4, 5],
        'B': ['red', 'blue', None, 'blue', 'red', 'red', 'red', 'green', 'blue']}

df = pd.DataFrame(data)

# Function to apply mode imputation
def mode_imputation(df):
    for column in df.columns:
        # Calculate the mode of the column
        most_frequent = mode(df[column]).mode[0]
        # Replace missing values with the mode
        df[column].fillna(most_frequent, inplace=True)
    return df

# Apply mode imputation to the dataframe
imputed_df = mode_imputation(df)

print(imputed_df)


In [None]:
import pandas as pd
from sklearn.impute import SimpleImputer

# Sample dataset
data = {
    'Color': ['Red', 'Blue', 'Green', 'Red', None, 'Blue', 'Red', None]
}
df = pd.DataFrame(data)

# Display the original data
print("Original DataFrame:")
print(df)

# Initialize the SimpleImputer with strategy='most_frequent' for mode imputation
imputer = SimpleImputer(strategy='most_frequent')

# Perform mode imputation on the 'Color' column
df['Color'] = imputer.fit_transform(df[['Color']])

# Display the DataFrame after mode imputation
print("\nDataFrame after mode imputation:")
print(df)


In [None]:
aa = ['就寝', '起床']
for elem in aa:
    if elem in '起床就　寝aa':
        print(11)

In [None]:
from urllib.request import urlopen
from PIL import Image
import timm

img = Image.open(urlopen(
    'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/beignets-task-guide.png'
))

model = timm.create_model(
    'resnet18.a1_in1k',
    # "efficientnet_b0.ra_in1k",
    pretrained=True,
    features_only=True,
)
model = model.eval()

# get model specific transforms (normalization, resize)
data_config = timm.data.resolve_model_data_config(model)
transforms = timm.data.create_transform(**data_config, is_training=False)

output = model(transforms(img).unsqueeze(0))  # unsqueeze single image into batch of 1

for o in output:
    # print shape of each feature map in output
    # e.g.:
    #  torch.Size([1, 64, 112, 112])
    #  torch.Size([1, 64, 56, 56])
    #  torch.Size([1, 128, 28, 28])
    #  torch.Size([1, 256, 14, 14])
    #  torch.Size([1, 512, 7, 7])

    print(o.shape)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import pearsonr, spearmanr

# Generate data
x = np.arange(1, 101)  # Simple increasing sequence: 1 to 100
y_linear = x  # Linear relationship

# Generate a sine wave added to the linearly increasing sequence
y_sin_wave = np.sin(np.linspace(0, 50 * np.pi, 100)) + x  # Sine wave with increasing trend

# Plot the data
plt.figure(figsize=(12, 6))

plt.subplot(1, 2, 1)
plt.plot(x, y_linear, 'o-', label='Linear (1 to 100)')
plt.title("Linear Data (1 to 100)")
plt.xlabel("x")
plt.ylabel("y")
plt.grid(True)

plt.subplot(1, 2, 2)
plt.plot(x, y_sin_wave, 'o-', label='Sine Wave + Linear')
plt.title("Sine Wave + Linear Data")
plt.xlabel("x")
plt.ylabel("y")
plt.grid(True)

plt.tight_layout()
plt.show()

# Calculate Pearson and Spearman correlations
pearson_corr_linear, _ = pearsonr(x, y_linear)
spearman_corr_linear, _ = spearmanr(x, y_linear)

pearson_corr_sin_wave, _ = pearsonr(x, y_sin_wave)
spearman_corr_sin_wave, _ = spearmanr(x, y_sin_wave)

print("Linear Data (1 to 100):")
print(f"Pearson Correlation Coefficient: {pearson_corr_linear:.2f}")
print(f"Spearman's Rank Correlation: {spearman_corr_linear:.2f}")
print()

print("Sine Wave + Linear Data:")
print(f"Pearson Correlation Coefficient: {pearson_corr_sin_wave:.2f}")
print(f"Spearman's Rank Correlation: {spearman_corr_sin_wave:.2f}")


In [None]:
from scipy.stats import spearmanr

# Example data: x vs. y (non-linear, monotonic relationship)
x = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
y = np.array([2, 4, 8, 16, 32, 64, 128, 256, 512, 1024])

# Plot the data
plt.scatter(x, y)
plt.title("Original Data")
plt.xlabel("x")
plt.ylabel("y")
plt.show()


# Calculate Correlation
correlation_coefficient, p_value = pearsonr(x, y)
spearman_corr, p_value = spearmanr(x, y)


print(f"Pearson Correlation Coefficient (log-transformed): {correlation_coefficient:.2f}")
print(f"P-value: {p_value:.2f}")

print(f"Spearman's Rank Correlation: {spearman_corr:.2f}")
print(f"P-value: {p_value:.2f}")


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import pearsonr

# Example data: x vs. y (non-linear relationship)
x = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
y = np.array([2, 4, 8, 16, 32, 64, 128, 256, 512, 1024])

# Original data plot
plt.scatter(x, y)
plt.title("Original Data")
plt.xlabel("x")
plt.ylabel("y")
plt.show()

# Apply a logarithmic transformation to y
log_y = np.log(y)

# Transformed data plot
plt.scatter(x, log_y)
plt.title("Log-Transformed Data")
plt.xlabel("x")
plt.ylabel("log(y)")
plt.show()

# Calculate Pearson Correlation on transformed data
correlation_coefficient, p_value = pearsonr(x, y)

print(f"Pearson Correlation Coefficient (log-transformed): {correlation_coefficient:.2f}")
print(f"P-value: {p_value:.2f}")


In [None]:
a = (1,2,3)

def funca():
    return a

e,b = funca()
print(e,b)

In [None]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import polars as pl

from sklearn.model_selection import StratifiedGroupKFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import VotingClassifier

from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

import lightgbm as lgb
import catboost as cb
import xgboost as xgb

import optuna

In [None]:
import numpy as np
from scipy.stats import pearsonr

# Example data: hours studied vs. exam scores
hours_studied = np.array([1, 2, 3, 4, 3.5])
exam_scores = np.array([60, 70, 75, 85, 90])

# Calculate Pearson Correlation Coefficient
correlation_coefficient, p_value = pearsonr(hours_studied, exam_scores)

print(f"Pearson Correlation Coefficient: {correlation_coefficient:.2f}")
print(f"P-value: {p_value:.2f}")


In [None]:
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

# Generate a sample of data
np.random.seed(42)
sample_data = np.random.normal(loc=0, scale=1, size=100)  # A sample from a normal distribution

# Perform the K-S test against a normal distribution
D, p_value = stats.kstest(sample_data, 'norm', args=(0, 1))

print(f"KS Statistic: {D}")
print(f"P-value: {p_value}")

# Interpretation
if p_value < 0.05:
    print("Reject the null hypothesis: The sample does not follow a normal distribution.")
else:
    print("Fail to reject the null hypothesis: The sample follows a normal distribution.")

# Plotting the empirical CDF vs the theoretical CDF
ecdf = np.sort(sample_data)
cdf = np.arange(1, len(ecdf) + 1) / len(ecdf)
plt.plot(ecdf, cdf, marker='.', linestyle='none', label='Empirical CDF')

# Theoretical CDF for the normal distribution
x = np.linspace(min(ecdf), max(ecdf), 100)
plt.plot(x, stats.norm.cdf(x, loc=0, scale=1), label='Theoretical CDF (Normal Dist)')

plt.legend()
plt.xlabel('Value')
plt.ylabel('CDF')
plt.title('Empirical CDF vs Theoretical CDF')
plt.show()


In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load dataset
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize individual models
model1 = LogisticRegression()
model2 = DecisionTreeClassifier()
model3 = SVC(probability=True)

# Initialize Voting Classifier
voting_clf = VotingClassifier(estimators=[
    ('lr', model1), ('dt', model2), ('svc', model3)],
    voting='soft',
    weights=[0.3, 0.3, 0.3])  # Use 'hard' for hard voting

# Train and predict
voting_clf.fit(X_train, y_train)
y_pred = voting_clf.predict(X_test)

# Evaluate accuracy
display(y_test)
display(y_pred)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")



In [None]:
!free -h

In [None]:
root = Path('/kaggle/input/isic-2024-challenge')

train_path = '/root/Development/Kaggle/ISIC2024/data/raw/train-metadata.csv'
test_path = '/root/Development/Kaggle/ISIC2024/data/raw/test-metadata.csv'
subm_path = '/root/Development/Kaggle/ISIC2024/data/raw/sample_submission.csv'

id_col = 'isic_id'
target_col = 'target'
group_col = 'patient_id'

err = 1e-5
sampling_ratio = 0.01
seed = 42

num_cols = [
    'age_approx',                        # Approximate age of patient at time of imaging.
    'clin_size_long_diam_mm',            # Maximum diameter of the lesion (mm).+
    'tbp_lv_A',                          # A inside  lesion.+
    'tbp_lv_Aext',                       # A outside lesion.+
    'tbp_lv_B',                          # B inside  lesion.+
    'tbp_lv_Bext',                       # B outside lesion.+ 
    'tbp_lv_C',                          # Chroma inside  lesion.+
    'tbp_lv_Cext',                       # Chroma outside lesion.+
    'tbp_lv_H',                          # Hue inside the lesion; calculated as the angle of A* and B* in LAB* color space. Typical values range from 25 (red) to 75 (brown).+
    'tbp_lv_Hext',                       # Hue outside lesion.+
    'tbp_lv_L',                          # L inside lesion.+
    'tbp_lv_Lext',                       # L outside lesion.+
    'tbp_lv_areaMM2',                    # Area of lesion (mm^2).+
    'tbp_lv_area_perim_ratio',           # Border jaggedness, the ratio between lesions perimeter and area. Circular lesions will have low values; irregular shaped lesions will have higher values. Values range 0-10.+
    'tbp_lv_color_std_mean',             # Color irregularity, calculated as the variance of colors within the lesion's boundary.
    'tbp_lv_deltaA',                     # Average A contrast (inside vs. outside lesion).+
    'tbp_lv_deltaB',                     # Average B contrast (inside vs. outside lesion).+
    'tbp_lv_deltaL',                     # Average L contrast (inside vs. outside lesion).+
    'tbp_lv_deltaLB',                    #
    'tbp_lv_deltaLBnorm',                # Contrast between the lesion and its immediate surrounding skin. Low contrast lesions tend to be faintly visible such as freckles; high contrast lesions tend to be those with darker pigment. Calculated as the average delta LB of the lesion relative to its immediate background in LAB* color space. Typical values range from 5.5 to 25.+
    'tbp_lv_eccentricity',               # Eccentricity.+
    'tbp_lv_minorAxisMM',                # Smallest lesion diameter (mm).+
    'tbp_lv_nevi_confidence',            # Nevus confidence score (0-100 scale) is a convolutional neural network classifier estimated probability that the lesion is a nevus. The neural network was trained on approximately 57,000 lesions that were classified and labeled by a dermatologist.+,++
    'tbp_lv_norm_border',                # Border irregularity (0-10 scale); the normalized average of border jaggedness and asymmetry.+
    'tbp_lv_norm_color',                 # Color variation (0-10 scale); the normalized average of color asymmetry and color irregularity.+
    'tbp_lv_perimeterMM',                # Perimeter of lesion (mm).+
    'tbp_lv_radial_color_std_max',       # Color asymmetry, a measure of asymmetry of the spatial distribution of color within the lesion. This score is calculated by looking at the average standard deviation in LAB* color space within concentric rings originating from the lesion center. Values range 0-10.+
    'tbp_lv_stdL',                       # Standard deviation of L inside  lesion.+
    'tbp_lv_stdLExt',                    # Standard deviation of L outside lesion.+
    'tbp_lv_symm_2axis',                 # Border asymmetry; a measure of asymmetry of the lesion's contour about an axis perpendicular to the lesion's most symmetric axis. Lesions with two axes of symmetry will therefore have low scores (more symmetric), while lesions with only one or zero axes of symmetry will have higher scores (less symmetric). This score is calculated by comparing opposite halves of the lesion contour over many degrees of rotation. The angle where the halves are most similar identifies the principal axis of symmetry, while the second axis of symmetry is perpendicular to the principal axis. Border asymmetry is reported as the asymmetry value about this second axis. Values range 0-10.+
    'tbp_lv_symm_2axis_angle',           # Lesion border asymmetry angle.+
    'tbp_lv_x',                          # X-coordinate of the lesion on 3D TBP.+
    'tbp_lv_y',                          # Y-coordinate of the lesion on 3D TBP.+
    'tbp_lv_z',                          # Z-coordinate of the lesion on 3D TBP.+
]

new_num_cols = [
    'lesion_size_ratio',             # tbp_lv_minorAxisMM      / clin_size_long_diam_mm
    'lesion_shape_index',            # tbp_lv_areaMM2          / tbp_lv_perimeterMM **2
    'hue_contrast',                  # tbp_lv_H                - tbp_lv_Hext              abs
    'luminance_contrast',            # tbp_lv_L                - tbp_lv_Lext              abs
    'lesion_color_difference',       # tbp_lv_deltaA **2       + tbp_lv_deltaB **2 + tbp_lv_deltaL **2  sqrt  
    'border_complexity',             # tbp_lv_norm_border      + tbp_lv_symm_2axis
    'color_uniformity',              # tbp_lv_color_std_mean   / tbp_lv_radial_color_std_max

    'position_distance_3d',          # tbp_lv_x **2 + tbp_lv_y **2 + tbp_lv_z **2  sqrt
    'perimeter_to_area_ratio',       # tbp_lv_perimeterMM      / tbp_lv_areaMM2
    'area_to_perimeter_ratio',       # tbp_lv_areaMM2          / tbp_lv_perimeterMM
    'lesion_visibility_score',       # tbp_lv_deltaLBnorm      + tbp_lv_norm_color
    'symmetry_border_consistency',   # tbp_lv_symm_2axis       * tbp_lv_norm_border
    'consistency_symmetry_border',   # tbp_lv_symm_2axis       * tbp_lv_norm_border / (tbp_lv_symm_2axis + tbp_lv_norm_border)

    'color_consistency',             # tbp_lv_stdL             / tbp_lv_Lext
    'consistency_color',             # tbp_lv_stdL*tbp_lv_Lext / tbp_lv_stdL + tbp_lv_Lext
    'size_age_interaction',          # clin_size_long_diam_mm  * age_approx
    'hue_color_std_interaction',     # tbp_lv_H                * tbp_lv_color_std_mean
    'lesion_severity_index',         # tbp_lv_norm_border      + tbp_lv_norm_color + tbp_lv_eccentricity / 3
    'shape_complexity_index',        # border_complexity       + lesion_shape_index
    'color_contrast_index',          # tbp_lv_deltaA + tbp_lv_deltaB + tbp_lv_deltaL + tbp_lv_deltaLBnorm

    'log_lesion_area',               # tbp_lv_areaMM2          + 1  np.log
    'normalized_lesion_size',        # clin_size_long_diam_mm  / age_approx
    'mean_hue_difference',           # tbp_lv_H                + tbp_lv_Hext    / 2
    'std_dev_contrast',              # tbp_lv_deltaA **2 + tbp_lv_deltaB **2 + tbp_lv_deltaL **2   / 3  np.sqrt
    'color_shape_composite_index',   # tbp_lv_color_std_mean   + bp_lv_area_perim_ratio + tbp_lv_symm_2axis   / 3
    'lesion_orientation_3d',         # tbp_lv_y                , tbp_lv_x  np.arctan2
    'overall_color_difference',      # tbp_lv_deltaA           + tbp_lv_deltaB + tbp_lv_deltaL   / 3

    'symmetry_perimeter_interaction',# tbp_lv_symm_2axis       * tbp_lv_perimeterMM
    'comprehensive_lesion_index',    # tbp_lv_area_perim_ratio + tbp_lv_eccentricity + bp_lv_norm_color + tbp_lv_symm_2axis   / 4
    'color_variance_ratio',          # tbp_lv_color_std_mean   / tbp_lv_stdLExt
    'border_color_interaction',      # tbp_lv_norm_border      * tbp_lv_norm_color
    'border_color_interaction_2',
    'size_color_contrast_ratio',     # clin_size_long_diam_mm  / tbp_lv_deltaLBnorm
    'age_normalized_nevi_confidence',# tbp_lv_nevi_confidence  / age_approx
    'age_normalized_nevi_confidence_2',
    'color_asymmetry_index',         # tbp_lv_symm_2axis       * tbp_lv_radial_color_std_max

    'volume_approximation_3d',       # tbp_lv_areaMM2          * sqrt(tbp_lv_x**2 + tbp_lv_y**2 + tbp_lv_z**2)
    'color_range',                   # abs(tbp_lv_L - tbp_lv_Lext) + abs(tbp_lv_A - tbp_lv_Aext) + abs(tbp_lv_B - tbp_lv_Bext)
    'shape_color_consistency',       # tbp_lv_eccentricity     * tbp_lv_color_std_mean
    'border_length_ratio',           # tbp_lv_perimeterMM      / pi * sqrt(tbp_lv_areaMM2 / pi)
    'age_size_symmetry_index',       # age_approx              * clin_size_long_diam_mm * tbp_lv_symm_2axis
    'index_age_size_symmetry',       # age_approx              * tbp_lv_areaMM2 * tbp_lv_symm_2axis
]

cat_cols = ['sex', 'anatom_site_general', 'tbp_tile_type', 'tbp_lv_location', 'tbp_lv_location_simple', 'attribution']
norm_cols = [f'{col}_patient_norm' for col in num_cols + new_num_cols]
special_cols = ['count_per_patient']
feature_cols = num_cols + new_num_cols + cat_cols + norm_cols + special_cols

In [None]:
def read_data(path):
    return (
        # pl.read_csv(path, n_rows=10000)
        pl.read_csv(path)
        .with_columns(
            pl.col('age_approx').cast(pl.String).replace('NA', np.nan).cast(pl.Float64),
        )
        .with_columns(
            pl.col(pl.Float64).fill_nan(pl.col(pl.Float64).median()), # You may want to impute test data with train
        )
        .with_columns(
            lesion_size_ratio              = pl.col('tbp_lv_minorAxisMM') / pl.col('clin_size_long_diam_mm'),
            lesion_shape_index             = pl.col('tbp_lv_areaMM2') / (pl.col('tbp_lv_perimeterMM') ** 2),
            hue_contrast                   = (pl.col('tbp_lv_H') - pl.col('tbp_lv_Hext')).abs(),
            luminance_contrast             = (pl.col('tbp_lv_L') - pl.col('tbp_lv_Lext')).abs(),
            lesion_color_difference        = (pl.col('tbp_lv_deltaA') ** 2 + pl.col('tbp_lv_deltaB') ** 2 + pl.col('tbp_lv_deltaL') ** 2).sqrt(),
            border_complexity              = pl.col('tbp_lv_norm_border') + pl.col('tbp_lv_symm_2axis'),
            color_uniformity               = pl.col('tbp_lv_color_std_mean') / (pl.col('tbp_lv_radial_color_std_max') + err),
        )
        .with_columns(
            position_distance_3d           = (pl.col('tbp_lv_x') ** 2 + pl.col('tbp_lv_y') ** 2 + pl.col('tbp_lv_z') ** 2).sqrt(),
            perimeter_to_area_ratio        = pl.col('tbp_lv_perimeterMM') / pl.col('tbp_lv_areaMM2'),
            area_to_perimeter_ratio        = pl.col('tbp_lv_areaMM2') / pl.col('tbp_lv_perimeterMM'),
            lesion_visibility_score        = pl.col('tbp_lv_deltaLBnorm') + pl.col('tbp_lv_norm_color'),
            combined_anatomical_site       = pl.col('anatom_site_general') + '_' + pl.col('tbp_lv_location'),
            symmetry_border_consistency    = pl.col('tbp_lv_symm_2axis') * pl.col('tbp_lv_norm_border'),
            consistency_symmetry_border    = pl.col('tbp_lv_symm_2axis') * pl.col('tbp_lv_norm_border') / (pl.col('tbp_lv_symm_2axis') + pl.col('tbp_lv_norm_border')),
        )
        .with_columns(
            color_consistency              = pl.col('tbp_lv_stdL') / pl.col('tbp_lv_Lext'),
            consistency_color              = pl.col('tbp_lv_stdL') * pl.col('tbp_lv_Lext') / (pl.col('tbp_lv_stdL') + pl.col('tbp_lv_Lext')),
            size_age_interaction           = pl.col('clin_size_long_diam_mm') * pl.col('age_approx'),
            hue_color_std_interaction      = pl.col('tbp_lv_H') * pl.col('tbp_lv_color_std_mean'),
            lesion_severity_index          = (pl.col('tbp_lv_norm_border') + pl.col('tbp_lv_norm_color') + pl.col('tbp_lv_eccentricity')) / 3,
            shape_complexity_index         = pl.col('border_complexity') + pl.col('lesion_shape_index'),
            color_contrast_index           = pl.col('tbp_lv_deltaA') + pl.col('tbp_lv_deltaB') + pl.col('tbp_lv_deltaL') + pl.col('tbp_lv_deltaLBnorm'),
        )
        .with_columns(
            log_lesion_area                = (pl.col('tbp_lv_areaMM2') + 1).log(),
            normalized_lesion_size         = pl.col('clin_size_long_diam_mm') / pl.col('age_approx'),
            mean_hue_difference            = (pl.col('tbp_lv_H') + pl.col('tbp_lv_Hext')) / 2,
            std_dev_contrast               = ((pl.col('tbp_lv_deltaA') ** 2 + pl.col('tbp_lv_deltaB') ** 2 + pl.col('tbp_lv_deltaL') ** 2) / 3).sqrt(),
            color_shape_composite_index    = (pl.col('tbp_lv_color_std_mean') + pl.col('tbp_lv_area_perim_ratio') + pl.col('tbp_lv_symm_2axis')) / 3,
            lesion_orientation_3d          = pl.arctan2(pl.col('tbp_lv_y'), pl.col('tbp_lv_x')),
            overall_color_difference       = (pl.col('tbp_lv_deltaA') + pl.col('tbp_lv_deltaB') + pl.col('tbp_lv_deltaL')) / 3,
        )
        .with_columns(
            symmetry_perimeter_interaction = pl.col('tbp_lv_symm_2axis') * pl.col('tbp_lv_perimeterMM'),
            comprehensive_lesion_index     = (pl.col('tbp_lv_area_perim_ratio') + pl.col('tbp_lv_eccentricity') + pl.col('tbp_lv_norm_color') + pl.col('tbp_lv_symm_2axis')) / 4,
            color_variance_ratio           = pl.col('tbp_lv_color_std_mean') / pl.col('tbp_lv_stdLExt'),
            border_color_interaction       = pl.col('tbp_lv_norm_border') * pl.col('tbp_lv_norm_color'),
            border_color_interaction_2     = pl.col('tbp_lv_norm_border') * pl.col('tbp_lv_norm_color') / (pl.col('tbp_lv_norm_border') + pl.col('tbp_lv_norm_color')),
            size_color_contrast_ratio      = pl.col('clin_size_long_diam_mm') / pl.col('tbp_lv_deltaLBnorm'),
            age_normalized_nevi_confidence = pl.col('tbp_lv_nevi_confidence') / pl.col('age_approx'),
            age_normalized_nevi_confidence_2 = (pl.col('clin_size_long_diam_mm')**2 + pl.col('age_approx')**2).sqrt(),
            color_asymmetry_index          = pl.col('tbp_lv_radial_color_std_max') * pl.col('tbp_lv_symm_2axis'),
        )
        .with_columns(
            volume_approximation_3d        = pl.col('tbp_lv_areaMM2') * (pl.col('tbp_lv_x')**2 + pl.col('tbp_lv_y')**2 + pl.col('tbp_lv_z')**2).sqrt(),
            color_range                    = (pl.col('tbp_lv_L') - pl.col('tbp_lv_Lext')).abs() + (pl.col('tbp_lv_A') - pl.col('tbp_lv_Aext')).abs() + (pl.col('tbp_lv_B') - pl.col('tbp_lv_Bext')).abs(),
            shape_color_consistency        = pl.col('tbp_lv_eccentricity') * pl.col('tbp_lv_color_std_mean'),
            border_length_ratio            = pl.col('tbp_lv_perimeterMM') / (2 * np.pi * (pl.col('tbp_lv_areaMM2') / np.pi).sqrt()),
            age_size_symmetry_index        = pl.col('age_approx') * pl.col('clin_size_long_diam_mm') * pl.col('tbp_lv_symm_2axis'),
            index_age_size_symmetry        = pl.col('age_approx') * pl.col('tbp_lv_areaMM2') * pl.col('tbp_lv_symm_2axis'),
        )
        # .with_columns(
        #     ((pl.col(col) - pl.col(col).mean().over('patient_id')) / (pl.col(col).std().over('patient_id') + err)).alias(f'{col}_patient_norm') for col in (num_cols + new_num_cols)
        # )
        .with_columns(
            count_per_patient = pl.col('isic_id').count().over('patient_id'),
        )
        .with_columns(
            pl.col(cat_cols).cast(pl.Categorical),
        )
        .to_pandas()
        .set_index(id_col)
    )

In [None]:
def preprocess(df_train, df_test):
    global cat_cols
    
    encoder = OneHotEncoder(sparse_output=False, dtype=np.int32, handle_unknown='ignore')
    encoder.fit(df_train[cat_cols])
    
    new_cat_cols = [f'onehot_{i}' for i in range(len(encoder.get_feature_names_out()))]

    df_train[new_cat_cols] = encoder.transform(df_train[cat_cols])
    df_train[new_cat_cols] = df_train[new_cat_cols].astype('category')

    df_test[new_cat_cols] = encoder.transform(df_test[cat_cols])
    df_test[new_cat_cols] = df_test[new_cat_cols].astype('category')

    for col in cat_cols:
        feature_cols.remove(col)

    feature_cols.extend(new_cat_cols)
    cat_cols = new_cat_cols
    
    return df_train, df_test

In [None]:
def custom_metric(estimator, X, y_true):
    y_hat = estimator.predict_proba(X)[:, 1]
    min_tpr = 0.80
    max_fpr = abs(1 - min_tpr)
    
    v_gt = abs(y_true - 1)
    v_pred = np.array([1.0 - x for x in y_hat])
    
    partial_auc_scaled = roc_auc_score(v_gt, v_pred, max_fpr=max_fpr)
    partial_auc = 0.5 * max_fpr**2 + (max_fpr - 0.5 * max_fpr**2) / (1.0 - 0.5) * (partial_auc_scaled - 0.5)
    
    return partial_auc

In [None]:
df_train = read_data(train_path)
new_columns = {}
for col in num_cols + new_num_cols:
    patient_mean = df_train.groupby('patient_id')[col].transform('mean')
    patient_std = df_train.groupby('patient_id')[col].transform('std')
    # Store the normalized column in the dictionary
    new_columns[f'{col}_patient_norm'] = (df_train[col] - patient_mean) / (patient_std + err)
df_train = pd.concat([df_train, pd.DataFrame(new_columns)], axis=1)


df_test = read_data(test_path)
new_columns = {}
for col in num_cols + new_num_cols:
    patient_mean_test = df_test.groupby('patient_id')[col].transform('mean')
    patient_std_test = df_test.groupby('patient_id')[col].transform('std')
    # Store the normalized column in the dictionary
    new_columns[f'{col}_patient_norm'] = (df_test[col] - patient_mean_test) / (patient_std_test + err)
df_test = pd.concat([df_test, pd.DataFrame(new_columns)], axis=1)

# .with_columns(
        #     ((pl.col(col) - pl.col(col).mean().over('patient_id')) / (pl.col(col).std().over('patient_id') + err)).alias(f'{col}_patient_norm') for col in (num_cols + new_num_cols)
        # )


df_subm = pd.read_csv(subm_path, index_col=id_col)

df_train, df_test = preprocess(df_train, df_test)

In [None]:
display(df_train)
display(df_test)