In [2]:
import numpy as np
import os
import optuna
from optuna.integration import TFKerasPruningCallback
from optuna.pruners import HyperbandPruner
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from scikeras.wrappers import KerasRegressor
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping

from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split,StratifiedKFold, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.metrics import accuracy_score


df_train = pd.read_csv("./train.csv")
df_test = pd.read_csv("./test.csv")
X_train = df_train.drop(columns=['Price'])
y_train = df_train['Price']
print(X_train)
print(y_train)
X_test= df_test
print(df_train.dtypes)
print(df_train.isnull().sum())

X_train['Compartments'].fillna(X_train['Compartments'].median(), inplace=True)
X_train['Weight Capacity (kg)'].fillna(X_train['Weight Capacity (kg)'].median(), inplace=True)
def impute_categorical_uniform(df, columns):
    """
    Imputes missing values in a categorical column using a uniform distribution.
    
    Args:
    df (pd.DataFrame): Input dataframe.
    column (str): Column name to impute.

    Returns:
    pd.DataFrame: Dataframe with imputed values.
    """
    df_copy = df.copy()
    for column in columns:
        # Find unique categories (excluding NaN)
        
        # Find unique categories (excluding NaN)
        #unique_categories = df[column].dropna().unique()
    
    # Count missing values
        num_missing = df[column].isna().sum()
    
    # If there are missing values, sample from the unique categories
        if num_missing > 0:


                # Sample based on observed category probabilities
            category_probs = df_copy[column].value_counts(normalize=True)
            imputed_values = np.random.choice(category_probs.index, size=num_missing, p=category_probs.values)
            df_copy.loc[df_copy[column].isna(), column] = imputed_values
    
    return df_copy




X_train=impute_categorical_uniform(X_train, columns=['Brand', 'Material', 'Laptop Compartment', 'Waterproof', 'Style', 'Color', 'Size'])



def detect_outliers_iqr(df, columns):
    """
    Detects outliers using the IQR method.
    
    Args:
    df (pd.DataFrame): Input dataframe.
    columns (list): List of numerical columns to check.

    Returns:
    pd.DataFrame: DataFrame with outlier information.
    """
    df_outliers = df.copy()
    
    for col in columns:
        Q1 = df_outliers[col].quantile(0.25)
        Q3 = df_outliers[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        
        # Detect outliers
        outliers = df_outliers[(df_outliers[col] < lower_bound) | (df_outliers[col] > upper_bound)]
        print(f"{col}: {len(outliers)} outliers detected.")
    
    return df_outliers

# Detect outliers in numerical columns
numerical_cols = ['Compartments', 'Weight Capacity (kg)']

detect_outliers_iqr(X_train, numerical_cols)

detect_outliers_iqr(pd.DataFrame(y_train, columns=['Price']), ['Price'])
# Initialize Isolation Forest
# iso_forest = IsolationForest(contamination=0.05, random_state=42)

# # Fit on training data (excluding target variable)
# outliers = iso_forest.fit_predict(X_train)  # Returns 1 (normal) or -1 (outlier)

# # Remove detected outliers
# X_train_filtered = X_train[outliers == 1]
# y_train_filtered = y_train[outliers == 1]







scaler = StandardScaler()
columns_to_standardize = ['Compartments', 'Weight Capacity (kg)'] 
X_train[columns_to_standardize] = scaler.fit_transform(X_train[columns_to_standardize])
X_train.drop('Color',axis=1,inplace=True)

encoder = OneHotEncoder(sparse_output=False, drop='first')
X_encoded = encoder.fit_transform(X_train[['Brand', 'Material', 'Laptop Compartment', 'Waterproof', 'Style']])

# Convert to a DataFrame
encoded_df = pd.DataFrame(X_encoded, columns=encoder.get_feature_names_out())

# Drop original categorical columns from X_train
X_train = X_train.drop(columns=['Brand', 'Material', 'Laptop Compartment', 'Waterproof', 'Style'])
# Merge the transformed data back
X_train = pd.concat([X_train, encoded_df], axis=1)
encoder2 = LabelEncoder()
X_train['Size'] = encoder2.fit_transform(X_train['Size'])
print(X_train)


  from .autonotebook import tqdm as notebook_tqdm


            id         Brand Material    Size  Compartments  \
0            0      Jansport  Leather  Medium           7.0   
1            1      Jansport   Canvas   Small          10.0   
2            2  Under Armour  Leather   Small           2.0   
3            3          Nike    Nylon   Small           8.0   
4            4        Adidas   Canvas  Medium           1.0   
...        ...           ...      ...     ...           ...   
299995  299995        Adidas  Leather   Small           9.0   
299996  299996      Jansport  Leather   Large           6.0   
299997  299997          Puma   Canvas   Large           9.0   
299998  299998        Adidas    Nylon   Small           1.0   
299999  299999  Under Armour   Canvas   Small           2.0   

       Laptop Compartment Waterproof      Style  Color  Weight Capacity (kg)  
0                     Yes         No       Tote  Black             11.611723  
1                     Yes        Yes  Messenger  Green             27.078537  
2     

In [42]:

from tensorflow.keras import mixed_precision
from tensorflow.keras import regularizers

#mixed_precision.set_global_policy('mixed_float16')
print(tf.test.is_built_with_cuda())  # Should return True
print(tf.test.is_gpu_available())
tf.config.optimizer.set_jit(False)


tf.keras.mixed_precision.set_global_policy('float32')
gpus = tf.config.list_physical_devices('GPU')
AUTOTUNE = tf.data.AUTOTUNE

if gpus:
    try:
        # Set TensorFlow to use only the first GPU
        tf.config.experimental.set_memory_growth(gpus[0], True)
        print("GPU enabled:", gpus[0])
    except RuntimeError as e:
        print(e)

# Define K for K-Fold Cross-Validation
K = 5  # Number of folds
kf = KFold(n_splits=K, shuffle=True, random_state=42)  # Ensure randomness

# Store RMSE for each fold
val_rmse_scores = []

# Loop through K-folds
for fold, (train_idx, val_idx) in enumerate(kf.split(X_train)):  # Assuming X_train, y_train are defined
    print(f"\n Training Fold {fold+1}/{K}...")
    
    X_train_sub, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_train_sub, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
    train_dataset = tf.data.Dataset.from_tensor_slices((X_train_sub, y_train_sub))
    train_dataset = train_dataset.batch(246).prefetch(tf.data.AUTOTUNE)
    val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val)).batch(256).prefetch(tf.data.AUTOTUNE)
   


    model = keras.Sequential([
         keras.layers.Input(shape=(X_train_sub.shape[1],)),
        #  keras.layers.Dense(512,activation='relu',kernel_regularizer=regularizers.l2(0.005)),
        #  keras.layers.Dropout(0.1),
        #  keras.layers.Dense(256,activation='relu',kernel_regularizer=regularizers.l2(0.005)),
        #  keras.layers.Dropout(0.1),
          keras.layers.Dense(128,activation='relu',kernel_regularizer=regularizers.l2(0.01)),
        #  keras.layers.BatchNormalization(),
          #keras.layers.Dropout(0.1),
         keras.layers.Dense(64, activation='relu',kernel_regularizer=regularizers.l2(0.01)),
         #keras.layers.BatchNormalization(),
         #keras.layers.Dropout(0.1),
         keras.layers.Dense(32, activation='relu',kernel_regularizer=regularizers.l2(0.01)),
         #keras.layers.BatchNormalization(), 
         #keras.layers.Dropout(0.3),
         keras.layers.Dense(1, activation='linear')
                                                      ])  # 

    optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
    model.compile(optimizer=optimizer, loss='mse', metrics=[keras.metrics.RootMeanSquaredError()])
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
# pruning_callback = TFKerasPruningCallback(trial, "val_root_mean_squared_error")

    history = model.fit(train_dataset, 
                        epochs=100, 
                        verbose=1, validation_data=val_dataset)

    # Return validation RMSE as the score to minimize
    val_rmse = history.history['val_root_mean_squared_error'][-1]  # Last epoch RMSE
    val_rmse_scores.append(val_rmse)  # Store RMSE score
    print(f"✅ Fold {fold+1} RMSE: {val_rmse:.4f}")

    


False
True
GPU enabled: PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')

 Training Fold 1/5...


2025-02-11 13:09:18.119401: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-02-11 13:09:18.119513: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Epoch 1/100
[1m976/976[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 26ms/step - loss: 7658.2139 - root_mean_squared_error: 82.0833 - val_loss: 3454.4626 - val_root_mean_squared_error: 58.7685
Epoch 2/100
[1m976/976[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 23ms/step - loss: 2200.7046 - root_mean_squared_error: 46.1315 - val_loss: 4063.4058 - val_root_mean_squared_error: 63.7408
Epoch 3/100
[1m976/976[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 23ms/step - loss: 2085.4443 - root_mean_squared_error: 45.1327 - val_loss: 3957.6792 - val_root_mean_squared_error: 62.9063
Epoch 4/100
[1m976/976[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 23ms/step - loss: 2305.1938 - root_mean_squared_error: 47.1961 - val_loss: 3629.2712 - val_root_mean_squared_error: 60.2396
Epoch 5/100
[1m976/976[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 23ms/step - loss: 2141.7107 - root_mean_squared_error: 45.7055 - val_loss: 4442.4336 - val_root_mean_squared_

KeyboardInterrupt: 