In [271]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

In [272]:
train_backpack = pd.read_csv('./data/train.csv')
extra_backpack = pd.read_csv('./data/training_extra.csv')
test_backpack = pd.read_csv('./data/test.csv')

In [273]:
backpack = pd.concat([train_backpack, extra_backpack])
backpack.drop('id', axis=1, inplace=True)
test_backpack.drop('id', axis=1, inplace=True)

In [274]:
backpack.columns = backpack.columns.str.replace(" ", "_")
test_backpack.columns = test_backpack.columns.str.replace(" ", "_")

In [275]:
backpack.head()

Unnamed: 0,Brand,Material,Size,Compartments,Laptop_Compartment,Waterproof,Style,Color,Weight_Capacity_(kg),Price
0,Jansport,Leather,Medium,7.0,Yes,No,Tote,Black,11.611723,112.15875
1,Jansport,Canvas,Small,10.0,Yes,Yes,Messenger,Green,27.078537,68.88056
2,Under Armour,Leather,Small,2.0,Yes,No,Messenger,Red,16.64376,39.1732
3,Nike,Nylon,Small,8.0,Yes,No,Messenger,Green,12.93722,80.60793
4,Adidas,Canvas,Medium,1.0,Yes,Yes,Messenger,Green,17.749338,86.02312


# Treating Missing Value

In [276]:
backpack.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3994318 entries, 0 to 3694317
Data columns (total 10 columns):
 #   Column                Dtype  
---  ------                -----  
 0   Brand                 object 
 1   Material              object 
 2   Size                  object 
 3   Compartments          float64
 4   Laptop_Compartment    object 
 5   Waterproof            object 
 6   Style                 object 
 7   Color                 object 
 8   Weight_Capacity_(kg)  float64
 9   Price                 float64
dtypes: float64(3), object(7)
memory usage: 335.2+ MB


## Delete Rows with 3 Nan value and More

In [277]:
threshold = 3
mask = backpack.isna().sum(axis=1) >= threshold
filtered_backpack = backpack[mask]

In [278]:
clean_backpack1 = backpack[~mask]

## Remove Missing Value for Nominal Categories

In [279]:
nominal_col = ['Brand', 'Material', 'Style', 'Color']
clean_backpack2 = clean_backpack1.dropna(subset=nominal_col)

In [280]:
clean_backpack2.isna().sum()

Brand                       0
Material                    0
Size                    80598
Compartments                0
Laptop_Compartment      88337
Waterproof              87073
Style                       0
Color                       0
Weight_Capacity_(kg)      387
Price                       0
dtype: int64

## Missing Value Imputation

In [281]:
def get_labels_dict(data, col):
    labels = data[col].sort_values(ascending=False).unique().tolist()
    label_dic = {k:i for i, k in enumerate(labels)}
    if np.nan in label_dic.keys():
        label_dic[np.nan] = np.nan
    return label_dic

In [282]:
def reverse_dic(dic):
    return {v: k for k, v in dic.items()}

In [283]:
from sklearn.impute import KNNImputer

def column_imputation(data, to_impute_obj_col, to_impute_num_col):
    before_backpack = data.copy()
    
    # Create dictionaries dynamically for obj col
    label_dicts = {col: get_labels_dict(before_backpack, col) for col in to_impute_obj_col}
    
    # Map ordinal categories using generated dictionaries
    for col in to_impute_obj_col:
        before_backpack[col] = before_backpack[col].map(label_dicts[col])
            
    # Perform KNN Imputation
    imputer = KNNImputer(n_neighbors=3, weights='distance')
    imputed_backpack = imputer.fit_transform(before_backpack[to_impute_obj_col + to_impute_num_col])
    
    # Convert back to DataFrame
    imputed_df = pd.DataFrame(imputed_backpack, columns=to_impute_obj_col + to_impute_num_col, index=before_backpack.index)

    # Map back to original categorical values
    for col in to_impute_obj_col:
        imputed_df[col] = imputed_df[col].round().astype(int)
        imputed_df[col] = imputed_df[col].map(reverse_dic(label_dicts[col]))

    imputed_backpack = data.copy()
    imputed_backpack[to_impute_obj_col + to_impute_num_col] = round(imputed_df)
    
    return imputed_backpack

In [284]:
to_impute_obj_col = ['Size', 'Laptop_Compartment', 'Waterproof']
to_impute_num_col = ['Weight_Capacity_(kg)']

In [285]:
final_backpack = clean_backpack2.dropna().copy()

### Test Dataset Imputation

In [286]:
for col in test_backpack.columns.tolist():
    mod_value = final_backpack[col].mode()[0]
    test_backpack[col] = test_backpack[col].fillna(mod_value)

In [287]:
print(test_backpack.isnull().sum())

Brand                   0
Material                0
Size                    0
Compartments            0
Laptop_Compartment      0
Waterproof              0
Style                   0
Color                   0
Weight_Capacity_(kg)    0
dtype: int64


In [288]:
# imputed_test_backpack = column_imputation(test_backpack, to_impute_obj_col, to_impute_num_col)
imputed_test_backpack = test_backpack.copy()

In [289]:
imputed_test_backpack.isnull().sum()

Brand                   0
Material                0
Size                    0
Compartments            0
Laptop_Compartment      0
Waterproof              0
Style                   0
Color                   0
Weight_Capacity_(kg)    0
dtype: int64

## Add Total Revenue for each Brand in 2023

In [290]:
total_revenu = {
  "Adidas": 23.19,
  "Nike": 51.54,
  "Puma": 8.88,
  "Under Armour": 5.9,
  "Jansport": 10.5
}

In [291]:
final_backpack['total_revenue_2023'] = final_backpack['Brand'].map(total_revenu)
imputed_test_backpack['total_revenue_2023'] = imputed_test_backpack['Brand'].map(total_revenu)

In [292]:
# final_backpack['compartments_efficiency'] = final_backpack['Compartments'] / final_backpack['Weight_Capacity_(kg)']
# imputed_test_backpack['compartments_efficiency'] = imputed_test_backpack['Compartments'] / imputed_test_backpack['Weight_Capacity_(kg)']

In [293]:
final_backpack["Has_Laptop_Compartment_Waterproof"] = ((final_backpack["Laptop_Compartment"] == "Yes") & (final_backpack["Waterproof"] == "Yes")).astype(bool)
imputed_test_backpack["Has_Laptop_Compartment_Waterproof"] = ((imputed_test_backpack["Laptop_Compartment"] == "Yes") & (imputed_test_backpack["Waterproof"] == "Yes")).astype(bool)

In [294]:
material_durability = {"Canvas": 1, "Nylon": 2, "Polyester": 3, "Leather": 4}
final_backpack["Material_Durability_Score"] = final_backpack["Material"].map(material_durability)
imputed_test_backpack["Material_Durability_Score"] = imputed_test_backpack["Material"].map(material_durability)

In [295]:
weight_bins = [4, 12, 18, 24, 30]
weight_labels = ["Light", "Medium", "Heavy", "Very Heavy"]
final_backpack["Weight_Category"] = pd.cut(final_backpack["Weight_Capacity_(kg)"], bins=weight_bins, labels=weight_labels)
imputed_test_backpack["Weight_Category"] = pd.cut(imputed_test_backpack["Weight_Capacity_(kg)"], bins=weight_bins, labels=weight_labels)

In [296]:
mean_compartments_price = final_backpack.groupby('Compartments')['Price'].mean().to_dict()
final_backpack['mean_compartments_price'] = final_backpack['Compartments'].map(mean_compartments_price)
imputed_test_backpack['mean_compartments_price'] = imputed_test_backpack['Compartments'].map(mean_compartments_price)

In [297]:
mean_style_price = final_backpack.groupby('Style')['Price'].mean().to_dict()
final_backpack['mean_style_price'] = final_backpack['Style'].map(mean_style_price)
imputed_test_backpack['mean_style_price'] = imputed_test_backpack['Style'].map(mean_style_price)

# Feature Encoding

In [299]:
X = final_backpack.drop(['Price', 'Material', 'Weight_Category'], axis=1)
# 'Material', 'Color', 'Style', 'Compartments'
y = final_backpack['Price']

In [None]:
X.head()

In [300]:
ordinal_categories = {
    'Size': ['Small', 'Medium', 'Large'],
    # 'Weight_Category': ["Light", "Medium", "Heavy", 'Very Heavy'],
}

In [301]:
# num_col = X.select_dtypes(include='number').columns.tolist()
num_col = X.columns.tolist()

In [302]:
to_onehot_col = ['Laptop_Compartment', 'Waterproof']
# 'Has_Laptop_Compartment_Waterproof'
to_target_encode = ['Brand',  'Color', 'Style']
# ['Weight_Category', 'Color', 'Style', 'Material', 'Brand']

In [303]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler, TargetEncoder
from sklearn.compose import ColumnTransformer

transformer = ColumnTransformer(transformers=[
    ('onehot encoding', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'), to_onehot_col),
    ('target encoding', TargetEncoder(smooth=2), to_target_encode),
    ('ordinal', OrdinalEncoder(
            categories=[ordinal_categories[col] for col in ordinal_categories],
        ), list(ordinal_categories.keys())),
    # ('minmaxscaler', MinMaxScaler(), num_col)
]
    , remainder="passthrough", verbose_feature_names_out=False
).set_output(transform="pandas")

X_transformed = transformer.fit_transform(X, y)

In [304]:
num_cols = X_transformed.columns.tolist()

In [None]:
# def data_scaling(data): 
#     scaler = MinMaxScaler()
#     scaled_data = scaler.fit_transform(data)
#     return scaled_data

In [None]:
# scaled_data = data_scaling(X_transformed)

# Regression

In [305]:
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
import lightgbm as lgb
from xgboost import XGBRegressor
import datetime as dt
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.20, random_state=42)
lr = LinearRegression()
neigh = KNeighborsRegressor(n_neighbors=1)
# elastic_net = ElasticNet(alpha=1.0, l1_ratio=0.5)

In [306]:
dtrain = lgb.Dataset(X_train, label=y_train)
dtest = lgb.Dataset(X_test, label=y_test)

lgb_param = {
    'objective': 'regression',  # For regression tasks
    'metric': 'rmse',          # Use RMSE as the evaluation metric
    'boosting_type': 'gbdt',
    'learning_rate':0.2, 
    'max_depth': 16,
}

In [307]:
lr.fit(X_train, y_train)
# neigh.fit(X_train, y_train)
# elastic_net.fit(X_train, y_train)

In [308]:
start = dt.datetime.now()
clf = lgb.train(lgb_param, dtrain, 400)
end = dt.datetime.now()
elapsed = end - start

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 363
[LightGBM] [Info] Number of data points in the train set: 2630802, number of used features: 11
[LightGBM] [Info] Start training from score 81.478594


In [309]:
print(elapsed)

0:00:40.477723


In [319]:
model = XGBRegressor(
    max_depth=6,
    colsample_bytree=0.5,
    subsample=0.80,
    n_estimators=1000,
    learning_rate=0.013,
    min_child_weight=80,
    enable_categorical=True,
    early_stopping_rounds=800
)

In [320]:
start = dt.datetime.now()
model.fit(X_train, y_train, 
         eval_set=[(X_test, y_test)], 
         verbose=True
    )
end = dt.datetime.now()
elapsed2 = end - start
print(elapsed2)

[0]	validation_0-rmse:38.80763
[1]	validation_0-rmse:38.80684
[2]	validation_0-rmse:38.80511
[3]	validation_0-rmse:38.80454
[4]	validation_0-rmse:38.80404
[5]	validation_0-rmse:38.80341
[6]	validation_0-rmse:38.80193
[7]	validation_0-rmse:38.80042
[8]	validation_0-rmse:38.79916
[9]	validation_0-rmse:38.79839
[10]	validation_0-rmse:38.79788
[11]	validation_0-rmse:38.79684
[12]	validation_0-rmse:38.79587
[13]	validation_0-rmse:38.79527
[14]	validation_0-rmse:38.79415
[15]	validation_0-rmse:38.79328
[16]	validation_0-rmse:38.79305
[17]	validation_0-rmse:38.79270
[18]	validation_0-rmse:38.79176
[19]	validation_0-rmse:38.79161
[20]	validation_0-rmse:38.79088
[21]	validation_0-rmse:38.79069
[22]	validation_0-rmse:38.79028
[23]	validation_0-rmse:38.78945
[24]	validation_0-rmse:38.78934
[25]	validation_0-rmse:38.78864
[26]	validation_0-rmse:38.78837
[27]	validation_0-rmse:38.78804
[28]	validation_0-rmse:38.78727
[29]	validation_0-rmse:38.78717
[30]	validation_0-rmse:38.78712
[31]	validation_0-

In [312]:
y_pred = lr.predict(X_test)
y_pred2 = clf.predict(X_test) 
y_pred3 = model.predict(X_test)
# y_pred4 = neigh.predict(X_test)

In [313]:
from sklearn.metrics import root_mean_squared_error

rmse1 = root_mean_squared_error(y_test, y_pred)
rmse2 = root_mean_squared_error(y_test, y_pred2)
rmse3 = root_mean_squared_error(y_test, y_pred3)
# rmse4 = root_mean_squared_error(y_test, y_pred4)

In [314]:
print('Root Mean Squared Error : ', rmse1)
print('Root Mean Squared Error : ', rmse2)
print('Root Mean Squared Error : ', rmse3)
# print('Root Mean Squared Error : ', rmse4)

Root Mean Squared Error :  38.79192534343927
Root Mean Squared Error :  38.814119282694755
Root Mean Squared Error :  38.77613707552811


In [None]:
/43''

# Submission

In [315]:
X_test_transformed = transformer.transform(imputed_test_backpack)
# scaled_test_data = data_scaling(X_test_transformed)

In [316]:
sub_pred = clf.predict(X_test_transformed)
sub2 = lr.predict(X_test_transformed)
sub3 = model.predict(X_test_transformed)

In [317]:
nn=np.round(sub3, decimals=3)

In [318]:
sub = pd.read_csv('./data/sample_submission.csv')
sub['Price'] = nn
sub.to_csv('submission32.csv', index=False)

In [None]:
np.random.permutation(len(X_test))