In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

In [2]:
train_backpack = pd.read_csv('./data/train.csv')
extra_backpack = pd.read_csv('./data/training_extra.csv')
test_backpack = pd.read_csv('./data/test.csv')

In [3]:
backpack = pd.concat([train_backpack, extra_backpack])
backpack.drop('id', axis=1, inplace=True)
test_backpack.drop('id', axis=1, inplace=True)

In [4]:
backpack.columns = backpack.columns.str.replace(" ", "_")
test_backpack.columns = test_backpack.columns.str.replace(" ", "_")

In [5]:
backpack.head()

Unnamed: 0,Brand,Material,Size,Compartments,Laptop_Compartment,Waterproof,Style,Color,Weight_Capacity_(kg),Price
0,Jansport,Leather,Medium,7.0,Yes,No,Tote,Black,11.611723,112.15875
1,Jansport,Canvas,Small,10.0,Yes,Yes,Messenger,Green,27.078537,68.88056
2,Under Armour,Leather,Small,2.0,Yes,No,Messenger,Red,16.64376,39.1732
3,Nike,Nylon,Small,8.0,Yes,No,Messenger,Green,12.93722,80.60793
4,Adidas,Canvas,Medium,1.0,Yes,Yes,Messenger,Green,17.749338,86.02312


# Treating Missing Value

In [6]:
backpack.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3994318 entries, 0 to 3694317
Data columns (total 10 columns):
 #   Column                Dtype  
---  ------                -----  
 0   Brand                 object 
 1   Material              object 
 2   Size                  object 
 3   Compartments          float64
 4   Laptop_Compartment    object 
 5   Waterproof            object 
 6   Style                 object 
 7   Color                 object 
 8   Weight_Capacity_(kg)  float64
 9   Price                 float64
dtypes: float64(3), object(7)
memory usage: 335.2+ MB


## Delete Rows with 3 Nan value and More

In [7]:
threshold = 3
mask = backpack.isna().sum(axis=1) >= threshold
filtered_backpack = backpack[mask]

In [8]:
clean_backpack1 = backpack[~mask]

## Remove Missing Value for Nominal Categories

In [9]:
nominal_col = ['Brand', 'Material', 'Style', 'Color']
clean_backpack2 = clean_backpack1.dropna(subset=nominal_col)

In [10]:
clean_backpack2.isna().sum()

Brand                       0
Material                    0
Size                    80598
Compartments                0
Laptop_Compartment      88337
Waterproof              87073
Style                       0
Color                       0
Weight_Capacity_(kg)      387
Price                       0
dtype: int64

## Missing Value Imputation

In [11]:
def get_labels_dict(data, col):
    labels = data[col].sort_values(ascending=False).unique().tolist()
    label_dic = {k:i for i, k in enumerate(labels)}
    if np.nan in label_dic.keys():
        label_dic[np.nan] = np.nan
    return label_dic

In [12]:
def reverse_dic(dic):
    return {v: k for k, v in dic.items()}

In [13]:
from sklearn.impute import KNNImputer

def column_imputation(data, to_impute_obj_col, to_impute_num_col):
    before_backpack = data.copy()
    
    # Create dictionaries dynamically for obj col
    label_dicts = {col: get_labels_dict(before_backpack, col) for col in to_impute_obj_col}
    
    # Map ordinal categories using generated dictionaries
    for col in to_impute_obj_col:
        before_backpack[col] = before_backpack[col].map(label_dicts[col])
            
    # Perform KNN Imputation
    imputer = KNNImputer(n_neighbors=3, weights='distance')
    imputed_backpack = imputer.fit_transform(before_backpack[to_impute_obj_col + to_impute_num_col])
    
    # Convert back to DataFrame
    imputed_df = pd.DataFrame(imputed_backpack, columns=to_impute_obj_col + to_impute_num_col, index=before_backpack.index)

    # Map back to original categorical values
    for col in to_impute_obj_col:
        imputed_df[col] = imputed_df[col].round().astype(int)
        imputed_df[col] = imputed_df[col].map(reverse_dic(label_dicts[col]))

    imputed_backpack = data.copy()
    imputed_backpack[to_impute_obj_col + to_impute_num_col] = round(imputed_df)
    
    return imputed_backpack

In [14]:
to_impute_obj_col = ['Size', 'Laptop_Compartment', 'Waterproof']
to_impute_num_col = ['Weight_Capacity_(kg)']

In [15]:
final_backpack = clean_backpack2.dropna().copy()

### Test Dataset Imputation

In [16]:
for col in test_backpack.columns.tolist():
    mod_value = final_backpack[col].mode()[0]
    test_backpack[col] = test_backpack[col].fillna(mod_value)

In [17]:
print(test_backpack.isnull().sum())

Brand                   0
Material                0
Size                    0
Compartments            0
Laptop_Compartment      0
Waterproof              0
Style                   0
Color                   0
Weight_Capacity_(kg)    0
dtype: int64


In [18]:
# imputed_test_backpack = column_imputation(test_backpack, to_impute_obj_col, to_impute_num_col)
imputed_test_backpack = test_backpack.copy()

In [19]:
imputed_test_backpack.isnull().sum()

Brand                   0
Material                0
Size                    0
Compartments            0
Laptop_Compartment      0
Waterproof              0
Style                   0
Color                   0
Weight_Capacity_(kg)    0
dtype: int64

## Add Total Revenue for each Brand in 2023

In [20]:
total_revenu = {
  "Adidas": 23.19,
  "Nike": 51.54,
  "Puma": 8.88,
  "Under Armour": 5.9,
  "Jansport": 10.5
}

In [21]:
# final_backpack['total_revenue_2023'] = final_backpack['Brand'].map(total_revenu)
# imputed_test_backpack['total_revenue_2023'] = imputed_test_backpack['Brand'].map(total_revenu)

In [22]:
# final_backpack['compartments_efficiency'] = final_backpack['Compartments'] / final_backpack['Weight_Capacity_(kg)']
# imputed_test_backpack['compartments_efficiency'] = imputed_test_backpack['Compartments'] / imputed_test_backpack['Weight_Capacity_(kg)']

In [23]:
# final_backpack["Has_Laptop_Compartment_Waterproof"] = ((final_backpack["Laptop_Compartment"] == "Yes") & (final_backpack["Waterproof"] == "Yes")).astype(bool)
# imputed_test_backpack["Has_Laptop_Compartment_Waterproof"] = ((imputed_test_backpack["Laptop_Compartment"] == "Yes") & (imputed_test_backpack["Waterproof"] == "Yes")).astype(bool)

In [24]:
material_durability = {"Canvas": 1, "Nylon": 2, "Polyester": 3, "Leather": 4}
final_backpack["Material_Durability_Score"] = final_backpack["Material"].map(material_durability)
imputed_test_backpack["Material_Durability_Score"] = imputed_test_backpack["Material"].map(material_durability)

In [25]:
# weight_bins = [4, 12, 18, 24, 30]
# weight_labels = ["Light", "Medium", "Heavy", "Very Heavy"]
# final_backpack["Weight_Category"] = pd.cut(final_backpack["Weight_Capacity_(kg)"], bins=weight_bins, labels=weight_labels)
# imputed_test_backpack["Weight_Category"] = pd.cut(imputed_test_backpack["Weight_Capacity_(kg)"], bins=weight_bins, labels=weight_labels)

# Feature Encoding

In [56]:
X = final_backpack.drop(['Price', 'Material'], axis=1)
y = final_backpack['Price']

In [27]:
X.head()

Unnamed: 0,Size,Compartments,Laptop_Compartment,Waterproof,Weight_Capacity_(kg),Material_Durability_Score
0,Medium,7.0,Yes,No,11.611723,4
1,Small,10.0,Yes,Yes,27.078537,1
2,Small,2.0,Yes,No,16.64376,4
3,Small,8.0,Yes,No,12.93722,2
4,Medium,1.0,Yes,Yes,17.749338,1


In [28]:
ordinal_categories = {
    'Size': ['Small', 'Medium', 'Large'],
    # 'Weight_Category': ["Light", "Medium", "Heavy", 'Very Heavy'],
}

In [29]:
# num_col = X.select_dtypes(include='number').columns.tolist()
num_col = X.columns.tolist()

In [57]:
to_onehot_col = ['Laptop_Compartment', 'Waterproof']
# 'Has_Laptop_Compartment_Waterproof'
to_target_encode = ['Brand', 'Color', 'Style']
# ['Weight_Category', 'Color', 'Style', 'Material', 'Brand']

In [58]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler, TargetEncoder
from sklearn.compose import ColumnTransformer

transformer = ColumnTransformer(transformers=[
    ('onehot encoding', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'), to_onehot_col),
    ('target encoding', TargetEncoder(smooth=2), to_target_encode),
    ('ordinal', OrdinalEncoder(
            categories=[ordinal_categories[col] for col in ordinal_categories],
        ), list(ordinal_categories.keys())),
    # ('minmaxscaler', MinMaxScaler(), num_col)
]
    , remainder="passthrough", verbose_feature_names_out=False
).set_output(transform="pandas")

X_transformed = transformer.fit_transform(X, y)

In [59]:
num_cols = X_transformed.columns.tolist()

In [60]:
def data_scaling(data): 
    scaler = MinMaxScaler()
    scaled_data = scaler.fit_transform(data)
    return scaled_data

In [61]:
scaled_data = data_scaling(X_transformed)

# Regression

In [63]:
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.svm import  SVR
import lightgbm as lgb
from xgboost import XGBRegressor
import datetime as dt
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(scaled_data, y, test_size=0.20, random_state=42)
lr = LinearRegression()
# elastic_net = ElasticNet(alpha=1.0, l1_ratio=0.5)

In [64]:
dtrain = lgb.Dataset(X_train, label=y_train)
dtest = lgb.Dataset(X_test, label=y_test)

lgb_param = {
    'objective': 'regression',  # For regression tasks
    'metric': 'rmse',          # Use RMSE as the evaluation metric
    'boosting_type': 'gbdt',
    'learning_rate':0.1, 
    'max_depth': 10,
}

In [65]:
lr.fit(X_train, y_train)
# elastic_net.fit(X_train, y_train)

In [66]:
start = dt.datetime.now()
clf = lgb.train(lgb_param, dtrain, 200)
end = dt.datetime.now()
elapsed = end - start

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 346
[LightGBM] [Info] Number of data points in the train set: 2630802, number of used features: 9
[LightGBM] [Info] Start training from score 81.478594


In [67]:
print(elapsed)

0:00:22.846999


In [68]:
model = XGBRegressor(
    # max_depth=5,
    colsample_bytree=0.5,
    subsample=0.80,
    n_estimators=200,
    learning_rate=0.1,
    min_child_weight=80,
    # enable_categorical=True,
    early_stopping_rounds=600
)

In [69]:
start = dt.datetime.now()
model.fit(X_train, y_train, 
         eval_set=[(X_test, y_test)], 
         verbose=True
    )
end = dt.datetime.now()
elapsed2 = end - start
print(elapsed2)

[0]	validation_0-rmse:38.80835
[1]	validation_0-rmse:38.80651
[2]	validation_0-rmse:38.80447
[3]	validation_0-rmse:38.80286
[4]	validation_0-rmse:38.79925
[5]	validation_0-rmse:38.79606
[6]	validation_0-rmse:38.79546
[7]	validation_0-rmse:38.79472
[8]	validation_0-rmse:38.79240
[9]	validation_0-rmse:38.79210
[10]	validation_0-rmse:38.78972
[11]	validation_0-rmse:38.78924
[12]	validation_0-rmse:38.78879
[13]	validation_0-rmse:38.78844
[14]	validation_0-rmse:38.78792
[15]	validation_0-rmse:38.78638
[16]	validation_0-rmse:38.78622
[17]	validation_0-rmse:38.78568
[18]	validation_0-rmse:38.78414
[19]	validation_0-rmse:38.78380
[20]	validation_0-rmse:38.78234
[21]	validation_0-rmse:38.78210
[22]	validation_0-rmse:38.78085
[23]	validation_0-rmse:38.78070
[24]	validation_0-rmse:38.77962
[25]	validation_0-rmse:38.77946
[26]	validation_0-rmse:38.77887
[27]	validation_0-rmse:38.77873
[28]	validation_0-rmse:38.77863
[29]	validation_0-rmse:38.77764
[30]	validation_0-rmse:38.77698
[31]	validation_0-

In [70]:
y_pred = lr.predict(X_test)
y_pred2 = clf.predict(X_test) 
y_pred3 = model.predict(X_test)

In [71]:
from sklearn.metrics import root_mean_squared_error

rmse1 = root_mean_squared_error(y_test, y_pred)
rmse2 = root_mean_squared_error(y_test, y_pred2)
rmse3 = root_mean_squared_error(y_test, y_pred3)

In [72]:
print('Root Mean Squared Error : ', rmse1)
print('Root Mean Squared Error : ', rmse2)
print('Root Mean Squared Error : ', rmse3)

Root Mean Squared Error :  38.79293996235504
Root Mean Squared Error :  38.77080937028523
Root Mean Squared Error :  38.77189941504162


# Submission

In [73]:
X_test_transformed = transformer.transform(imputed_test_backpack)
scaled_test_data = data_scaling(X_test_transformed)

In [74]:
sub_pred = clf.predict(scaled_test_data)
sub2 = lr.predict(scaled_test_data)
sub3 = model.predict(scaled_test_data)

In [75]:
nn=np.round(sub_pred, decimals=2)

In [76]:
sub = pd.read_csv('./data/sample_submission.csv')
sub['Price'] = nn
sub.to_csv('submission28.csv', index=False)

In [50]:
np.random.permutation(len(X_test))

array([329138,  43565, 567365, ..., 221093, 366230,  20801])