In [199]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

In [200]:
train_backpack = pd.read_csv('./data/train.csv')
extra_backpack = pd.read_csv('./data/training_extra.csv')
test_backpack = pd.read_csv('./data/test.csv')

In [201]:
backpack = pd.concat([train_backpack, extra_backpack])
backpack.drop('id', axis=1, inplace=True)
test_backpack.drop('id', axis=1, inplace=True)

In [203]:
backpack.columns = backpack.columns.str.replace(" ", "_")
test_backpack.columns = test_backpack.columns.str.replace(" ", "_")

In [204]:
backpack.head()

Unnamed: 0,Brand,Material,Size,Compartments,Laptop_Compartment,Waterproof,Style,Color,Weight_Capacity_(kg),Price
0,Jansport,Leather,Medium,7.0,Yes,No,Tote,Black,11.611723,112.15875
1,Jansport,Canvas,Small,10.0,Yes,Yes,Messenger,Green,27.078537,68.88056
2,Under Armour,Leather,Small,2.0,Yes,No,Messenger,Red,16.64376,39.1732
3,Nike,Nylon,Small,8.0,Yes,No,Messenger,Green,12.93722,80.60793
4,Adidas,Canvas,Medium,1.0,Yes,Yes,Messenger,Green,17.749338,86.02312


# Treating Missing Value

In [205]:
backpack.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3994318 entries, 0 to 3694317
Data columns (total 10 columns):
 #   Column                Dtype  
---  ------                -----  
 0   Brand                 object 
 1   Material              object 
 2   Size                  object 
 3   Compartments          float64
 4   Laptop_Compartment    object 
 5   Waterproof            object 
 6   Style                 object 
 7   Color                 object 
 8   Weight_Capacity_(kg)  float64
 9   Price                 float64
dtypes: float64(3), object(7)
memory usage: 335.2+ MB


## Delete Rows with 3 Nan value and More

In [206]:
threshold = 3
mask = backpack.isna().sum(axis=1) >= threshold
filtered_backpack = backpack[mask]

In [207]:
clean_backpack1 = backpack[~mask]

## Remove Missing Value for Nominal Categories

In [208]:
nominal_col = ['Brand', 'Material', 'Style', 'Color']
clean_backpack2 = clean_backpack1.dropna(subset=nominal_col)

In [209]:
clean_backpack2.isna().sum()

Brand                       0
Material                    0
Size                    80598
Compartments                0
Laptop_Compartment      88337
Waterproof              87073
Style                       0
Color                       0
Weight_Capacity_(kg)      387
Price                       0
dtype: int64

## Missing Value Imputation

In [210]:
def get_labels_dict(data, col):
    labels = data[col].sort_values(ascending=False).unique().tolist()
    label_dic = {k:i for i, k in enumerate(labels)}
    if np.nan in label_dic.keys():
        label_dic[np.nan] = np.nan
    return label_dic

In [211]:
def reverse_dic(dic):
    return {v: k for k, v in dic.items()}

In [212]:
from sklearn.impute import KNNImputer

def column_imputation(data, to_impute_obj_col, to_impute_num_col):
    before_backpack = data.copy()
    
    # Create dictionaries dynamically for obj col
    label_dicts = {col: get_labels_dict(before_backpack, col) for col in to_impute_obj_col}
    
    # Map ordinal categories using generated dictionaries
    for col in to_impute_obj_col:
        before_backpack[col] = before_backpack[col].map(label_dicts[col])
            
    # Perform KNN Imputation
    imputer = KNNImputer(n_neighbors=3, weights='distance')
    imputed_backpack = imputer.fit_transform(before_backpack[to_impute_obj_col + to_impute_num_col])
    
    # Convert back to DataFrame
    imputed_df = pd.DataFrame(imputed_backpack, columns=to_impute_obj_col + to_impute_num_col, index=before_backpack.index)

    # Map back to original categorical values
    for col in to_impute_obj_col:
        imputed_df[col] = imputed_df[col].round().astype(int)
        imputed_df[col] = imputed_df[col].map(reverse_dic(label_dicts[col]))

    imputed_backpack = data.copy()
    imputed_backpack[to_impute_obj_col + to_impute_num_col] = round(imputed_df)
    
    return imputed_backpack

In [213]:
to_impute_obj_col = ['Size', 'Laptop_Compartment', 'Waterproof']
to_impute_num_col = ['Weight_Capacity_(kg)']

### Test Dataset Imputation

In [214]:
for col in nominal_col:
    mod_value = test_backpack[col].mode()[0]
    test_backpack[col] = test_backpack[col].fillna(mod_value)

In [216]:
print(test_backpack.isnull().sum())

Brand                      0
Material                   0
Size                    4381
Compartments               0
Laptop_Compartment      4962
Waterproof              4811
Style                      0
Color                      0
Weight_Capacity_(kg)      77
dtype: int64


In [217]:
imputed_test_backpack = column_imputation(test_backpack, to_impute_obj_col, to_impute_num_col)

In [218]:
imputed_test_backpack.isnull().sum()

Brand                   0
Material                0
Size                    0
Compartments            0
Laptop_Compartment      0
Waterproof              0
Style                   0
Color                   0
Weight_Capacity_(kg)    0
dtype: int64

## Add Total Revenue for each Brand in 2023

In [219]:
total_revenu = {
  "Adidas": 23.19,
  "Nike": 51.54,
  "Puma": 8.88,
  "Under Armour": 5.9,
  "Jansport": 10.5
}

In [242]:
final_backpack = clean_backpack2.dropna().copy()
final_backpack['total_revenue_2023'] = final_backpack['Brand'].map(total_revenu)
imputed_test_backpack['total_revenue_2023'] = imputed_test_backpack['Brand'].map(total_revenu)

In [221]:
final_backpack.head()

Unnamed: 0,Brand,Material,Size,Compartments,Laptop_Compartment,Waterproof,Style,Color,Weight_Capacity_(kg),Price,total_revenue_2023
0,Jansport,Leather,Medium,7.0,Yes,No,Tote,Black,11.611723,112.15875,10.5
1,Jansport,Canvas,Small,10.0,Yes,Yes,Messenger,Green,27.078537,68.88056,10.5
2,Under Armour,Leather,Small,2.0,Yes,No,Messenger,Red,16.64376,39.1732,5.9
3,Nike,Nylon,Small,8.0,Yes,No,Messenger,Green,12.93722,80.60793,51.54
4,Adidas,Canvas,Medium,1.0,Yes,Yes,Messenger,Green,17.749338,86.02312,23.19


# Feature Encoding

In [254]:
X = final_backpack.drop(['Price','Color'], axis=1)
y = final_backpack['Price']

In [255]:
ordinal_categories = {
    'Waterproof': ['Yes', 'No'],
    'Size': ['Small', 'Medium', 'Large'],
    'Laptop_Compartment': ['Yes', 'No']
}

In [256]:
num_col = X.select_dtypes(include='number').columns.tolist()

In [257]:
to_onehot_col = X.drop(list(to_impute_obj_col + num_col), axis=1).columns.tolist()

In [258]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer

transformer = ColumnTransformer(transformers=[
    ('onehot encoding', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'), to_onehot_col),
    ('ordinal', OrdinalEncoder(
            categories=[ordinal_categories[col] for col in ordinal_categories],
        ), list(ordinal_categories.keys())),
    ('minmaxscaler', MinMaxScaler(), num_col)]
    , remainder="passthrough", verbose_feature_names_out=False
).set_output(transform="pandas")

X_transformed = transformer.fit_transform(X)

In [259]:
X_transformed.head()

Unnamed: 0,Brand_Jansport,Brand_Nike,Brand_Puma,Brand_Under Armour,Material_Leather,Material_Nylon,Material_Polyester,Style_Messenger,Style_Tote,Color_Blue,Color_Gray,Color_Green,Color_Pink,Color_Red,Waterproof,Size,Laptop_Compartment,Compartments,Weight_Capacity_(kg),total_revenue_2023
0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.666667,0.264469,0.100789
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.883141,0.100789
2,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.111111,0.46575,0.0
3,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.777778,0.317489,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.509974,0.378834


# Regression

In [260]:
from sklearn.linear_model import LinearRegression
import lightgbm as lgb
import datetime as dt
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.2, random_state=42)
lr = LinearRegression()

In [263]:
dtrain = lgb.Dataset(X_train, label=y_train)
dtest = lgb.Dataset(X_test, label=y_test)


lgb_param = {
    'application':'regression',
    'learning_rate':0.1, 
}

In [262]:
lr.fit(X_train, y_train)

In [264]:
start = dt.datetime.now()
clf = lgb.train(lgb_param, dtrain, 50)
end = dt.datetime.now()
elapsed = end - start

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 305
[LightGBM] [Info] Number of data points in the train set: 2630802, number of used features: 20
[LightGBM] [Info] Start training from score 81.478594


In [265]:
lr.score(X_train, y_train)

0.0009909372212216772

In [266]:
lr.score(X_test, y_test)

0.001064220408680816

In [267]:
y_pred = lr.predict(X_test)
y_pred2 = clf.predict(X_test) 

In [268]:
from sklearn.metrics import root_mean_squared_error

rmse1 = root_mean_squared_error(y_test, y_pred)
rmse2 = root_mean_squared_error(y_test, y_pred2)

In [269]:
print('Root Mean Squared Error : ', rmse1)
print('Root Mean Squared Error : ', rmse2)

Root Mean Squared Error :  38.78882408613481
Root Mean Squared Error :  38.77175422486366


# Submission

In [270]:
X_test_transformed = transformer.transform(imputed_test_backpack)

In [272]:
sub_pred = lr.predict(X_test_transformed)

In [273]:
sub = pd.read_csv('./data/sample_submission.csv')
sub['Price'] = sub_pred
sub.to_csv('submission3.csv', index=False)