In [84]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV

In [5]:
# Read the training split from the project-relative data directory.
train_csv_path = 'data/train.csv'
train = pd.read_csv(train_csv_path)

In [6]:
# Preview the first five rows of the training data.
train.head(n=5)

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
0,0,Jansport,Leather,Medium,7.0,Yes,No,Tote,Black,11.611723,112.15875
1,1,Jansport,Canvas,Small,10.0,Yes,Yes,Messenger,Green,27.078537,68.88056
2,2,Under Armour,Leather,Small,2.0,Yes,No,Messenger,Red,16.64376,39.1732
3,3,Nike,Nylon,Small,8.0,Yes,No,Messenger,Green,12.93722,80.60793
4,4,Adidas,Canvas,Medium,1.0,Yes,Yes,Messenger,Green,17.749338,86.02312


In [7]:
# Print per-column dtypes and non-null counts for the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 11 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    300000 non-null  int64  
 1   Brand                 290295 non-null  object 
 2   Material              291653 non-null  object 
 3   Size                  293405 non-null  object 
 4   Compartments          300000 non-null  float64
 5   Laptop Compartment    292556 non-null  object 
 6   Waterproof            292950 non-null  object 
 7   Style                 292030 non-null  object 
 8   Color                 290050 non-null  object 
 9   Weight Capacity (kg)  299862 non-null  float64
 10  Price                 300000 non-null  float64
dtypes: float64(3), int64(1), object(7)
memory usage: 25.2+ MB


In [10]:
# Share of missing values per column, expressed in percent.
missing_counts = train.isna().sum()
missing_counts.div(len(train)).mul(100)

id                      0.000000
Brand                   3.235000
Material                2.782333
Size                    2.198333
Compartments            0.000000
Laptop Compartment      2.481333
Waterproof              2.350000
Style                   2.656667
Color                   3.316667
Weight Capacity (kg)    0.046000
Price                   0.000000
dtype: float64

In [36]:
# Per column: show the distinct values when there are at most 10 of them,
# otherwise show only how many distinct values exist.
[
    (col, train[col].unique() if train[col].nunique() <= 10 else train[col].nunique())
    for col in train.columns
]

[('id', 300000),
 ('Brand',
  array(['Jansport', 'Under Armour', 'Nike', 'Adidas', 'Puma', nan],
        dtype=object)),
 ('Material',
  array(['Leather', 'Canvas', 'Nylon', nan, 'Polyester'], dtype=object)),
 ('Size', array(['Medium', 'Small', 'Large', nan], dtype=object)),
 ('Compartments', array([ 7., 10.,  2.,  8.,  1.,  3.,  5.,  9.,  6.,  4.])),
 ('Laptop Compartment', array(['Yes', 'No', nan], dtype=object)),
 ('Waterproof', array(['No', 'Yes', nan], dtype=object)),
 ('Style', array(['Tote', 'Messenger', nan, 'Backpack'], dtype=object)),
 ('Color',
  array(['Black', 'Green', 'Red', 'Blue', 'Gray', 'Pink', nan], dtype=object)),
 ('Weight Capacity (kg)', 181596),
 ('Price', 48212)]

In [42]:
# Most frequent value(s) of the Style column.
train['Style'].mode()

0    Messenger
Name: Style, dtype: object

In [50]:
# Most frequent value(s) of every column except the row identifier.
non_id_cols = train.columns.drop('id')
[train[col].mode() for col in non_id_cols]

[0    Adidas
 Name: Brand, dtype: object,
 0    Polyester
 Name: Material, dtype: object,
 0    Medium
 Name: Size, dtype: object,
 0    1.0
 Name: Compartments, dtype: float64,
 0    Yes
 Name: Laptop Compartment, dtype: object,
 0    Yes
 Name: Waterproof, dtype: object,
 0    Messenger
 Name: Style, dtype: object,
 0    Pink
 Name: Color, dtype: object,
 0    5.0
 Name: Weight Capacity (kg), dtype: float64,
 0    150.0
 Name: Price, dtype: float64]

In [51]:
# Split the training frame into the regression target and the feature matrix.
target_col = 'Price'
y_train = train[target_col]
X_train = train.drop(columns=target_col)

In [52]:
# Inspect the column labels of the feature frame.
X_train.columns

Index(['id', 'Brand', 'Material', 'Size', 'Compartments', 'Laptop Compartment',
       'Waterproof', 'Style', 'Color', 'Weight Capacity (kg)'],
      dtype='object')

In [53]:
# Preview the first five rows of the feature frame.
X_train.head(n=5)

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg)
0,0,Jansport,Leather,Medium,7.0,Yes,No,Tote,Black,11.611723
1,1,Jansport,Canvas,Small,10.0,Yes,Yes,Messenger,Green,27.078537
2,2,Under Armour,Leather,Small,2.0,Yes,No,Messenger,Red,16.64376
3,3,Nike,Nylon,Small,8.0,Yes,No,Messenger,Green,12.93722
4,4,Adidas,Canvas,Medium,1.0,Yes,Yes,Messenger,Green,17.749338


In [54]:
class deleteId(BaseEstimator, TransformerMixin):
    """Pipeline step that removes the ``id`` column from its input.

    The step is stateless: ``fit`` learns nothing, and ``transform`` returns a
    new object instead of modifying the one it receives.
    """

    def fit(self, X, y=None):
        """Do nothing and return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` without its ``id`` column, leaving ``X`` untouched.

        Assumes ``X`` is a pandas DataFrame (it must provide ``.drop``);
        a frame without an ``id`` column raises ``KeyError``.
        """
        # Non-inplace drop already yields a fresh frame, so no explicit copy is needed.
        return X.drop(columns=['id'])

In [58]:
# Categorical features: fill gaps with the most frequent category, then
# one-hot encode (handle_unknown='ignore' avoids errors on unseen categories).
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

In [64]:
# Numerical features: fill gaps with the column mean, then standardize.
numerical_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
numerical_transformer = Pipeline(steps=numerical_steps)

In [65]:
# Feature groups handed to the numerical and categorical sub-pipelines.
numerical_cols = [
    'Compartments',
    'Weight Capacity (kg)',
]
categorical_cols = [
    'Brand',
    'Material',
    'Size',
    'Laptop Compartment',
    'Waterproof',
    'Style',
    'Color',
]

In [66]:
# Send each feature group through its own sub-pipeline.
column_routes = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [70]:
# End-to-end model: drop the id column, preprocess the features, fit a regressor.
# The final step is named 'regressor' because it wraps a regressor (not a
# classifier) and so that a parameter grid keyed on 'regressor' can address it.
pipeline = Pipeline([
    ('deleteId', deleteId()),
    ('preprocessor', preprocessor),
    # Fixed seed so that re-running the notebook rebuilds the same forest.
    ('regressor', RandomForestRegressor(random_state=42)),
])

In [71]:
# Train the whole pipeline on the full training set.
pipeline.fit(X=X_train, y=y_train)

In [None]:
# Compare candidate estimators by swapping out the pipeline's final step.
# The grid key must equal that step's name, so it is read from the pipeline
# rather than hard-coded: a key that matches no step makes fit() raise
# "Invalid parameter ... for estimator Pipeline".
final_step_name = pipeline.steps[-1][0]
param_grid = {
    final_step_name: [RandomForestRegressor(), LinearRegression(), SVR()],
}
# NOTE(review): SVR training time grows quickly with the number of samples;
# with 5-fold CV on the full training set this candidate may take very long.
# Consider subsampling or dropping it; confirm before a full run.

# Setup GridSearchCV with cross-validation
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Fit GridSearchCV to the training data
grid_search.fit(X_train, y_train)

# Print best parameters and score.
# The score is the negated MSE, so it is <= 0 and closer to zero is better.
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)


In [None]:
# The grid search was already fitted in the previous cell; fitting it again here
# would only repeat the entire cross-validation, so the fitted object is reused.
best_model = grid_search.best_estimator_

# Load the features to predict on: no X_test exists before this cell, so
# referencing it without loading would raise NameError on a fresh kernel.
X_test = pd.read_csv('data/test.csv')

# Make predictions using the best model
y_pred = best_model.predict(X_test)

# Print or evaluate predictions
print("Predictions: ", y_pred)


In [72]:
# Read the test split from the project-relative data directory.
test_csv_path = 'data/test.csv'
test = pd.read_csv(test_csv_path)

In [73]:
# Preview the first five rows of the test data.
test.head(n=5)

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg)
0,300000,Puma,Leather,Small,2.0,No,No,Tote,Green,20.671147
1,300001,Nike,Canvas,Medium,7.0,No,Yes,Backpack,Green,13.564105
2,300002,Adidas,Canvas,Large,9.0,No,Yes,Messenger,Blue,11.809799
3,300003,Adidas,Nylon,Large,1.0,Yes,No,Messenger,Green,18.477036
4,300004,,Nylon,Large,2.0,Yes,Yes,Tote,Black,9.907953


In [74]:
# Predict prices for the test rows with the fitted pipeline.
# NOTE: despite its name, y_test holds model predictions, not ground-truth labels.
y_test = pipeline.predict(X=test)

In [75]:
# Wrap the predictions in a single-column frame named 'Price'.
result = pd.DataFrame({'Price': y_test})

In [76]:
# Attach the predictions to the test rows (aligned on the row index), keep only
# the id and predicted price, and write the submission file without the index.
submission = test.join(result).loc[:, ['id', 'Price']]
submission.to_csv('submission.csv', index=False)