### https://dphi.tech/practice/challenge/45#data

In [1]:
import pandas as pd
import numpy as np

df_train  = pd.read_csv("https://raw.githubusercontent.com/dphi-official/Datasets/master/electronic_product/electronic_product/Training_set_label.csv",
                        parse_dates=['dateAdded', 'dateUpdated'], index_col='id')
X_test = pd.read_csv('https://raw.githubusercontent.com/dphi-official/Datasets/master/electronic_product/electronic_product/Testing_set_label.csv', 
                      parse_dates=['dateAdded', 'dateUpdated'], index_col='id')

In [2]:
# Remove 'ean', 'manufacturer', and 'prices.shipping', 'prices.sourceURLs' because they have high missing value percentage and values have nothing to do in predicting prices
def remove_cols(df):
    df.drop(columns=['manufacturer', 'prices.shipping', 'prices.sourceURLs', 'asins', 'imageURLs', 'sourceURLs', 'upc', 'prices.dateSeen', 'keys', 'name', 'weight', 'categories'], inplace=True)
remove_cols(df_train)
remove_cols(X_test)

In [3]:
# I may or may not use this column
# df_train['prices.merchant'].unique()

In [4]:
# Convert all to USD and prices
def covert_cur(df):
    df.loc[df['prices.currency'] == 'CAD', 'price'] *= 0.79
    df.loc[df['prices.currency'] == 'CAD', 'prices.currency'] = 'USD'
    df.drop(columns='prices.currency', inplace=True)
    
covert_cur(df_train)
# As X_test has one value in prices.currency column, I'll drop it
X_test.drop(columns = 'prices.currency', inplace=True)

In [5]:
def clean_data(df):
    df.loc[df['prices.availability'].str.lower().isin(['yes', 'true', 'in stock', '7 available']), 'prices.availability'] = 'yes'
    df.loc[df['prices.availability'].str.lower().isin(['no', 'false', 'sold', 'out of stock']), 'prices.availability'] = 'no'
    df.loc[df['prices.condition'].str.contains('[Rr]efurbished'), 'prices.condition'] = 'refurbished'
    df.loc[df['prices.condition'].str.lower().isin(['used', 'pre-owned']), 'prices.condition'] = 'used'
    df.loc[df['prices.condition'].str.contains('[Nn]ew'), 'prices.condition'] = 'new'
    # As electronic merchandise changes price mostly yearly, I will convert them to year
    df['dateAdded'] = df['dateAdded'].dt.year
    # As values are in 2018, I filter months 
    df['dateUpdated'] = df['dateUpdated'].dt.month
    df['prices.condition'] = df['prices.condition'].map({'new': 0, 'refurbished': 1, 'used': 2})
    
clean_data(df_train)
clean_data(X_test)

In [6]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5436 entries, AVphrugr1cnluZ0-FOeH to AVpfHcah1cnluZ0-eQLY
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   prices.availability  5436 non-null   object 
 1   prices.condition     5436 non-null   int64  
 2   prices.isSale        5436 non-null   bool   
 3   prices.merchant      5436 non-null   object 
 4   brand                5436 non-null   object 
 5   dateAdded            5436 non-null   int64  
 6   dateUpdated          5436 non-null   int64  
 7   ean                  1175 non-null   object 
 8   manufacturerNumber   5436 non-null   object 
 9   primaryCategories    5436 non-null   object 
 10  price                5436 non-null   float64
dtypes: bool(1), float64(1), int64(3), object(6)
memory usage: 472.5+ KB


In [7]:
y_train = df_train['price'].copy()
X_train = df_train.drop(columns='price')

In [8]:
num_cols = X_train.select_dtypes(include='number').columns
cat_cols = X_train.select_dtypes(exclude='number').columns

In [9]:
num_cols

Index(['prices.condition', 'dateAdded', 'dateUpdated'], dtype='object')

In [10]:
cat_cols

Index(['prices.availability', 'prices.isSale', 'prices.merchant', 'brand',
       'ean', 'manufacturerNumber', 'primaryCategories'],
      dtype='object')

In [11]:
X_train.shape

(5436, 10)

In [12]:
for col in cat_cols:
    if X_train[col].nunique() > 10:
        print(col, X_train[col].nunique())

prices.merchant 587
brand 254
ean 117
manufacturerNumber 819


In [13]:
for col in cat_cols:
    if X_train[col].nunique() <= 10:
        print(col, X_train[col].unique())

prices.availability ['yes' 'More on the Way' 'Special Order' 'no' 'undefined' 'Retired']
prices.isSale [False  True]
primaryCategories ['Electronics' ' Intel Celeron' 'Electronics,Furniture' ' Siri Eyes Free'
 ' Apple CarPlay']


In [14]:
X_test.shape

(1666, 10)

In [15]:
X_train.shape

(5436, 10)

In [16]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer, make_column_selector

cat_pipe = make_pipeline(
            (SimpleImputer(strategy='most_frequent')),
            (OneHotEncoder(drop='first', handle_unknown='error'))
            )
num_pipe = make_pipeline(
            (IterativeImputer()),
            (StandardScaler())
            )            
preprocess_pipeline = make_column_transformer(
            (cat_pipe, cat_cols),
            (num_pipe, num_cols)
            )

preprocessor =  preprocess_pipeline.fit(X_train)
X_train = preprocessor.transform(X_train)
X_test = preprocessor.transform(X_test)

ValueError: Found unknown categories ['32 available'] in column 0 during transform

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.15, stratify=y_train)