In [1]:
# train = train.drop(train[train.price < 3.0].index)
# train = train.drop(train[train.price > 2000.0].index)

# brands = train.brand_name.dropna().unique()

# def get_brand_from_item_name(brand, item_name):
#     if pd.notnull(brand) or not any(x in item_name for x in brands):
#         return brand
#     else:
#         for x in brands:
#             if x in item_name:
#                 return x
            
# train['brand_name'] = train.apply(
#     lambda row: get_brand_from_item_name(row['brand_name'], row['name']), 
#     axis=1
# )

# def remove_brand_from_item_name(item_name):
#     for brand in brands:
#         if brand + ' ' in item_name:
#             return item_name.replace(brand + ' ', '')
#         elif ' ' + brand in item_name:
#             return item_name.replace(' ' + brand, '')
#         elif ' ' + brand + ' ' in item_name:
#             return item_name.replace(' ' + brand + ' ', '')
        
# # train['name'] = train.apply

In [2]:
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import mean_squared_log_error
import psycopg2

In [3]:
train = pd.read_table('train.tsv')

In [4]:
y_train = np.log1p(train['price'])
train['category_name'] = train['category_name'].fillna('Other').astype(str)
train['brand_name'] = train['brand_name'].fillna('missing').astype(str)
train['shipping'] = train['shipping'].astype(str)
train['item_condition_id'] = train['item_condition_id'].astype(str)
train['item_description'] = train['item_description'].fillna('None')

In [5]:
%%time

# we need a custom pre-processor to extract correct field,
# but want to also use default scikit-learn preprocessing (e.g. lowercasing)
default_preprocessor = CountVectorizer().build_preprocessor()
def build_preprocessor(field):
    field_idx = list(train.drop(['train_id', 'price'], axis=1).columns).index(field)
    return lambda x: default_preprocessor(x[field_idx])

vectorizer = FeatureUnion([
    ('name', CountVectorizer(
        ngram_range=(1, 2),
        max_features=50000,
        preprocessor=build_preprocessor('name'))),
    ('category_name', CountVectorizer(
        token_pattern='.+',
        preprocessor=build_preprocessor('category_name'))),
    ('brand_name', CountVectorizer(
        token_pattern='.+',
        preprocessor=build_preprocessor('brand_name'))),
    ('shipping', CountVectorizer(
        token_pattern='\d+',
        preprocessor=build_preprocessor('shipping'))),
    ('item_condition_id', CountVectorizer(
        token_pattern='\d+',
        preprocessor=build_preprocessor('item_condition_id'))),
    ('item_description', TfidfVectorizer(
        ngram_range=(1, 3),
        max_features=100000,
        preprocessor=build_preprocessor('item_description'))),
])

X_train = vectorizer.fit_transform(train.drop(['train_id', 'price'], axis=1).values)

CPU times: user 10min 3s, sys: 4min 45s, total: 14min 48s
Wall time: 3h 14min 38s


In [7]:
import pickle

pickle.dumps(build_preprocessor)

b'\x80\x03c__main__\nbuild_preprocessor\nq\x00.'

In [8]:
pkl_filename = 'vectorizer.pkl'
with open(pkl_filename, 'wb') as file:
    pickle.dump(vectorizer, file)

AttributeError: Can't pickle local object 'build_preprocessor.<locals>.<lambda>'

In [10]:
import _pickle as cPickle
with open('vectorizer.pk', 'wb') as fin:
    pickle.dump(vectorizer, fin)

AttributeError: Can't pickle local object 'build_preprocessor.<locals>.<lambda>'

In [7]:
%%time

def get_rmsle(y_true, y_pred):
    return np.sqrt(mean_squared_log_error(np.expm1(y_true), np.expm1(y_pred)))

# cv = KFold(n_splits=10, shuffle=True, random_state=42)
# for train_ids, valid_ids in cv.split(X_train):
#     model = Ridge(
#         solver='auto',
#         fit_intercept=True,
#         alpha=0.5,
#         max_iter=100
#         normalize=False,
#         tol=0.05
#     )
#     model.fit(X_train[train_ids], y_train[train_ids])
#     y_pred_valid = model.predict(X_train[valid_ids])
#     rmsle = get_rmsle(y_predict_valid, y_train[valid_ids])
#     print(f'valid rmsle: {rmsle.5f}')
#     break

model = linear_model.Ridge(
    solver='auto',
    fit_intercept=True,
    alpha=0.5,
    max_iter=100,
    normalize=False,
    tol=0.05
)

model.fit(X_train, y_train)

CPU times: user 51.2 s, sys: 2.71 s, total: 53.9 s
Wall time: 1min 47s


In [8]:
get_rmsle(y_train, model.predict(X_train))

0.4233332155616298

In [11]:
import pickle

pkl_filename = 'model.pkl'
with open(pkl_filename, 'wb') as file:
    pickle.dump(model, file)

In [15]:
pkl_filename = 'vectorizer.pkl'
with open(pkl_filename, 'wb') as file:
    pickle.dump(vectorizer, file)

AttributeError: Can't pickle local object 'build_preprocessor.<locals>.<lambda>'