In [27]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import math
import gc
from sklearn.model_selection import train_test_split
from kernel import Evaluator
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelBinarizer
sns.set(style="whitegrid", color_codes=True)
%matplotlib inline

In [2]:
from sklearn.tree import DecisionTreeRegressor
regr_1 = DecisionTreeRegressor(max_depth=2)
regr_2 = DecisionTreeRegressor(max_depth=5)

In [40]:
print('Loading Dataset...')
train = pd.read_csv("train.tsv", sep='\t')
test = pd.read_csv("test.tsv", sep='\t')
print('Dataset loaded!...')

Loading Dataset...
Dataset loaded!...


In [41]:
merge: pd.DataFrame = pd.concat([train, test])
submission: pd.DataFrame = test[['test_id']]
print(submission.head())

   test_id
0        0
1        1
2        2
3        3
4        4


In [42]:
del train
del test
gc.collect()

386

In [43]:
def handle_missing_inplace(dataset):
    dataset['category_name'].fillna(value='missing', inplace=True)
    dataset['brand_name'].fillna(value='missing', inplace=True)
    dataset['item_description'].fillna(value='missing', inplace=True)

In [44]:
handle_missing_inplace(merge)

In [45]:
lb = LabelBinarizer(sparse_output=True)
X_brand = lb.fit_transform(merge['brand_name'])

In [34]:
from sklearn import preprocessing
lb = preprocessing.LabelBinarizer(sparse_output = True)
lb.fit([1, 2, 6, 4, 2])

LabelBinarizer(neg_label=0, pos_label=1, sparse_output=True)

In [35]:
lb.classes_

array([1, 2, 4, 6])

In [39]:
result = lb.transform([1, 6])
print(result)

  (0, 0)	1
  (1, 3)	1


In [24]:
print(X_brand[:10])
print(merge.head())

  (0, 5265)	1
  (1, 3889)	1
  (2, 4588)	1
  (3, 5265)	1
  (4, 5265)	1
  (5, 5265)	1
  (6, 84)	1
  (7, 4341)	1
  (8, 3337)	1
  (9, 5265)	1
  brand_name                                      category_name  \
0    missing                                  Men/Tops/T-shirts   
1      Razer  Electronics/Computers & Tablets/Components & P...   
2     Target                        Women/Tops & Blouses/Blouse   
3    missing                 Home/Home Décor/Home Décor Accents   
4    missing                            Women/Jewelry/Necklaces   

   item_condition_id                                   item_description  \
0                  3                                 No description yet   
1                  3  This keyboard is in great condition and works ...   
2                  1  Adorable top with a hint of lace and a key hol...   
3                  1  New with tags. Leather horses. Retail for [rm]...   
4                  1          Complete with certificate of authenticity   

        

In [4]:
pd.options.mode.chained_assignment = None  # default='warn'
X = train.loc[:, train.columns!='price']
y = train.loc[:, ['price']]

X.loc[:,['category_name']] = pd.Categorical(X.loc[:,'category_name'])
X['category_code'] = X.category_name.cat.codes
X.loc[:,['brand_name']] = pd.Categorical(X.loc[:,'brand_name'])
X['brand_code'] = X.brand_name.cat.codes

In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

In [6]:
B_train = X_train.loc[:, ['shipping', 'item_condition_id', 'brand_code', 'category_code']]
B_test = X_test.loc[:, ['shipping', 'item_condition_id', 'brand_code', 'category_code']]
print(len(B_train['brand_code'].unique()), len(B_test['brand_code'].unique()))
print(len(B_train['category_code'].unique()), len(B_test['category_code'].unique()))
X_train.columns
B_train.head()

4327 3499
1248 1185


Unnamed: 0,shipping,item_condition_id,brand_code,category_code
223243,1,1,1664,683
363758,1,1,-1,1137
1358119,0,1,-1,1137
198564,0,3,3057,559
71297,0,1,2604,1248


In [146]:
regr_1 = DecisionTreeRegressor(max_depth=2)
regr_2 = DecisionTreeRegressor(max_depth=20)
regr_1.fit(B_train, np.log1p(y_train))
regr_2.fit(B_train, np.log1p(y_train))
print(regr_1.feature_importances_)
print(regr_2.feature_importances_)

[ 0.62251124  0.          0.37748876  0.        ]
[ 0.1090445   0.08206338  0.31438133  0.49451079]


In [147]:
train_predict_1 = regr_1.predict(B_train)
train_predict_2 = regr_2.predict(B_train)
ev_train = Evaluator(y_train.values)
train_score_1 = ev_train.rmsle_loop(np.expm1(train_predict_1))
train_score_2 = ev_train.rmsle_loop(np.expm1(train_predict_2))
print(train_score_1, train_score_2)

0.7164622708120116 0.5378310384111897


In [148]:
y_1 = regr_1.predict(B_test)
y_2 = regr_2.predict(B_test)
ev = Evaluator(y_test.values)
# print(dir(ev))
score = ev.rmsle_loop(np.expm1(y_1))
score_2 = ev.rmsle_loop(np.expm1(y_2))
print(score, score_2)

0.7175152210923533 0.5687757302393023


In [7]:
Bd_train = pd.get_dummies(B_train, columns=['category_code'], sparse=True).astype(np.int8)
# Bd_test = pd.get_dummies(B_test, columns=['brand_code'])


In [9]:
Bd_train.head()

Unnamed: 0,shipping,item_condition_id,brand_code,category_code_-1,category_code_0,category_code_1,category_code_2,category_code_3,category_code_4,category_code_5,...,category_code_1277,category_code_1278,category_code_1279,category_code_1280,category_code_1281,category_code_1282,category_code_1283,category_code_1284,category_code_1285,category_code_1286
223243,1,1,-128,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
363758,1,1,-1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1358119,0,1,-1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
198564,0,3,-15,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
71297,0,1,44,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
regr_1 = DecisionTreeRegressor(max_depth=2)
regr_2 = DecisionTreeRegressor(max_depth=20)
regr_1.fit(Bd_train, np.log1p(y_train))
regr_2.fit(Bd_train, np.log1p(y_train))
print(regr_1.feature_importances_)
print(regr_2.feature_importances_)