### 載入套件

In [1]:
import gc # garbage collection
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt #繪圖
import seaborn as sns #繪圖
from scipy.sparse import csr_matrix, hstack
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

### 讀入資料集檔案

In [3]:
# Load the tab-separated training file into a DataFrame.
df = pd.read_csv('train.tsv', delimiter='\t')

Randomly split the data into train and test sets

In [4]:
# Randomly send roughly 80% of the rows to `train` and the rest to `test`.
# The mask is drawn from a seeded generator so the split (and every number
# computed from it) is the same on each Restart & Run All; an unseeded
# np.random.rand call produces a different split on every run.
SPLIT_SEED = 42
msk = np.random.RandomState(SPLIT_SEED).rand(len(df)) < 0.8
train = df[msk]
test = df[~msk]

In [None]:
# (rows, columns) of the train and test partitions, in that order.
tuple(part.shape for part in (train, test))

## Exploratory Data Analysis

Use only the training set for EDA so that the test set stays unseen.

In [None]:
train.head(5)  # preview the first five rows

In [12]:
train.info() # summary of every column in the data (non-null counts and dtypes)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1185670 entries, 0 to 1482534
Data columns (total 8 columns):
train_id             1185670 non-null int64
name                 1185670 non-null object
item_condition_id    1185670 non-null int64
category_name        1180643 non-null object
brand_name           679696 non-null object
price                1185670 non-null float64
shipping             1185670 non-null int64
item_description     1185666 non-null object
dtypes: float64(1), int64(3), object(4)
memory usage: 81.4+ MB


In [None]:
train_id：資料編號
name：商品名稱
item_condition_id：品項狀態（1–5）
category_name：類別名稱
brand_name：品牌名稱
price：價格
shipping：運送費用的支付者（0：買家，1：賣家）
item_description：商品描述

### 商品資料

In [9]:
# The twenty most frequent listing names and how often each occurs.
train['name'].value_counts().head(20)

Bundle                  1795
Reserved                 369
Converse                 362
Coach purse              336
BUNDLE                   336
Dress                    326
Lularoe TC leggings      308
Nike                     273
Romper                   266
Vans                     263
American Eagle Jeans     261
Miss Me Jeans            239
Lularoe OS leggings      230
ON HOLD                  213
Lularoe Irma             207
Coach Purse              203
Michael Kors Wallet      197
Shorts                   197
Miss me jeans            192
Bundle!                  192
Name: name, dtype: int64

In [16]:
# Distinct train_id count per listing name, largest first, top twenty rows.
(
    train.groupby('name')['train_id']
    .nunique()
    .reset_index()
    .sort_values(by='train_id', ascending=False)
    .head(20)
)

Unnamed: 0,name,train_id
188614,Bundle,1795
739505,Reserved,369
238886,Converse,362
122610,BUNDLE,336
233252,Coach purse,336
272761,Dress,326
499313,Lularoe TC leggings,308
629146,Nike,273
748059,Romper,266
886045,Vans,263


In [None]:
### 品項狀態資料

In [17]:
# Number of listings for each item condition code.
train['item_condition_id'].value_counts()

1    511881
3    345813
2    300325
4     25729
5      1922
Name: item_condition_id, dtype: int64

In [19]:
# Distinct train_id count for each item condition code, as a two-column frame.
(
    train.groupby('item_condition_id')['train_id']
    .nunique()
    .reset_index()
)

Unnamed: 0,item_condition_id,train_id
0,1,511881
1,2,300325
2,3,345813
3,4,25729
4,5,1922


### 類別資訊

### 品牌資訊

### Descriptive statistics for price

In [None]:
# Summary statistics of the price column.
train['price'].describe()

In [None]:
# Compare the raw price distribution (listings priced at 250 or less) with the
# distribution of log(price + 1). Both panels are plain count histograms with
# no density curve.
sns.set()
fig, (ax_raw, ax_log) = plt.subplots(1, 2)
# np.arange excludes its stop value, so the stop must be 251 for the last bin
# edge to land on 250. A stop of 250 ends the bins at 245 and drops prices in
# (245, 250] from the plot even though the filter keeps them.
ax_raw.hist(train.loc[train.price <= 250, 'price'], bins=np.arange(0, 251, 5), alpha=0.4)
ax_raw.set_xlabel('price')
# Plain matplotlib histograms stand in for sns.distplot(kde=False), which
# seaborn has deprecated.
ax_log.hist(np.log(train.price + 1), bins=50, alpha=0.4)
ax_log.set_xlabel('log(price+1)');

In [None]:
# Two-panel figure: raw price restricted to [0, 250] on the left,
# log(price + 1) on the right, each drawn with the pandas histogram plotter.
fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(12, 6))

train['price'].plot.hist(ax=ax_raw, bins=50, range=[0, 250], edgecolor='white')
ax_raw.set_xlabel('price', fontsize=12)
ax_raw.set_title('Price Distribution', fontsize=12)

np.log(train['price'] + 1).plot.hist(ax=ax_log, bins=50, edgecolor='white')
ax_log.set_xlabel('log(price+1)', fontsize=12)
ax_log.set_title('Price Distribution', fontsize=12)

In [None]:
# Fraction of training listings carrying each shipping flag.
train['shipping'].value_counts().div(len(train))

In [None]:
# Training-set prices split by the shipping flag: 0 is treated as "buyer pays",
# 1 as "seller pays" (per the variable names used here).
# The masks are built from `train` itself. Building them from the full `df`
# only works because pandas aligns the longer mask to train's index, and it
# would break as soon as `df` is deleted or re-indexed.
shipping_fee_by_buyer = train.loc[train['shipping'] == 0, 'price']
shipping_fee_by_seller = train.loc[train['shipping'] == 1, 'price']

In [None]:
# Overlaid price histograms (0-100 range) for the two shipping groups.
fig, ax = plt.subplots(figsize=(18, 8))
ax.hist(
    shipping_fee_by_seller,
    bins=50,
    range=[0, 100],
    color='#8CB4E1',
    alpha=1.0,
    label='Price when Seller pays Shipping',
)
ax.hist(
    shipping_fee_by_buyer,
    bins=50,
    range=[0, 100],
    color='#007D00',
    alpha=0.7,
    label='Price when Buyer pays Shipping',
)
ax.set_xlabel('price', fontsize=12)
ax.set_ylabel('frequency', fontsize=12)
ax.set_title('Price Distribution by Shipping Type', fontsize=15)
ax.tick_params(labelsize=12)
ax.legend()

In [None]:
# Report the mean price of each shipping group, rounded to two decimals.
for prices, suffix in (
    (shipping_fee_by_seller, 'if seller pays shipping'),
    (shipping_fee_by_buyer, 'if buyer pays shipping'),
):
    print('The average price is {}'.format(round(prices.mean(), 2)), suffix)

In [None]:
# Overlaid histograms of log(price + 1) for the two shipping groups.
fig, ax = plt.subplots(figsize=(18, 8))
ax.hist(
    np.log(shipping_fee_by_seller + 1),
    bins=50,
    color='#8CB4E1',
    alpha=1.0,
    label='Price when Seller pays Shipping',
)
ax.hist(
    np.log(shipping_fee_by_buyer + 1),
    bins=50,
    color='#007D00',
    alpha=0.7,
    label='Price when Buyer pays Shipping',
)
ax.set_xlabel('log(price+1)', fontsize=12)
ax.set_ylabel('frequency', fontsize=12)
ax.set_title('Price Distribution by Shipping Type', fontsize=15)
ax.tick_params(labelsize=12)
ax.legend()

In [None]:
# Report how many distinct category names appear in the training set.
print(
    'There are',
    train.category_name.nunique(),
    'unique values in category name column',
)

In [None]:
# Ten most common category names with their listing counts.
train.category_name.value_counts().head(10)