In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
# Load the Mercari training data (tab-separated file) into a DataFrame.
df = pd.read_csv('08/mercari/train.tsv', sep='\t')
# Print (rows, columns), then preview the first rows as the cell output.
print(df.shape)
df.head()

(1482535, 8)


Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


In [3]:
def split_cat(category_name):
    """Split a slash-separated category path into its levels.

    Parameters
    ----------
    category_name : str or missing value
        Category path such as 'Men/Tops/T-shirts'. Anything that is not a
        string (e.g. the float NaN pandas uses for missing text, or None)
        is treated as a missing category.

    Returns
    -------
    list of str
        ``category_name.split('/')`` for strings; otherwise three
        'Other_Null' placeholders, one per category level.
    """
    # Explicit type check instead of a bare `except:`, which would also
    # swallow unrelated errors (KeyboardInterrupt, typos, ...) silently.
    if isinstance(category_name, str):
        return category_name.split('/')
    # Missing value: every one of the three levels becomes 'Other_Null'.
    return ['Other_Null', 'Other_Null', 'Other_Null']

In [4]:
# Apply split_cat to every category_name. zip(*...) transposes the per-row
# lists into one sequence per level, which become the three new columns.
df['cat_dae'], df['cat_jung'], df['cat_so'] = zip(*df['category_name'].apply(split_cat))
df.head(3)

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,cat_dae,cat_jung,cat_so
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet,Men,Tops,T-shirts
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...,Electronics,Computers & Tablets,Components & Parts
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...,Women,Tops & Blouses,Blouse


In [None]:
# Number of distinct values in each category-level column, one per line.
print(*(df[level].nunique() for level in ('cat_dae', 'cat_jung', 'cat_so')), sep='\n')

11
114
871


In [None]:
# Missing-value handling: replace nulls in category_name, brand_name and
# item_description with the same 'Other_Null' placeholder.
# The cat_dae / cat_jung / cat_so columns need nothing here because split_cat
# already returns 'Other_Null' for a missing category.
# The result is assigned back instead of calling fillna(inplace=True) on a
# column selection: that chained in-place form is deprecated in pandas 2.x and
# leaves df unchanged when Copy-on-Write is enabled.
df['category_name'] = df['category_name'].fillna('Other_Null')
df['brand_name'] = df['brand_name'].fillna('Other_Null')
df['item_description'] = df['item_description'].fillna('Other_Null')

In [None]:
# Per-column count of remaining missing values.
df.isna().sum()

train_id             0
name                 0
item_condition_id    0
category_name        0
brand_name           0
price                0
shipping             0
item_description     0
cat_dae              0
cat_jung             0
cat_so               0
dtype: int64

- 문자열 전처리 일괄적용

In [None]:
# Feature vectorization: 'name' uses raw token counts (CountVectorizer);
# 'item_description' is vectorized separately with TF-IDF.
cnt_vect = CountVectorizer()
X_name = cnt_vect.fit_transform(df['name'])
# (number of rows, number of count features)
print(X_name.shape)

(1482535, 105757)


In [None]:
# TF-IDF features for item_description: at most 50,000 features, 1- to 3-grams,
# English stop words removed.
# NOTE(review): the original note here said max_features was lowered to 10,000
# because 50,000 (the book's value) was too slow, but the code still passes
# 50000 — confirm which value is intended.
tfidf = TfidfVectorizer(max_features=50000, ngram_range=(1,3), stop_words='english')
X_desc = tfidf.fit_transform(df['item_description'])
print(X_desc.shape)

In [None]:
# One-hot encoding. LabelBinarizer(sparse_output=True) is used so that the
# encoded result comes back as a sparse matrix; the text feature vectors are
# sparse too, and everything is joined later with hstack.
from sklearn.preprocessing import LabelBinarizer


def _binarize_sparse(column):
    """Fit a fresh LabelBinarizer on ``column``; return (encoder, sparse one-hot matrix)."""
    encoder = LabelBinarizer(sparse_output=True)
    return encoder, encoder.fit_transform(column)


# Encode brand_name, item_condition_id, shipping and the three category levels.
lb_brand, X_brand = _binarize_sparse(df['brand_name'])
lb_con, X_con = _binarize_sparse(df['item_condition_id'])
lb_ship, X_ship = _binarize_sparse(df['shipping'])
lb_dae, X_dae = _binarize_sparse(df['cat_dae'])
lb_jung, X_jung = _binarize_sparse(df['cat_jung'])
lb_so, X_so = _binarize_sparse(df['cat_so'])

# All results share the same sparse type, so inspecting one is enough.
print(type(X_brand))

# Shapes of every encoded block.
print(X_brand.shape, X_con.shape, X_ship.shape)
print(X_dae.shape, X_jung.shape, X_so.shape)

In [None]:
# Run the garbage collector before the feature vectors and one-hot matrices
# are combined with hstack in the following cell.
import gc
gc.collect()

In [None]:
from scipy.sparse import hstack
import gc

sparse_matrix_list = (X_name, X_desc, X_brand, X_con, X_ship, X_dae, X_jung, X_so) # every feature block built in the earlier cells
X_sparse = hstack(sparse_matrix_list).tocsr() # all blocks joined column-wise, converted to CSR
print(type(X_sparse), X_sparse.shape)

# The combined matrix was built only to print its type and shape; it takes a
# lot of memory, so delete it as soon as that is done.
del X_sparse
gc.collect()

### 머신러닝 회귀 모델에 적용 - 릿지, LightGBM
- 릿지

In [None]:
# Evaluation metric: RMSLE
def rmsle(y, pred):
    """Return the root mean squared logarithmic error between ``y`` and ``pred``.

    Both inputs are passed through ``np.log1p`` before the difference is taken.
    """
    log_diff = np.log1p(y) - np.log1p(pred)
    mean_squared_log_error = np.mean(log_diff ** 2)
    return np.sqrt(mean_squared_log_error)

# The model is meant to be trained on a log1p-transformed 'price', so both the
# true values (y_test) and the predictions arrive on the log scale. They are
# mapped back to the original price scale before RMSLE is computed.
def evaluate_org_price(y_test, preds):
    """Return RMSLE on the original price scale, rounded to 3 decimals.

    ``y_test`` and ``preds`` are both inverted with ``np.expm1`` and then
    passed to ``rmsle``.
    """
    actual_price = np.expm1(y_test)
    predicted_price = np.expm1(preds)
    return np.round(rmsle(actual_price, predicted_price), 3)

In [None]:
# Supervised-regression helper: fit a model and predict on a hold-out split.
import gc
gc.collect()
from scipy.sparse import hstack

def model_train_predict(model, X_list, log_target=True):
    """Fit ``model`` on an 80/20 split of the stacked features and predict the hold-out rows.

    Parameters
    ----------
    model : estimator
        An already-constructed regressor exposing ``fit`` and ``predict``.
    X_list : sequence of sparse matrices
        Feature blocks that are stacked column-wise; their rows must line up
        with the rows of the global ``df``.
    log_target : bool, default True
        When True, the target is ``np.log1p(df['price'])``. The notebook never
        log-transforms ``price`` itself, while ``evaluate_org_price`` applies
        ``expm1`` to both outputs of this function, so training on the raw
        price would overflow and yield an inf/NaN RMSLE. Pass False only if
        ``df['price']`` has already been log-transformed elsewhere.

    Returns
    -------
    (y_test, preds)
        Hold-out targets and the model's predictions, on the same scale the
        model was trained on (log1p scale when ``log_target`` is True).
    """
    # Join all feature blocks into a single CSR matrix.
    X = hstack(X_list).tocsr()
    # Computed locally so that df is never mutated and re-running stays idempotent.
    y = np.log1p(df['price']) if log_target else df['price']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=156)

    # Train and predict; the model object is created by the caller.
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    # Release the large intermediates right away; they use a lot of memory.
    del X, X_train, X_test, y_train
    gc.collect()

    # Only the true values (y_test) and the predictions are returned.
    return y_test, preds
    

In [None]:
# Model object: Ridge regression with the 'lsqr' solver and no intercept term
linear_model = Ridge(solver='lsqr', fit_intercept=False)

# RMSLE when the item_description features are left out
X_list = (X_name, X_brand, X_con, X_ship, X_dae, X_jung, X_so)
y_test, preds = model_train_predict(linear_model, X_list)
print(evaluate_org_price(y_test, preds))

In [None]:
# RMSLE when the item_description features are included
X_list = (X_desc, X_name, X_brand, X_con, X_ship, X_dae, X_jung, X_so)
y_test, ridge_preds = model_train_predict(linear_model, X_list) # predictions are kept in ridge_preds
print(evaluate_org_price(y_test, ridge_preds))

In [None]:
# Run the garbage collector before the next model is trained.
import gc
gc.collect()

- LightGBM

In [None]:
from lightgbm import LGBMRegressor

X_list = (X_desc, X_name, X_brand, X_con, X_ship, X_dae, X_jung, X_so)
# Model object: a larger n_estimators would help, but it is kept at 200 because of memory limits
lgbm = LGBMRegressor(n_estimators=200, learning_rate=0.5, num_leaves=125, random_state=156) 
# RMSLE result
y_test, lgbm_preds = model_train_predict(lgbm, X_list) # predictions are kept in lgbm_preds
print(evaluate_org_price(y_test, lgbm_preds))

- 앙상블 모델(릿지:0.55, lgbm:0.45 가중치) ==> 최종 성능 가장 뛰어남!

In [None]:
# Weighted blend of the two models' predictions: 0.45 LightGBM + 0.55 Ridge.
preds = 0.45 * lgbm_preds + 0.55 * ridge_preds
print(evaluate_org_price(y_test, preds))