In [18]:
import pandas as pd
import numpy as np

from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelBinarizer
from scipy.sparse import csr_matrix, hstack
from sklearn.metrics import mean_squared_log_error

### bonston_dateset

In [19]:
dataset = load_boston()

X= pd.DataFrame(dataset.data, columns=dataset.feature_names)
y = pd.DataFrame(dataset.target, columns=['target'])

In [20]:
X.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [21]:
y.head()

Unnamed: 0,target
0,24.0
1,21.6
2,34.7
3,33.4
4,36.2


In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2019)

ridge = Ridge()
ridge.fit(X_train, y_train)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

In [23]:
pred=ridge.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, pred)) 
print('RMSE: %.3f' % rmse)

RMSE: 5.175


### Preprocessing method（BoW）

In [24]:
text_df = pd.DataFrame({'text':np.array([
        'りんご みかん バナナ', 
        'Go  Python', 
        'みかん アボカド りんご',
        'ぶどう', 
        'Python',
        'バナナ go　バナナ'
        ])
             })

text_df

Unnamed: 0,text
0,りんご みかん バナナ
1,Go Python
2,みかん アボカド りんご
3,ぶどう
4,Python
5,バナナ go　バナナ


In [25]:
# BoWベクトルを作成する関数を定義（インスタンスの作成）

cv = CountVectorizer()

In [26]:
# BoWベクトル作成を実行

cv_vec = cv.fit_transform(text_df['text'])

In [27]:
# BoWベクトルは疎行列で生成される、中身を見るときはarrayに変換する

cv_vec.toarray()

array([[0, 0, 0, 1, 1, 0, 1],
       [1, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 1, 1, 0],
       [0, 0, 1, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 2]], dtype=int64)

In [28]:
# CountVectolizer（BoWベクトル）の特徴量を抽出

col = cv.get_feature_names()
col[:]

['go', 'python', 'ぶどう', 'みかん', 'りんご', 'アボカド', 'バナナ']

In [29]:
# データフレーム化

pd.DataFrame(cv_vec.toarray(), columns=col)

# やっていることは、文字データを単語分解して、ワンホットするようなイメージ

Unnamed: 0,go,python,ぶどう,みかん,りんご,アボカド,バナナ
0,0,0,0,1,1,0,1
1,1,1,0,0,0,0,0
2,0,0,0,1,1,1,0
3,0,0,1,0,0,0,0
4,0,1,0,0,0,0,0
5,1,0,0,0,0,0,2


### Preprocessing method TF-IDF

In [30]:
# インスタンスの作成

tfidf = TfidfVectorizer()

In [31]:
# TF-IDFの計算を実行

tfidf_vec = tfidf.fit_transform(text_df['text'])

In [32]:
# TF-IDFの特徴量を抽出

col = tfidf.get_feature_names()

In [33]:
# 特徴量とTF-IDFをデータフレーム化する

pd.DataFrame(tfidf_vec.toarray(), columns=col)

Unnamed: 0,go,python,ぶどう,みかん,りんご,アボカド,バナナ
0,0.0,0.0,0.0,0.57735,0.57735,0.0,0.57735
1,0.707107,0.707107,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.535506,0.535506,0.653044,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0
5,0.447214,0.0,0.0,0.0,0.0,0.0,0.894427


### Preprocessing（Target）

In [34]:
train = pd.read_csv('train.tsv', sep='\t')

In [35]:
# 目的値が連続値の場合、誤差が大きくなるので対数化する（最後には戻す）

train.price.head()

0    10.0
1    52.0
2    10.0
3    35.0
4    44.0
Name: price, dtype: float64

In [36]:
# 対数化を行う

np.log1p(train.price.head())

0    2.397895
1    3.970292
2    2.397895
3    3.583519
4    3.806662
Name: price, dtype: float64

In [37]:
# 対数化をもとに戻す

np.expm1(np.log1p(train.price.head()))

0    10.0
1    52.0
2    10.0
3    35.0
4    44.0
Name: price, dtype: float64

In [38]:
# 標準化でも良い

sc = StandardScaler()
y_sc = sc.fit_transform(y)
y_sc[:5]

array([[ 0.15968566],
       [-0.10152429],
       [ 1.32424667],
       [ 1.18275795],
       [ 1.48750288]])

In [39]:
# 標準化をもとに戻す

y_sc = sc.inverse_transform(y_sc)
y_sc[:5]

array([[24. ],
       [21.6],
       [34.7],
       [33.4],
       [36.2]])

### mercari-price-suggestion-challenge

In [40]:
train = pd.read_csv('train.tsv', sep='\t')
test = pd.read_csv('test.tsv', sep='\t')

In [41]:
train.shape
test.shape

(693359, 7)

In [42]:
all_df = pd.concat([train, test])
all_df.info(null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2175894 entries, 0 to 693358
Data columns (total 9 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   train_id           1482535 non-null  float64
 1   name               2175894 non-null  object 
 2   item_condition_id  2175894 non-null  int64  
 3   category_name      2166509 non-null  object 
 4   brand_name         1247687 non-null  object 
 5   price              1482535 non-null  float64
 6   shipping           2175894 non-null  int64  
 7   item_description   2175890 non-null  object 
 8   test_id            693359 non-null   float64
dtypes: float64(3), int64(2), object(4)
memory usage: 166.0+ MB


In [43]:
all_df.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,test_id
0,0.0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet,
1,1.0,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...,
2,2.0,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...,
3,3.0,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...,
4,4.0,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity,


In [44]:
# category_nameのユニーク数を確認する

all_df.category_name.nunique()

1310

In [45]:
# name カラムは BoW（スパース化されている）

cv_name = CountVectorizer()
name_data = cv_name.fit_transform(all_df['name'])

In [46]:
# item_description は TF-IDF（スパース化されている）

all_df['item_description'].fillna(value='missing', inplace=True)

tv = TfidfVectorizer()
description_data = tv.fit_transform(all_df['item_description'])

In [47]:
# category_name は onehot（get_dummies だとメモリオーバーになる、スパース化する）

all_df['category_name'].fillna(value='missing', inplace=True)

lb = LabelBinarizer(sparse_output=True)
category_data = lb.fit_transform(all_df['category_name'])

In [48]:
#　brand_name は onehot（スパース化する）

all_df['brand_name'].fillna(value='missing', inplace=True)

lb = LabelBinarizer(sparse_output=True)
brand_data = lb.fit_transform(all_df['brand_name'])

In [49]:
# こちらの2つは onehot、あとで疎行列を結合するためにスパース化しておく

onehot_cols = ['item_condition_id', 'shipping']
onehot_data = csr_matrix(pd.get_dummies(all_df[onehot_cols], sparse=True))

In [50]:
# 上記でスパース化した特徴量を結合する

X_sparse = hstack((onehot_data, description_data, brand_data, category_data, name_data)).tocsr()

In [51]:
# 目的変数を対数化

y = np.log1p(train["price"])

In [52]:
# trainデータの行数

nrow_train = train.shape[0]
nrow_train

1482535

In [53]:
# データ量が多いので、元データの行数にスケールを合わせる

X = X_sparse[:nrow_train]

In [54]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2019)

In [55]:
model = Ridge()
model.fit(X_train, y_train)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

In [56]:
# コンペで用意されている評価指標（対数化も元通りになっている）

def get_rmsle(y_test, y_pred):
    return np.sqrt(mean_squared_log_error(np.expm1(y_test), np.expm1(y_pred)))

In [57]:
y_pred = model.predict(X_test)

In [59]:
# y_test は train_test_split で存在している

print(get_rmsle(y_test, y_pred))

0.47244140408863133
