In [None]:
#https://www.codexa.net/kaggle-mercari-price-suggestion-challenge/
#https://www.kaggle.com/c/mercari-price-suggestion-challenge

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split

from IPython.display import display
def _format_float(value):
    """Format a float with five decimal places for pandas display."""
    return '%.5f' % value

# Render every float in pandas output with a fixed number of decimals
pd.set_option('display.float_format', _format_float)

In [None]:
# How to unpack the downloaded archives is not covered here (look it up separately).

# Explicit dtypes for the id / numeric columns of each file
types_dict_train = dict(train_id='int64', item_condition_id='int8', price='float64', shipping='int8')
types_dict_test = dict(test_id='int64', item_condition_id='int8', shipping='int8')

# Read the tab-separated files into pandas DataFrames
train = pd.read_csv('train.tsv', sep='\t', dtype=types_dict_train, low_memory=True)
test = pd.read_csv('test.tsv', sep='\t', dtype=types_dict_test, low_memory=True)

In [None]:
# Inspect the first rows of train and test.
# A notebook cell only renders its last bare expression, so every check
# is passed to display() explicitly instead of being silently discarded.
display(train.head())
display(test.head())

# Check the (rows, columns) size of both frames
display(train.shape, test.shape)

In [None]:
# Display helper
def display_all(df):
    """Display `df` with the pandas row and column display limits raised to 1000.

    The limits are only changed for the duration of the call; the previous
    option values are restored when the context manager exits.
    """
    wide_limits = ("display.max_rows", 1000, "display.max_columns", 1000)
    with pd.option_context(*wide_limits):
        display(df)

# Show the summary statistics of train, transposed so each column becomes a row
display_all(
    train
    .describe(include='all')
    .T
)

In [None]:
# Convert category name, item description, listing title and brand name
# to the pandas "category" dtype in both frames.
# A single astype() mapping replaces eight copy-pasted attribute assignments;
# attribute-style column assignment (df.name = ...) is avoided because it only
# works while the column name does not clash with a DataFrame attribute.
text_columns = ['category_name', 'item_description', 'name', 'brand_name']
category_dtypes = {column: 'category' for column in text_columns}

train = train.astype(category_dtypes)
test = test.astype(category_dtypes)

# Check the resulting dtypes of both frames (display() so neither is dropped)
display(train.dtypes, test.dtypes)

In [None]:
# Number of distinct values per column in each frame.
# DataFrame.nunique() does this directly, and display() makes sure the
# train result is shown as well as the test result.
display(train.nunique(), test.nunique())

In [None]:
# Count and share of missing values per column, one table per frame.
# Every table is displayed explicitly; as bare expressions only the last
# one in the cell would have been rendered.
for frame in (train, test):
    missing_count = frame.isnull().sum()
    display(pd.DataFrame({
        'missing': missing_count,
        'ratio': missing_count / frame.shape[0],
    }))

In [None]:
#ここから事前処理

#trainとtestのデータを連結させる
#連結させたDataFrameの文字列のデータ形式を「category」へ変換
#文字列を数値へ値を変換
#訓練用データの「price」をnp.log()で処理
#ランダムフォレスト用にxとy（ターゲット）で分ける

In [None]:
# Give the id column the same name in both frames
train = train.rename(columns={'train_id': 'id'})
test = test.rename(columns={'test_id': 'id'})

# Marker column so the rows can be split back into train and test later
train['is_train'] = 1
test['is_train'] = 0

# Stack train (without the price target) on top of test.
# The original row labels are kept on purpose: the train rows keep the same
# index as `train`, which is what re-attaching `train.price` later relies on.
train_test_combine = pd.concat([train.drop(['price'], axis=1), test], axis=0)

# Check the result; head() is displayed explicitly because only the
# last bare expression of a cell is rendered.
display(train_test_combine.head())
train_test_combine.shape

In [None]:
# Convert the string columns to the "category" dtype and then to integer codes.
# The combined frame has to be converted on its own (separately from train/test).
# .cat.codes yields one integer per category; missing values become -1.
text_columns = ['category_name', 'item_description', 'name', 'brand_name']
for column in text_columns:
    # Bracket indexing instead of attribute assignment (df.name = ...), which
    # only works while the column name does not clash with a DataFrame attribute.
    train_test_combine[column] = train_test_combine[column].astype('category').cat.codes

# Check the result; head() is displayed explicitly so it is not dropped
display(train_test_combine.head())
train_test_combine.dtypes

In [None]:
# Split the combined frame back into test and train using the marker column,
# dropping the marker afterwards since it was only needed for this split.
is_train_flag = train_test_combine['is_train']
df_test = train_test_combine.loc[is_train_flag == 0].drop(['is_train'], axis=1)
df_train = train_test_combine.loc[is_train_flag == 1].drop(['is_train'], axis=1)

# Check both sizes (display() so the test shape is shown as well)
display(df_test.shape, df_train.shape)

In [None]:
# Re-attach the target and log-transform it in one vectorised step
# (replaces a per-row Python lambda over the whole column).
# Rule kept from the original: strictly positive prices become log(price),
# everything else (zero, negative, NaN) is left unchanged.
# NOTE(review): a price of 0 stays 0, which is also log(1); the np.exp()
# inverse used at prediction time therefore maps such rows back to 1.
raw_price = train.price
is_positive = raw_price > 0
# Masking before np.log avoids evaluating log on non-positive values.
log_price = np.log(raw_price.where(is_positive))
# Assignment aligns on the index, which df_train shares with train.
df_train['price'] = raw_price.where(~is_positive, log_price)

# Check the result
df_train.head()

In [None]:
# x = every column except price, y = price (the target)
x_train = df_train.drop(['price'], axis=1)
y_train = df_train.price

# Build and fit the model.
# random_state makes the forest reproducible across Restart & Run All;
# oob_score=True yields an out-of-bag R^2 from the same single fit, i.e. an
# estimate on rows each tree did not see during training.
m = RandomForestRegressor(
    n_jobs=-1,
    min_samples_leaf=5,
    n_estimators=200,
    oob_score=True,
    random_state=42,
)
m.fit(x_train, y_train)

# Show the scores: the training R^2 is measured on the data the model was
# fitted on, so it is shown next to the out-of-bag R^2 for comparison.
pd.Series({'train_r2': m.score(x_train, y_train), 'oob_r2': m.oob_score_})

In [None]:
# Predict for the test rows; the model was fitted on log-prices,
# so these predictions are in log space.
log_preds = m.predict(df_test)

# Undo the log transform and convert the NumPy array to a pandas Series
# (the unused, discarded np.exp(preds) statement of the original is removed).
preds = pd.Series(np.exp(log_preds))

# Pair ids with predictions by position. m.predict returns one value per row
# of df_test in row order, so this does not depend on df_test's index
# happening to match the default 0..n-1 index of `preds`.
# Column names follow the submission format required by Mercari.
submit = pd.DataFrame(
    {'test_id': df_test['id'].values, 'price': preds.values},
    columns=['test_id', 'price'],
)

# Write the submission file as CSV
submit.to_csv('submit_rf_base.csv', index=False)
 