In [1]:
%matplotlib inline 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
# データの読み込み
df = pd.read_csv('../data/recipeData.csv')
df.head()

Unnamed: 0,BeerID,Name,URL,Style,StyleID,Size(L),OG,FG,ABV,IBU,...,BoilGravity,Efficiency,MashThickness,SugarScale,BrewMethod,PitchRate,PrimaryTemp,PrimingMethod,PrimingAmount,UserId
0,1,Vanilla Cream Ale,/homebrew/recipe/view/1633/vanilla-cream-ale,Cream Ale,45,21.77,1.055,1.013,5.48,17.65,...,1.038,70.0,,Specific Gravity,All Grain,,17.78,corn sugar,4.5 oz,116.0
1,2,Southern Tier Pumking clone,/homebrew/recipe/view/16367/southern-tier-pumk...,Holiday/Winter Special Spiced Beer,85,20.82,1.083,1.021,8.16,60.65,...,1.07,70.0,,Specific Gravity,All Grain,,,,,955.0
2,3,Zombie Dust Clone - EXTRACT,/homebrew/recipe/view/5920/zombie-dust-clone-e...,American IPA,7,18.93,1.063,1.018,5.91,59.25,...,,70.0,,Specific Gravity,extract,,,,,
3,4,Zombie Dust Clone - ALL GRAIN,/homebrew/recipe/view/5916/zombie-dust-clone-a...,American IPA,7,22.71,1.061,1.017,5.8,54.48,...,,70.0,,Specific Gravity,All Grain,,,,,
4,5,Bakke Brygg Belgisk Blonde 50 l,/homebrew/recipe/view/89534/bakke-brygg-belgis...,Belgian Blond Ale,20,50.0,1.06,1.01,6.48,17.84,...,1.05,72.0,,Specific Gravity,All Grain,,19.0,Sukkerlake,6-7 g sukker/l,18325.0


In [3]:
# 取り扱い種類数の多い10スタイルに限定。
df.groupby(['Style'])['Style'].count()\
                     .reset_index(name='count')\
                     .sort_values(['count'], ascending=False)\
                     .reset_index(drop=True)\
                     .head(10)
                             

Unnamed: 0,Style,count
0,American IPA,11940
1,American Pale Ale,7581
2,Saison,2617
3,American Light Lager,2277
4,American Amber Ale,2038
5,Blonde Ale,1753
6,Imperial IPA,1478
7,American Stout,1268
8,Irish Red Ale,1204
9,American Brown Ale,1152


In [4]:
new_df = df.query('StyleID in (92, 6, 12, 86, 30, 4, 9, 134, 10, 7)')
new_df.head()

Unnamed: 0,BeerID,Name,URL,Style,StyleID,Size(L),OG,FG,ABV,IBU,...,BoilGravity,Efficiency,MashThickness,SugarScale,BrewMethod,PitchRate,PrimaryTemp,PrimingMethod,PrimingAmount,UserId
2,3,Zombie Dust Clone - EXTRACT,/homebrew/recipe/view/5920/zombie-dust-clone-e...,American IPA,7,18.93,1.063,1.018,5.91,59.25,...,,70.0,,Specific Gravity,extract,,,,,
3,4,Zombie Dust Clone - ALL GRAIN,/homebrew/recipe/view/5916/zombie-dust-clone-a...,American IPA,7,22.71,1.061,1.017,5.8,54.48,...,,70.0,,Specific Gravity,All Grain,,,,,
5,6,Sierra Nevada Pale Ale Clone,/homebrew/recipe/view/28546/sierra-nevada-pale...,American Pale Ale,10,24.61,1.055,1.013,5.58,40.12,...,1.047,79.0,,Specific Gravity,All Grain,1.0,,,,5889.0
6,7,Russian River Pliny the Elder (original),/homebrew/recipe/view/37534/russian-river-plin...,Imperial IPA,86,22.71,1.072,1.018,7.09,268.71,...,,75.0,,Specific Gravity,All Grain,,,,,1051.0
9,10,Mango Habanero IPA,/homebrew/recipe/view/61082/mango-habanero-ipa,Imperial IPA,86,20.82,1.08,1.017,8.22,93.02,...,1.058,70.0,,Specific Gravity,All Grain,,21.11,Corn Sugar,4.6 oz / .66 C,


In [5]:
new_df.Style.unique()

array(['American IPA', 'American Pale Ale', 'Imperial IPA', 'Saison',
       'Blonde Ale', 'American Brown Ale', 'American Amber Ale',
       'American Stout', 'Irish Red Ale', 'American Light Lager'],
      dtype=object)

In [6]:
# 新しく振り直したIDリストを作成、出力
ids = pd.DataFrame(np.arange(1,11).reshape(-1,1))
new_df_ids = new_df[~new_df.duplicated(subset='Style', keep='first')].loc[:,['Style', 'StyleID']].reset_index()
new_df_ids = pd.concat([new_df_ids, ids], axis=1)
new_df_ids # 体裁整える前


Unnamed: 0,index,Style,StyleID,0
0,2,American IPA,7,1
1,5,American Pale Ale,10,2
2,6,Imperial IPA,86,3
3,16,Saison,134,4
4,40,Blonde Ale,30,5
5,45,American Brown Ale,6,6
6,48,American Amber Ale,4,7
7,83,American Stout,12,8
8,122,Irish Red Ale,92,9
9,221,American Light Lager,9,10


In [7]:
new_df_ids = new_df_ids.iloc[:,[1,3]]
new_df_ids.columns = ['Style', 'class_ids']
new_df_ids.to_csv('../data/class_ids.csv',index=None)


In [8]:
beer_dict = {}
for i in range(10):
    style_name = new_df_ids.loc[i,'Style']
    style_id = new_df_ids.loc[i, 'class_ids']
    beer_dict[style_name] = style_id
    
beer_dict

{'American IPA': 1,
 'American Pale Ale': 2,
 'Imperial IPA': 3,
 'Saison': 4,
 'Blonde Ale': 5,
 'American Brown Ale': 6,
 'American Amber Ale': 7,
 'American Stout': 8,
 'Irish Red Ale': 9,
 'American Light Lager': 10}

In [9]:
# 今回のモデリングに使用しないカラムを削除
USE_COLUMNS = ['Style', 'Size(L)', 'OG', 'FG',
       'ABV', 'IBU', 'Color', 'BoilSize', 'BoilTime', 'BoilGravity',
       'Efficiency', 'MashThickness', 'SugarScale', 'BrewMethod', 'PitchRate',
       'PrimaryTemp', 'PrimingMethod', 'PrimingAmount',]

new_df.loc[:,'Style'] = new_df.loc[:,'Style'].map(beer_dict)
new_df = new_df.loc[:, USE_COLUMNS]
new_df = new_df.rename(columns={'Style': 'class'})
new_df.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Unnamed: 0,class,Size(L),OG,FG,ABV,IBU,Color,BoilSize,BoilTime,BoilGravity,Efficiency,MashThickness,SugarScale,BrewMethod,PitchRate,PrimaryTemp,PrimingMethod,PrimingAmount
2,1,18.93,1.063,1.018,5.91,59.25,8.98,22.71,60,,70.0,,Specific Gravity,extract,,,,
3,1,22.71,1.061,1.017,5.8,54.48,8.5,26.5,60,,70.0,,Specific Gravity,All Grain,,,,
5,2,24.61,1.055,1.013,5.58,40.12,8.0,29.34,70,1.047,79.0,,Specific Gravity,All Grain,1.0,,,
6,3,22.71,1.072,1.018,7.09,268.71,6.33,30.28,90,,75.0,,Specific Gravity,All Grain,,,,
9,3,20.82,1.08,1.017,8.22,93.02,8.29,28.39,60,1.058,70.0,,Specific Gravity,All Grain,,21.11,Corn Sugar,4.6 oz / .66 C


In [10]:
df_train, df_test = train_test_split(new_df, test_size=0.2, random_state=0)
df_train.to_csv('../data/train.csv', index=None)
df_test.to_csv('../data/test.csv', index=None)
