In [2]:
# load package
import numpy as np
import pandas as pd
import jieba
from collections import Counter
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.cross_validation  import train_test_split
from sklearn import metrics
from sklearn import cross_validation

# set path
import os
default_path = "/Users/francislin/jrml/What's Cooking/"
os.chdir(default_path)

# skip interaction warning
import warnings; 
warnings.filterwarnings('ignore')
#你可以选择修改 ast_note_iteractively kernel 选项来使得 Jupyter 为每一行的变量或语句执行这个操作，以便你可以立即看到多条语句一起输出
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"



In [3]:
train=pd.read_json('./data/train.json')
test=pd.read_json('./data/test.json')
submission=pd.read_csv('./data/sample_submission.csv')

In [4]:
train.head()

Unnamed: 0,cuisine,id,ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,indian,22213,"[water, vegetable oil, wheat, salt]"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe..."


In [5]:
test.head()

Unnamed: 0,id,ingredients
0,18009,"[baking powder, eggs, all-purpose flour, raisi..."
1,28583,"[sugar, egg yolks, corn starch, cream of tarta..."
2,41580,"[sausage links, fennel bulb, fronds, olive oil..."
3,29752,"[meat cuts, file powder, smoked sausage, okra,..."
4,35687,"[ground black pepper, salt, sausage casings, l..."


In [6]:
df=train.append(test, ignore_index=True)
del (train, test)

In [7]:
content = list(df['ingredients'])
contenttxt = ''.join(str(e) for e in content)
#把全部要分析的ingredients灌進一個string

In [8]:
#把結巴沒斷好的拿掉
garbage = ['teeeest']

In [9]:
# 計算詞頻
def get_words(txt, num=350):
    seg_list = jieba.cut(txt, cut_all=False)
    temp = list(seg_list)
    temp = [x.strip() for x in temp]
    temp = [x for x in temp if x not in garbage]
    c = Counter()
    for x in temp:
        if len(x)>1 and x != '\r\n':
            c[x] += 1
    return c.most_common(num)

In [10]:
frq = get_words(contenttxt)
print(frq)
print(type(frq))

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/ps/4m0fjtss0nv0lbhy26n3224m0000gn/T/jieba.cache
Loading model cost 0.806 seconds.
Prefix dict has been built succesfully.


[('pepper', 31909), ('salt', 30573), ('oil', 29048), ('garlic', 23614), ('ground', 22836), ('fresh', 22161), ('sauce', 15987), ('sugar', 15621), ('onions', 15450), ('cheese', 14638), ('chicken', 14300), ('olive', 13539), ('black', 13465), ('water', 12219), ('red', 11525), ('flour', 11082), ('butter', 10804), ('tomatoes', 10717), ('green', 10643), ('powder', 10302), ('chopped', 9802), ('cloves', 9422), ('juice', 8970), ('white', 8626), ('onion', 8609), ('eggs', 8170), ('rice', 7661), ('cream', 7628), ('cilantro', 7432), ('milk', 7353), ('lemon', 7288), ('vegetable', 7068), ('leaves', 6837), ('large', 6787), ('ginger', 6732), ('corn', 6672), ('dried', 6647), ('lime', 6199), ('vinegar', 6185), ('soy', 6076), ('all', 6054), ('purpose', 6045), ('cumin', 5809), ('broth', 5556), ('chili', 5406), ('wine', 5334), ('bell', 5313), ('parsley', 5172), ('sesame', 4391), ('beans', 4353), ('grated', 4093), ('kosher', 4012), ('carrots', 4005), ('extra', 3856), ('basil', 3835), ('beef', 3798), ('dry', 3

In [11]:
#計算出用字詞頻，以利未來製作feature標籤
df_frq = pd.DataFrame(frq, columns=["words","frq"])
df_frq

Unnamed: 0,words,frq
0,pepper,31909
1,salt,30573
2,oil,29048
3,garlic,23614
4,ground,22836
5,fresh,22161
6,sauce,15987
7,sugar,15621
8,onions,15450
9,cheese,14638


In [17]:
df_frq.to_csv("df_frq.csv", index= False)

In [12]:
#用結巴出來的結果系統性貼標
for a in df_frq['words']:
    df['used_%s'% (a)]= df['ingredients'].str.contains('%s'% (a),na=False)*1

In [13]:
df.head()

Unnamed: 0,cuisine,id,ingredients,used_pepper,used_salt,used_oil,used_garlic,used_ground,used_fresh,used_sauce,...,used_lasagna,used_vegetables,used_grape,used_sirloin,used_prosciutto,used_whiskey,used_pods,used_monterey,used_beer,used_andouille
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,indian,22213,"[water, vegetable oil, wheat, salt]",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
del (df_frq, frq, contenttxt, content)

In [16]:
df=df.drop('ingredients',axis=1)

KeyError: "labels ['ingredients'] not contained in axis"

In [17]:
# 區分train test
dataTrain = df[pd.notnull(df['cuisine'])]
dataTest = df[~pd.notnull(df['cuisine'])]
dataTrain.columns
del(df)

Index(['cuisine', 'id', 'used_pepper', 'used_salt', 'used_oil', 'used_garlic',
       'used_ground', 'used_fresh', 'used_sauce', 'used_sugar',
       ...
       'used_lasagna', 'used_vegetables', 'used_grape', 'used_sirloin',
       'used_prosciutto', 'used_whiskey', 'used_pods', 'used_monterey',
       'used_beer', 'used_andouille'],
      dtype='object', length=352)

In [18]:
#dataTrain = dataTrain
#去掉y
dataTest = dataTest.drop('cuisine',axis=1)
#去掉key
X_feature = dataTrain.drop(['id','cuisine'],axis=1)
y_t =dataTrain['cuisine']

In [19]:
X_feature.head()

Unnamed: 0,used_pepper,used_salt,used_oil,used_garlic,used_ground,used_fresh,used_sauce,used_sugar,used_onions,used_cheese,...,used_lasagna,used_vegetables,used_grape,used_sirloin,used_prosciutto,used_whiskey,used_pods,used_monterey,used_beer,used_andouille
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
#把上面dataTrain.columns的結果貼到dataTrain，再去掉'Survived'
from sklearn.model_selection import train_test_split
#拿掉feature_importances低的因子
X_train, X_test, y_train, y_test = train_test_split(X_feature, y_t, test_size=0.3, random_state=13)

In [21]:
lgb_train=lgb.Dataset(X_train, y_train)
lgb_eval=lgb.Dataset(X_test, y_test, reference=lgb_train)

In [22]:
params={
    'task':'train',
    'boosting_type':'gbdt',
    'objective':'multiclass',
    'metric':'multi_logloss',
    'num_class':20,
    'num_leaves':64,
    'learning_rate':0.05,
    'feature_fraction':0.7,
    'bagging_fraction':0.9,
    'n_estimators':10,
    'verbose':0
        
}

print('Staring training...')

gbm=lgb.train(
    params,
    lgb_train,
    num_boost_round=20,
    valid_sets=lgb_eval,
    early_stopping_rounds=5
)

Staring training...


LightGBMError: Cannot construct Dataset since there are not useful features.
It should be at least two unique rows.
If the num_row (num_data) is small, you can set min_data=1 and min_data_in_bin=1 to fix this.
Otherwise please make sure you are using the right dataset