In [1]:
import re
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import KFold

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
train = pd.read_csv('../data/train.csv', sep=';', header=None, 
                    names=['fixed acidity','volatile acidity','citric acid',
                           'residual sugar','chlorides','free sulfur dioxide','total sulfur dioxide',
                           'density','pH','sulphates','alcohol','quality'])
columns = train.columns.tolist()
columns.remove('quality')
train.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.3,0.19,0.27,13.9,0.057,45,155,0.99807,2.94,0.41,8.8,8
1,6.2,0.2,0.49,1.6,0.065,17,143,0.9937,3.22,0.52,9.2,6
2,7.1,0.23,0.35,16.5,0.04,60,171,0.999,3.16,0.59,9.1,6
3,7.5,0.2,0.47,16.9,0.052,51,188,0.99944,3.09,0.62,9.3,5
4,7.0,0.15,0.38,15.3,0.045,54,120,0.9975,3.18,0.42,9.8,6


In [3]:
print(train['quality'].value_counts())
train[train['quality'] == 'quality']

6          1535
5          1024
7           593
8           125
4           104
3            13
9             3
quality       1
Name: quality, dtype: int64


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
629,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality


In [4]:
train.drop(train.index[629], inplace=True)
train['quality'] = train['quality'].astype(int) - 3

In [5]:
test = pd.read_csv('../data/test.csv', sep=';', header=None, 
                    names=['fixed acidity','volatile acidity','citric acid',
                           'residual sugar','chlorides','free sulfur dioxide','total sulfur dioxide',
                           'density','pH','sulphates','alcohol'])
data = pd.concat([train, test], ignore_index=True)
data = data.astype(float)
test.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,8.9,0.3,0.35,4.6,0.032,32.0,148.0,0.99458,3.15,0.45,11.5
1,6.9,0.31,0.32,1.2,0.024,20.0,166.0,0.99208,3.05,0.54,9.8
2,6.3,0.19,0.29,2.0,0.022,33.0,96.0,0.98902,3.04,0.54,12.8
3,8.4,0.31,0.31,0.95,0.021,52.0,148.0,0.99038,2.93,0.32,11.5
4,7.0,0.24,0.24,1.8,0.047,29.0,91.0,0.99251,3.3,0.43,9.9


In [6]:
num_labels = len(set(train['quality'].values))
print('num_labels:{}'.format(num_labels))
train['quality'].value_counts()

num_labels:7


3    1535
2    1024
4     593
5     125
1     104
0      13
6       3
Name: quality, dtype: int64

In [7]:
def binning(col, cut_points, labels=None):

    #Define min and max values:

    minval = col.min()

    maxval = col.max()

    #利用最大值和最小值创建分箱点的列表
    print(minval, cut_points, maxval)
    break_points = [minval] + cut_points + [maxval]
    print(break_points)
    #如果没有标签，则使用默认标签0 ... (n-1)

    if not labels:

        labels = range(len(cut_points)+1)

    #使用pandas的cut功能分箱

    colBin = pd.cut(col,bins=break_points,labels=labels,include_lowest=True)

    return colBin

#为年龄分箱:

cut_points = [10, 20, 30, 40, 50, 60]

labels = ["1111","2222","3333", "4444", "5555", '6666', '7']

data["free_sulfur_dioxide_bin"] = binning(data["free sulfur dioxide"], cut_points, labels)

2.0 [10, 20, 30, 40, 50, 60] 289.0
[2.0, 10, 20, 30, 40, 50, 60, 289.0]


In [8]:
object_columns = ['free_sulfur_dioxide_bin']
for col in object_columns:
    data = pd.concat([data, pd.get_dummies(data[col], prefix=col+'_')], axis=1)
    data.drop(col, axis=1, inplace=True)

In [9]:
columns = data.columns.tolist()
columns.remove('density')
columns.remove('free sulfur dioxide')
columns.remove('quality')

In [10]:
# def data_expand(data, label, num):
#     tmp = data[data['quality'] == label]
#     for i in range(num):
#         data = data.append(tmp)
#     return data
# train = data_expand(train, 0, 10)
# train = data_expand(train, 1, 5)
# train = data_expand(train, 5, 3)
# train = data_expand(train, 6, 30)
# train['quality'].value_counts()

In [11]:
# train_expand = train.append(train[train['quality'] == 6])
# x_train = train_expand[columns].astype(float)
# y_train = train_expand['quality']
# from imblearn.over_sampling import SMOTE
# from collections import Counter
# smo = SMOTE(random_state=0)
# x_smo, y_smo = smo.fit_sample(x_train, y_train)
# print(Counter(y_smo))

# y_smo = y_smo.reshape(-1, 1)
# train = np.concatenate((y_smo, x_smo), axis=1)
# train = pd.DataFrame(train, columns=['quality'] + columns)
# train.head()

In [12]:
train = data[data['quality'].notnull()]
test = data[data['quality'].isnull()]

In [13]:
x_train = train[columns].astype(float)
y_train = train['quality']
x_test = test[columns]

params = {
    'num_leaves': 60,
    'min_data_in_leaf': 30,
    'objective': 'multiclass',
    'num_class': num_labels,
    'max_depth': -1,
    'learning_rate': 0.03,
    "min_sum_hessian_in_leaf": 6,
    "boosting": "gbdt",
    "feature_fraction": 0.9,
    "bagging_freq": 1,
    "bagging_fraction": 0.8,
    "bagging_seed": 11,
    "lambda_l1": 0.1,
    "verbosity": -1,
    "nthread": 15,
    'metric': 'multi_logloss',
    "random_state": 0}


folds = KFold(n_splits=5, shuffle=True, random_state=0)
oof = np.zeros((x_train.shape[0], num_labels))
preds_prob = np.zeros((x_test.shape[0], num_labels))

## train and predict
feature_importance_df = pd.DataFrame()
for fold_, (trn_idx, val_idx) in enumerate(folds.split(x_train)):
    print("fold {}".format(fold_ + 1))
    trn_data = lgb.Dataset(x_train.iloc[trn_idx], label=y_train.iloc[trn_idx])
    val_data = lgb.Dataset(x_train.iloc[val_idx], label=y_train.iloc[val_idx])
    clf = lgb.train(
        params,
        trn_data,
        valid_sets=[trn_data, val_data],
        num_boost_round = 10000,
        verbose_eval = 100,
        early_stopping_rounds = 100)
    oof[val_idx] = clf.predict(x_train.iloc[val_idx], num_iteration=clf.best_iteration)

    preds_prob += clf.predict(x_test, num_iteration=clf.best_iteration) / folds.n_splits

fold 1
Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 0.708649	valid_1's multi_logloss: 1.18775
[200]	training's multi_logloss: 0.486178	valid_1's multi_logloss: 1.14338
[300]	training's multi_logloss: 0.353185	valid_1's multi_logloss: 1.13421
[400]	training's multi_logloss: 0.266242	valid_1's multi_logloss: 1.13881
Early stopping, best iteration is:
[338]	training's multi_logloss: 0.316226	valid_1's multi_logloss: 1.13385
fold 2
Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 0.734948	valid_1's multi_logloss: 0.958642
[200]	training's multi_logloss: 0.507582	valid_1's multi_logloss: 0.906696
[300]	training's multi_logloss: 0.372489	valid_1's multi_logloss: 0.887596
[400]	training's multi_logloss: 0.284076	valid_1's multi_logloss: 0.88514
[500]	training's multi_logloss: 0.224992	valid_1's multi_logloss: 0.888516
Early stopping, best iteration is:
[417]	training's multi_logloss: 0.272055	valid_1

In [14]:
from sklearn.metrics import accuracy_score
oof = np.argmax(oof, axis=1)
accuracy_score(y_train, oof)

0.6302619958787166

In [15]:
preds_prob1 = preds_prob
preds = np.argmax(preds_prob, axis=1)
submission = pd.DataFrame({'id': range(len(preds)), 'pred': preds+3})
submission['id'] = submission['id'] + 1
submission.to_csv("../data/lgb_submission.csv", index=False, header=False)
submission.head()

Unnamed: 0,id,pred
0,1,7
1,2,6
2,3,6
3,4,5
4,5,5


In [16]:
# import xgboost as xgb
# x_train = train[columns].astype(float)
# y_train = train['quality']
# x_test = test[columns]

# params = {
#     'booster': 'gbtree',
#     'objective': 'multi:softprob',
#     'num_class': num_labels,
#     'eval_metric': 'mlogloss',
#     'gamma': 0.1,
#     'max_depth': 8,
#     'alpha': 0,
#     'lambda': 0,
#     'subsample': 0.7,
#     'colsample_bytree': 0.5,
#     'min_child_weight': 3,
#     'silent': 0,
#     'eta': 0.03,
#     'nthread': -1,
#     'missing': 1,
#     'seed': 0,
# }

# folds = KFold(n_splits=5, shuffle=True, random_state=0)
# oof_prob = np.zeros((x_train.shape[0], num_labels))
# preds_prob = np.zeros((x_test.shape[0], num_labels))

# num_round = 10000
# ## train and predict
# feature_importance_df = pd.DataFrame()
# for fold_, (trn_idx, val_idx) in enumerate(folds.split(train)):
#     print("fold {}".format(fold_ + 1))
#     trn_data = xgb.DMatrix(x_train.iloc[trn_idx], label=y_train.iloc[trn_idx])
#     val_data = xgb.DMatrix(x_train.iloc[val_idx], label=y_train.iloc[val_idx])

#     watchlist = [(trn_data, 'train'), (val_data, 'valid')]
#     clf = xgb.train(params, 
#                     trn_data, 
#                     num_round, 
#                     watchlist, 
#                     verbose_eval=100, 
#                     early_stopping_rounds=100)
# #     fold_importance_df = pd.DataFrame()
# #     fold_importance_df["Feature"] = clf.get_fscore().keys()
# #     fold_importance_df["importance"] = clf.get_fscore().values()
# #     fold_importance_df["fold"] = fold_ + 1
# #     feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    
#     oof_prob[val_idx] = clf.predict(xgb.DMatrix(x_train.iloc[val_idx]), ntree_limit=clf.best_ntree_limit)
#     preds_prob += clf.predict(xgb.DMatrix(test), ntree_limit=clf.best_ntree_limit) / folds.n_splits

In [17]:
# from sklearn.metrics import accuracy_score
# oof = np.argmax(oof_prob, axis=1)
# accuracy_score(y_train, oof)

In [18]:
# preds_prob2 = preds_prob
# preds = np.argmax(preds_prob, axis=1)
# submission = pd.DataFrame({'id': range(len(preds)), 'pred': preds+3})
# submission['id'] = submission['id'] + 1
# submission.to_csv("../data/xgb_submission.csv", index=False, header=False)
# submission.head()

In [19]:
# 直接相加，结果并不好
# preds_prob = (preds_prob1+preds_prob1)/2
# preds = np.argmax(preds_prob, axis=1)
# submission = pd.DataFrame({'id': range(len(preds)), 'pred': preds+3})
# submission['id'] = submission['id'] + 1
# submission.to_csv("../data/merge_submission.csv", index=False, header=False)
# submission.head()