In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import os
from tqdm import tqdm
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score

In [2]:
# Directories (relative to the notebook's working directory) that hold one
# CSV per sample for the training split and the "testA" split respectively.
trn_path, test_path = (
    'hy_round1_train_20200102',
    'hy_round1_testA_20200102',
)

In [3]:
def get_data(path, get_type=True):
    """Summarise every CSV file in ``path`` into one feature row per file.

    For each file the row holds, in this order: std and mean of ``x``,
    std and mean of ``y``, mean and std of ``速度``, mean and std of ``方向``,
    followed by the file name and (optionally) the file's label.

    Parameters
    ----------
    path : str
        Directory containing the per-sample CSV files.
    get_type : bool, default True
        When True, the first value of each file's ``type`` column is read
        and returned as an integer-encoded ``label`` column
        (``'拖网'`` -> 0, ``'刺网'`` -> 1, ``'围网'`` -> 2; any other value
        becomes NaN). When False, no ``type`` column is required.

    Returns
    -------
    pandas.DataFrame
        Columns ``0`` .. ``7`` (the statistics above), ``'filename'`` and,
        if ``get_type`` is True, ``'label'``. An empty directory yields an
        empty frame with the same columns.
    """
    # os.listdir gives entries in arbitrary order; sorting makes the row
    # order -- and therefore any later shuffled split -- reproducible.
    # Non-CSV entries (e.g. notebook checkpoint folders) are skipped because
    # pd.read_csv would fail on them.
    csv_files = sorted(
        entry for entry in os.listdir(path) if entry.lower().endswith('.csv')
    )

    rows = []
    for file in tqdm(csv_files):
        df = pd.read_csv(os.path.join(path, file))
        row = [df['x'].std(), df['x'].mean(),
               df['y'].std(), df['y'].mean(),
               df['速度'].mean(), df['速度'].std(),
               df['方向'].mean(), df['方向'].std(),
               file]
        if get_type:
            # Positional access: take the first row's label regardless of
            # how the frame happens to be indexed.
            row.append(df['type'].iloc[0])
        rows.append(row)

    # Statistic columns keep their integer names (0..7) so callers that
    # select "everything except label/filename" see the same layout as before.
    n_stats = 8
    columns = list(range(n_stats)) + ['filename']
    if get_type:
        columns.append('label')
    result = pd.DataFrame(rows, columns=columns)

    if get_type:
        label_dict = {'拖网':0, '刺网':1, '围网':2}
        result['label'] = result['label'].map(label_dict)

    return result
# Build the feature tables: the training split carries labels, the test split does not.
df_train = get_data(trn_path, get_type=True)
df_test = get_data(test_path, get_type=False)

100%|██████████| 7000/7000 [00:41<00:00, 166.70it/s]
100%|██████████| 2000/2000 [00:12<00:00, 163.77it/s]


In [4]:
# Model inputs are every column except the target and the file identifier.
col = [name for name in df_train.columns if name not in ('label', 'filename')]
X_train = df_train[col].to_numpy()
y_train = df_train['label'].to_numpy()

# Shuffled 5-fold splitter with a fixed seed.
folds = KFold(n_splits=5, shuffle=True, random_state=2019)

# One row per sample, one column per class (3 classes):
# out-of-fold scores for the training set and accumulated scores for the test set.
oof_lgb = np.zeros((df_train.shape[0], 3))
prediction = np.zeros((df_test.shape[0], 3))

In [5]:
# LightGBM settings for a 3-class one-vs-all objective.
param = {'num_leaves': 31,
         'objective':'multiclassova',
         'num_class':3,
         'learning_rate': 0.01,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         # NOTE(review): per the LightGBM parameter docs, bagging is only
         # enabled when bagging_freq is non-zero; it is not set here, so this
         # value presumably has no effect -- confirm before tuning it.
         "bagging_fraction": 0.9 ,
         "lambda_l1": 0.1,
         "verbosity": -1}

# Upper bound on boosting rounds; early stopping may end training sooner.
num_round = 2000

# The test matrix does not change between folds, so build it once.
X_test = df_test[col].values

# Start from zeros on every execution so that re-running this cell does not
# stack new fold predictions on top of a previous run's totals.
prediction = np.zeros((len(df_test), 3))

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
    print("fold n°{}".format(fold_+1))
    trn_data = lgb.Dataset(X_train[trn_idx], y_train[trn_idx])
    val_data = lgb.Dataset(X_train[val_idx], y_train[val_idx])

    # NOTE(review): verbose_eval / early_stopping_rounds are the legacy
    # keyword form; newer LightGBM releases expect callbacks instead --
    # confirm against the installed version.
    clf = lgb.train(param, trn_data, num_round,
                    valid_sets=[trn_data, val_data],
                    verbose_eval=500, early_stopping_rounds=100)

    # Out-of-fold scores: each training row is scored by the one model that
    # did not see it during fitting.
    oof_lgb[val_idx] = clf.predict(X_train[val_idx], num_iteration=clf.best_iteration)

    # Average (not sum) the per-fold test scores so `prediction` stays on the
    # same scale as a single model's output.
    prediction += clf.predict(X_test, num_iteration=clf.best_iteration) / folds.n_splits

# Hard class decision for every training row from its out-of-fold scores.
oof_lgb_final = np.argmax(oof_lgb, axis=1)

fold n°1
Training until validation scores don't improve for 100 rounds.
[500]	training's multi_logloss: 0.226924	valid_1's multi_logloss: 0.326774
[1000]	training's multi_logloss: 0.145244	valid_1's multi_logloss: 0.313612
Early stopping, best iteration is:
[1324]	training's multi_logloss: 0.111339	valid_1's multi_logloss: 0.311507
fold n°2
Training until validation scores don't improve for 100 rounds.
[500]	training's multi_logloss: 0.225747	valid_1's multi_logloss: 0.352029
[1000]	training's multi_logloss: 0.142032	valid_1's multi_logloss: 0.336953
Early stopping, best iteration is:
[1221]	training's multi_logloss: 0.117995	valid_1's multi_logloss: 0.333692
fold n°3
Training until validation scores don't improve for 100 rounds.
[500]	training's multi_logloss: 0.228576	valid_1's multi_logloss: 0.339163
[1000]	training's multi_logloss: 0.146269	valid_1's multi_logloss: 0.322189
Early stopping, best iteration is:
[1149]	training's multi_logloss: 0.129774	valid_1's multi_logloss: 0.32012

In [6]:
# Macro-averaged F1 of the out-of-fold class predictions against the true labels.
f1_score(y_true=y_train, y_pred=oof_lgb_final, average='macro')

0.8356443698590562

In [7]:
# Turn the accumulated per-class test scores into a submission file.
pred_label = np.argmax(prediction, axis=1)

# Inverse of the encoding used when the labels were read: class id -> class name.
label_dict = {0:'拖网', 1:'刺网', 2:'围网'}

# Strip the file extension to obtain the sample id. Using splitext instead of
# a fixed 4-character slice keeps ids of any length intact (a 5-digit id
# would otherwise be truncated); applying it again to an already-stripped id
# is a no-op, so this cell can be re-run safely.
df_test['filename'] = df_test['filename'].apply(lambda x: os.path.splitext(x)[0])

# pred_label is positional, so it lines up row-for-row with df_test.
df_pred = pd.DataFrame({'filename': df_test['filename'], 'label': pred_label})
df_pred['label'] = df_pred['label'].map(label_dict)

# Two columns, no header row and no index column.
df_pred.to_csv('sub2.csv', index=False, header=False)