In [5]:
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb
warnings.filterwarnings('ignore')

In [6]:
import os

# Project root. Defaults to the original author's checkout, but can be
# redirected without editing the notebook by exporting CCF_PROJECT_ROOT.
path = os.environ.get(
    'CCF_PROJECT_ROOT',
    '/home/wjunneng/Ubuntu/2019-CCF-Prediction-Of-The-Quality-Compliance-Rate-Of-Workpieces-In-Discrete-Manufacturing-Processes/',
)

# os.path.join copes with a root that does or does not end in '/', instead of
# gluing strings together and producing '...//data/...'.
_data_dir = os.path.join(path, 'data', 'original')
train = pd.read_csv(os.path.join(_data_dir, 'first_round_training_data.csv'))
test = pd.read_csv(os.path.join(_data_dir, 'first_round_testing_data.csv'))

In [7]:
# Row counts of the training and testing frames, shown as a tuple.
train.shape[0], test.shape[0]

(6000, 6000)

In [8]:
# Initial model inputs: every column of the test file except the 'Group' id.
# Engineered column names are appended to this list further down.
features = [c for c in test.columns if c != 'Group']

# Columns used by the encodings below: Attribute4..Attribute10 followed by
# Parameter5..Parameter10 (same order as the original hand-written list).
cat_feats = (['Attribute%d' % i for i in range(4, 11)]
             + ['Parameter%d' % i for i in range(5, 11)])

# Only the 'Parameter*' members of cat_feats are used as grouping keys.
use_cate = [c for c in cat_feats if 'Para' in c]
# NOTE(review): despite the name, this is built from the *test* columns and is
# identical to the initial `features`; it is not used in the visible code.
col_only_train = [c for c in test.columns if c != 'Group']
# Every training column except the label is aggregated per grouping key later.
all_feat = [c for c in train.columns if c != 'Quality_label']

# Stack train and test so encodings are computed over both. ignore_index gives
# the combined frame unique row labels; without it every label 0..len-1 occurs
# twice and any index-aligned operation on `data` becomes ambiguous.
data = pd.concat([train, test], ignore_index=True)

def nnq_encode(data,en_col,use_col):
    """Attach a distinct-count encoding column to `data`.

    For every value of `en_col`, counts how many distinct `use_col` values
    occur alongside it and broadcasts that count back onto each row in a new
    column named ``<en_col>_nnq_of_<use_col>``.

    Side effect: the new column name is appended to the module-level
    `features` list.

    Returns the same `data` object, modified in place.
    """
    new_col = en_col + '_nnq_of_' + use_col
    distinct_per_key = data.groupby([en_col])[use_col].nunique()
    data[new_col] = data[en_col].map(distinct_per_key)
    features.append(new_col)
    return data

# 1) Distinct-count encodings: for each grouping key, how many different
#    values of every other cat_feats column appear with it.
for en_col in use_cate:
    for use_col in cat_feats:
        if en_col == use_col:
            continue
        data = nnq_encode(data, en_col, use_col)

# 2) Pair-frequency encodings: number of rows sharing the same
#    (en_col, use_col) value combination.
for en_col in use_cate:
    for use_col in use_cate:
        if en_col == use_col:
            continue
        colname = en_col + '_count_' + use_col
        features.append(colname)
        # The column first holds the "a|b" pair key, then is overwritten by
        # the size of the group that key belongs to.
        pair_key = data[en_col].astype(str) + "|" + data[use_col].astype(str)
        data[colname] = pair_key
        data[colname] = data.groupby([colname])[colname].transform('count')

# 3) Per-key mean and standard deviation of every other original column.
#    The mean column is always registered before the std column.
for en_col in use_cate:
    for use_col in all_feat:
        if en_col == use_col:
            continue
        for suffix, stat in (('_mean_', 'mean'), ('_std_', 'std')):
            colname = en_col + suffix + use_col
            features.append(colname)
            per_key = data.groupby([en_col])[use_col].agg(stat)
            data[colname] = data[en_col].map(per_key)

# Integer codes for the four quality grades (Excellent=0 ... Fail=3).
classMap = dict(zip(['Excellent', 'Good', 'Pass', 'Fail'], range(4)))

# Rows carrying a label; presumably exactly the rows that came from the
# training file, since the label is missing on the others.
tr_index = data['Quality_label'].notnull()

train_df = data.loc[tr_index, features + ['Quality_label']].reset_index(drop=True)
train_df['Quality_label'] = train_df['Quality_label'].map(classMap)

# Unlabelled rows keep all columns so the 'Group' id can be recovered.
test_df = data.loc[~tr_index].reset_index(drop=True)
id_test = test_df['Group'].values

X_train, y = train_df[features], train_df['Quality_label']
X_test = test_df[features]

# Drop the intermediate frames; only X_train / y / X_test / id_test are used below.
del train, test, train_df, test_df

# LightGBM training parameters for the 4-class quality model.
lgb_paras = dict(
    objective='multiclass',
    metric='multi_logloss',
    learning_rate=0.05,
    num_leaves=32,
    num_class=4,        # must match the number of entries in classMap
    max_depth=-1,
    seed=42,
    feature_fraction=0.8,
    verbose=1,
)
from sklearn.metrics import f1_score, confusion_matrix
# Collects one test-set prediction array per cross-validation fold.
all_preads = list()
# Shuffled, stratified 5-fold splitter with a fixed seed for repeatable folds.
skf = StratifiedKFold(shuffle=True, n_splits=5, random_state=2029)
def f1_weighted(preds, train_data):
    """Custom LightGBM evaluation metric: weighted F1 of the arg-max class.

    Parameters
    ----------
    preds : numpy array
        Class scores handed over by LightGBM. Two layouts are accepted:
        a 2-D array of shape (n_samples, n_classes), as passed by
        LightGBM >= 4.0, or a flat 1-D array grouped by class first
        (``preds[j * n_samples + i]`` is class j for row i), as passed by
        older releases.
    train_data : lightgbm.Dataset
        Dataset being evaluated; only its labels are read.

    Returns
    -------
    tuple
        ``('f1_weighted', score, True)`` where the final ``True`` tells
        LightGBM that higher values are better.
    """
    y_true = np.asarray(train_data.get_label())
    preds = np.asarray(preds)
    if preds.ndim == 2:
        # Already one row per sample: pick the best class per row.
        pred_labels = np.argmax(preds, axis=1)
    else:
        # Flat class-major layout: one row per class after the reshape. The
        # class count is inferred from the label count instead of assuming 4.
        pred_labels = np.argmax(preds.reshape(-1, len(y_true)), axis=0)
    score = f1_score(y_true, pred_labels, average='weighted')
    return 'f1_weighted', score, True

# Filled by lgb.train with the per-iteration metric history; it is reused for
# every fold, so after the loop it holds the history of the last fold only.
evals_result = {}
for train_index, test_index in skf.split(X_train, y):
    # X_train was built as train_df[features], so no column re-selection is needed.
    train_x, test_x = X_train.iloc[train_index], X_train.iloc[test_index]
    train_y, test_y = y.iloc[train_index], y.iloc[test_index]
    dtrain = lgb.Dataset(train_x, train_y)
    dvalid = lgb.Dataset(test_x, test_y)

    # NOTE(review): the `early_stopping_rounds`, `evals_result` and
    # `verbose_eval` keyword arguments belong to the pre-4.0 LightGBM API
    # (later releases use callbacks) - confirm the installed version.
    lgb_modelall = lgb.train(lgb_paras, dtrain,
                             valid_sets=[dtrain, dvalid],
                             num_boost_round=1000,
                             early_stopping_rounds=100,
                             valid_names=["train", "valid"],
                             evals_result=evals_result,
                             verbose_eval=50,
                             feval=f1_weighted)
    # Each fold's model scores the full test set; folds are averaged below.
    pred = lgb_modelall.predict(X_test)
    all_preads.append(pred)

# Average the class probabilities over the folds.
mean_pread = np.mean(all_preads, axis=0)
cols = ['Excellent ratio','Good ratio','Pass ratio','Fail ratio']

sub_prob = pd.DataFrame(mean_pread, columns=cols)
# Turn the former eyeball check into a hard failure with a readable message.
assert len(sub_prob) == len(id_test), 'prediction rows and test Group ids are misaligned'
sub_prob['Group'] = id_test

sub_prob['Group'] = sub_prob['Group'].map(int)
# One row per Group. NOTE(review): the median is taken per class column
# independently, so a group's four ratios are not guaranteed to sum to 1 -
# confirm the scorer tolerates that or renormalise.
sub_prob = sub_prob.groupby(['Group'])[cols].median().reset_index()

# Make sure the target directory exists so a finished training run is not
# lost to a missing folder at write time.
out_path = '../submission/lgb_fisrt.csv'
os.makedirs(os.path.dirname(out_path), exist_ok=True)
sub_prob.to_csv(out_path, index=False)

KeyboardInterrupt: 

In [25]:
# Class scores predicted for the first test row by the first fold's model.
all_preads[0][0, :]

array([0.19381562, 0.16818632, 0.20587343, 0.43212463])