# 黄海广教授团队baseline的复现

原方案链接：[天才海神号](https://github.com/fengdu78/tianchi_haiyang/tree/master)

## 导入库

In [2]:
import pandas as pd
import numpy as np
from lightgbm.sklearn import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score
!pip install gensim
from gensim.models import Word2Vec
from scipy import sparse
from tqdm import tqdm
import os
import gc
import time
import warnings
warnings.filterwarnings('ignore')

Collecting gensim
  Downloading gensim-4.3.2-cp310-cp310-win_amd64.whl (24.0 MB)
     ---------------------------------------- 0.0/24.0 MB ? eta -:--:--
     ---------------------------------------- 0.0/24.0 MB ? eta -:--:--
     --------------------------------------- 0.0/24.0 MB 435.7 kB/s eta 0:00:56
     --------------------------------------- 0.1/24.0 MB 573.4 kB/s eta 0:00:42
     --------------------------------------- 0.1/24.0 MB 585.1 kB/s eta 0:00:41
     --------------------------------------- 0.2/24.0 MB 704.6 kB/s eta 0:00:34
     --------------------------------------- 0.2/24.0 MB 811.5 kB/s eta 0:00:30
     --------------------------------------- 0.3/24.0 MB 951.8 kB/s eta 0:00:25
      --------------------------------------- 0.4/24.0 MB 1.1 MB/s eta 0:00:22
      --------------------------------------- 0.5/24.0 MB 1.3 MB/s eta 0:00:19
     - -------------------------------------- 0.7/24.0 MB 1.5 MB/s eta 0:00:16
     - -------------------------------------- 0.9/24.0 MB 

## 预处理

为方便处理，将原始数据的中文映射到数字和英文

In [3]:
label_dict1 = {'拖网': 0, '围网': 1, '刺网': 2}
label_dict2 = {0: '拖网', 1: '围网', 2: '刺网'}
name_dict = {'渔船ID': 'id', '速度': 'v', '方向': 'dir', 'type': 'label', 'lat': 'x', 'lon': 'y'}

### 读取数据

In [4]:
def get_data(file_path, model):
    paths = os.listdir(file_path)
    tmp = open(f'{model}.csv', 'w', encoding='utf-8')
    for t in tqdm(range(len(paths))):
        p = paths[t]
        with open(f'{file_path}/{p}', encoding='utf-8') as f:
            if t!=0:
                next(f)
            tmp.write(f.read())
    tmp.close()

In [5]:
ttt = time.time()

In [6]:
get_data('../data/hy_round1_train_20200102', 'train')
get_data('../data/hy_round1_testA_20200102', 'testA')
get_data('../data/hy_round1_testB_20200221', 'testB')

100%|██████████| 7000/7000 [00:31<00:00, 224.41it/s]
100%|██████████| 2000/2000 [00:09<00:00, 212.79it/s]
100%|██████████| 2000/2000 [00:09<00:00, 201.35it/s]


In [7]:
train = pd.read_csv('train.csv')
train['flag'] = 0
train['trn'] = 1
test = pd.read_csv('testB.csv')
test['flag'] = 0
test['trn'] = 0
testA = pd.read_csv('testA.csv')
testA['flag'] = 1
testA['trn'] = 0

记录读取数据时间

In [8]:
print(time.time() - ttt)

59.89835071563721


将原始数据的中文映射到数字和英文

In [10]:
train.rename(columns = name_dict, inplace = True)
test.rename(columns = name_dict, inplace = True)
testA.rename(columns = name_dict, inplace = True)

In [11]:
df = pd.concat([train, testA, test], axis=0, ignore_index=True)
df['x'] = df['x'] * 100000 - 5630000
df['y'] = df['y'] * 110000 + 2530000
df['time'] = pd.to_datetime(df['time'].apply(lambda x :'2019-'+ x[:2] + '-' + x[2:4] + ' ' + x[5:]))
df = df.sort_values(['id', 'time']).reset_index(drop=True)
df['label'] = df['label'].map(label_dict1)
df.loc[df['trn'] == 0, 'label'] = -1

In [12]:
df.head()

Unnamed: 0,id,x,y,v,dir,time,label,flag,trn
0,0,611829500000.0,564376500000.0,0.0,0,2019-11-07 12:09:28,0.0,0,1
1,0,611829500000.0,564376500000.0,0.0,0,2019-11-07 12:18:30,0.0,0,1
2,0,611829500000.0,564376500000.0,0.0,0,2019-11-07 12:28:32,0.0,0,1
3,0,611829500000.0,564376500000.0,0.0,0,2019-11-07 12:38:32,0.0,0,1
4,0,611829500000.0,564376500000.0,0.0,0,2019-11-07 12:48:30,0.0,0,1


记录数据预处理时间

In [13]:
print(time.time() - ttt)

147.9416162967682


In [14]:
df['v_bin'] = pd.qcut(df['v'], 200, duplicates='drop')
df['v_bin'] = df['v_bin'].map(dict(zip(df['v_bin'].unique(), range(df['v_bin'].nunique()))))
for f in ['x', 'y']:
    df[f + '_bin1'] = pd.qcut(df[f], 1000, duplicates='drop')
    df[f + '_bin1'] = df[f + '_bin1'].map(dict(zip(df[f + '_bin1'].unique(), range(df[f + '_bin1'].nunique()))))
    df[f + '_bin2'] = df[f] // 10000
    df[f + '_bin1_count'] = df[f + '_bin1'].map(df[f + '_bin1'].value_counts())
    df[f + '_bin2_count'] = df[f + '_bin2'].map(df[f + '_bin2'].value_counts())
    df[f + '_bin1_id_nunique'] = df.groupby(f + '_bin1')['id'].transform('nunique')
    df[f + '_bin2_id_nunique'] = df.groupby(f + '_bin2')['id'].transform('nunique')
for i in [1, 2]:
    df['x_y_bin{}'.format(i)] = df['x_bin{}'.format(i)].astype('str') + '_' + df['y_bin{}'.format(i)].astype('str')
    df['x_y_bin{}'.format(i)] = df['x_y_bin{}'.format(i)].map(
        dict(zip(df['x_y_bin{}'.format(i)].unique(), range(df['x_y_bin{}'.format(i)].nunique())))
    )
    df['x_bin{}_y_bin{}_count'.format(i, i)] = df['x_y_bin{}'.format(i)].map(df['x_y_bin{}'.format(i)].value_counts())
for stat in ['max', 'min']:
    df['x_y_{}'.format(stat)] = df['y'] - df.groupby('x_bin1')['y'].transform(stat)
    df['y_x_{}'.format(stat)] = df['x'] - df.groupby('y_bin1')['x'].transform(stat)

In [15]:
print(time.time() - ttt)

184.9853811264038


In [16]:
g = df.groupby('id')
for f in ['x', 'y']:
    df[f + '_prev_diff'] = df[f] - g[f].shift(1)
    df[f + '_next_diff'] = df[f] - g[f].shift(-1)
    df[f + '_prev_next_diff'] = g[f].shift(1) - g[f].shift(-1)
df['dist_move_prev'] = np.sqrt(np.square(df['x_prev_diff']) + np.square(df['y_prev_diff']))
df['dist_move_next'] = np.sqrt(np.square(df['x_next_diff']) + np.square(df['y_next_diff']))
df['dist_move_prev_next'] = np.sqrt(np.square(df['x_prev_next_diff']) + np.square(df['y_prev_next_diff']))
df['dist_move_prev_bin'] = pd.qcut(df['dist_move_prev'], 50, duplicates='drop')
df['dist_move_prev_bin'] = df['dist_move_prev_bin'].map(
    dict(zip(df['dist_move_prev_bin'].unique(), range(df['dist_move_prev_bin'].nunique())))
)

In [17]:
print(time.time() - ttt)

191.05742287635803


In [18]:
def get_loc_list(x):
    prev = ''
    res = []
    for loc in x:
        loc = str(loc)
        if loc != prev:
            res.append(loc)
        prev = loc
    return res


In [20]:
size = 10
sentence = df.groupby('id')['x_y_bin1'].agg(get_loc_list).tolist()
model = Word2Vec(sentence, vector_size=size, window=20, min_count=1, sg=1, workers=12, epochs=10)
emb = []
for w in df['x_y_bin1'].unique():
    vec = [w]
    try:
        vec.extend(model[str(w)])
    except:
        vec.extend(np.ones(size) * -size)
    emb.append(vec)
emb_df = pd.DataFrame(emb)
emb_cols = ['x_y_bin1']
for i in range(size):
    emb_cols.append('x_y_bin1_emb_{}'.format(i))
emb_df.columns = emb_cols

In [21]:
print(time.time() - ttt)

284.3631534576416


In [22]:
def start(x):
    try:
        return x[0]
    except:
        return None


In [23]:
def end(x):
    try:
        return x[-1]
    except:
        return None


In [24]:
def mode(x):
    try:
        return pd.Series(x).value_counts().index[0]
    except:
        return None


In [25]:
df = df[df['flag'] == 0].reset_index(drop=True)
for f in ['dist_move_prev_bin', 'v_bin']:
    df[f + '_sen'] = df['id'].map(df.groupby('id')[f].agg(lambda x: ','.join(x.astype(str))))
g = df.groupby('id').agg({
    'id': ['count'], 'x_bin1': [mode], 'y_bin1': [mode], 'x_bin2': [mode], 'y_bin2': [mode], 'x_y_bin1': [mode],
    'x': ['mean', 'max', 'min', 'std', np.ptp, start, end],
    'y': ['mean', 'max', 'min', 'std', np.ptp, start, end],
    'v': ['mean', 'max', 'min', 'std', np.ptp], 'dir': ['mean'],
    'x_bin1_count': ['mean'], 'y_bin1_count': ['mean', 'max', 'min'],
    'x_bin2_count': ['mean', 'max', 'min'], 'y_bin2_count': ['mean', 'max', 'min'],
    'x_bin1_y_bin1_count': ['mean', 'max', 'min'],
    'dist_move_prev': ['mean', 'max', 'std', 'min', 'sum'],
    'x_y_min': ['mean', 'min'], 'y_x_min': ['mean', 'min'],
    'x_y_max': ['mean', 'min'], 'y_x_max': ['mean', 'min'],
}).reset_index()
g.columns = ['_'.join(col).strip() for col in g.columns]
g.rename(columns={'id_': 'id'}, inplace=True)
cols = [f for f in g.keys() if f != 'id']

In [26]:
print(time.time() - ttt)

311.1158809661865


In [27]:
df = df.drop_duplicates('id')[['id', 'label', 'dist_move_prev_bin_sen', 'v_bin_sen']].sort_values('id').reset_index(drop=True)
df = df.sort_values('label').reset_index(drop=True)
sub = df[df['label'] == -1].reset_index(drop=True)[['id']]
test_num = sub.shape[0]
labels = df[df['label'] != -1]['label'].values
df = df.merge(g, on='id', how='left')
df[cols] = df[cols].astype('float32')
df['dist_total'] = np.sqrt(np.square(df['x_end'] - df['y_start']) + np.square(df['y_end'] - df['y_start']))
df['dist_rate'] = df['dist_total'] / (df['dist_move_prev_sum'] + 1e-8)
df = df.merge(emb_df, left_on='x_y_bin1_mode', right_on='x_y_bin1', how='left')
df_values = sparse.csr_matrix(df[cols + emb_cols[1:] + ['dist_total', 'dist_rate']].values)
for f in ['dist_move_prev_bin_sen', 'v_bin_sen']:
    cv = CountVectorizer(min_df=10).fit_transform(df[f].values)
    df_values = sparse.hstack((df_values, cv), 'csr')
test_values, train_values = df_values[:test_num], df_values[test_num:]
del df, df_values
gc.collect()

4

In [28]:
print(time.time() - ttt)

319.92077565193176


In [29]:
def f1(y_true, y_pred):
    y_pred = np.transpose(np.reshape(y_pred, [3, -1]))
    return 'f1', f1_score(y_true, np.argmax(y_pred, axis=1), average='macro'), True


In [30]:
print(train_values.shape, test_values.shape)

(7000, 149) (2000, 149)


In [31]:
test_pred = np.zeros((test_values.shape[0], 3))
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2020)
clf = LGBMClassifier(
    learning_rate=0.05,
    n_estimators=20000,
    num_leaves=63,
    subsample_freq=1,
    subsample=0.9,
    colsample_bytree=0.4,
    min_child_samples=10,
    random_state=2020,
    class_weight='balanced',
    metric='None'
)

In [35]:
%pip install --upgrade lightgbm
for i, (trn_idx, val_idx) in enumerate(skf.split(train_values, labels)):
    trn_x, trn_y = train_values[trn_idx], labels[trn_idx]
    val_x, val_y = train_values[val_idx], labels[val_idx]
    clf.fit(
        trn_x, trn_y,
        eval_set=[(val_x, val_y)],
        eval_metric=f1,
        # early_stopping_rounds=100,
        # verbose=100
    )
    test_pred += clf.predict_proba(test_values) / skf.n_splits


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15033
[LightGBM] [Info] Number of data points in the train set: 5600, number of used features: 133
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15025
[LightGBM] [Info] Number of data points in the train set: 5600, number of used features: 133
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


KeyboardInterrupt: 

In [36]:
sub['id'] = sub['id'].astype('int32')
sub['label'] = np.argmax(test_pred, axis=1)
sub['label'] = sub['label'].map(label_dict2)
sub = sub.sort_values('id').reset_index(drop=True)
sub.to_csv('result.csv', index=False, header=False)