In [55]:
import gc
import multiprocessing as mp
import os
import pickle
import time
import warnings
from collections import Counter
from copy import deepcopy
from datetime import datetime
from functools import partial
from glob import glob

import geopandas as gpd
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from gensim.models import FastText, Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from pyproj import Proj
from scipy import sparse
from scipy.sparse import csr_matrix
from sklearn import metrics
from sklearn.cluster import DBSCAN
from sklearn.decomposition import NMF, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score, precision_recall_fscore_support
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

os.environ['PYTHONHASHSEED'] = '0'
warnings.filterwarnings('ignore')

In [56]:
# 不直接对DataFrame做append操作，提升运行速度
def get_data(file_path,max_lines = 2000):
    paths = os.listdir(file_path)
    tmp = []
    for t in tqdm(range(len(paths))):
        if len(tmp) > max_lines:break
            
        p = paths[t]
        with open('{}/{}'.format(file_path, p), encoding='utf-8') as f:
            next(f)
            for line in f.readlines():
                tmp.append(line.strip().split(','))
                if len(tmp) > max_lines:break
                    
    tmp_df = pd.DataFrame(tmp)
    tmp_df.columns = ['渔船ID', 'x', 'y', '速度', '方向', 'time', 'type']
    return tmp_df

TRAIN_PATH = "../智慧海洋/数据集/hy_round1_train_20200102/"
# 采样数据行数
max_lines = 2000
df = get_data(TRAIN_PATH,max_lines=max_lines)

  0%|                                                                                | 6/7000 [00:00<01:07, 103.73it/s]


In [57]:
# 基本预处理
label_dict1 = {'拖网': 0, '围网': 1, '刺网': 2}
label_dict2 = {0: '拖网', 1: '围网', 2: '刺网'}
name_dict = {'渔船ID': 'id', '速度': 'v', '方向': 'dir', 'type': 'label'}

df.rename(columns = name_dict, inplace = True)
df['label'] = df['label'].map(label_dict1)
cols = ['x','y','v']
for col in cols:
    df[col] = df[col].astype('float')
df['dir'] = df['dir'].astype('int')
df['time'] = pd.to_datetime(df['time'], format='%m%d %H:%M:%S')
df['date'] = df['time'].dt.date
df['hour'] = df['time'].dt.hour
df['month'] = df['time'].dt.month
df['weekday'] = df['time'].dt.weekday
df.head()

Unnamed: 0,id,x,y,v,dir,time,label,date,hour,month,weekday
0,0,6152038.0,5124873.0,2.59,102,1900-11-10 11:58:19,0,1900-11-10,11,11,5
1,0,6151230.0,5125218.0,2.7,113,1900-11-10 11:48:19,0,1900-11-10,11,11,5
2,0,6150421.0,5125563.0,2.7,116,1900-11-10 11:38:19,0,1900-11-10,11,11,5
3,0,6149612.0,5125907.0,3.29,95,1900-11-10 11:28:19,0,1900-11-10,11,11,5
4,0,6148803.0,5126252.0,3.18,108,1900-11-10 11:18:19,0,1900-11-10,11,11,5


In [58]:
df['x_dis_diff'] = (df['x'] - 6165599).abs()
df['y_dis_diff'] = (df['y'] - 5202660).abs()
df['base_dis_diff'] = ((df['x_dis_diff']**2)+(df['y_dis_diff']**2))**0.5    
del df['x_dis_diff'],df['y_dis_diff'] 
df['base_dis_diff'].head()

0    78959.780945
1    78763.845006
2    78577.185266
3    78399.867568
4    78231.955018
Name: base_dis_diff, dtype: float64

In [59]:
df['day_nig'] = 0
df.loc[(df['hour'] > 5) & (df['hour'] < 20),'day_nig'] = 1
df['day_nig'].head()

0    1
1    1
2    1
3    1
4    1
Name: day_nig, dtype: int64

In [60]:
# 季度
df['quarter'] = 0
df.loc[(df['month'].isin([1, 2, 3])), 'quarter'] = 1
df.loc[(df['month'].isin([4, 5, 6, ])), 'quarter'] = 2
df.loc[(df['month'].isin([7, 8, 9])), 'quarter'] = 3
df.loc[(df['month'].isin([10, 11, 12])), 'quarter'] = 4

In [62]:
temp = df.copy()
temp.rename(columns={'id':'ship','dir':'d'},inplace=True)

# 给速度一个等级
def v_cut(v):
    if v < 0.1:
        return 0
    elif v < 0.5:
        return 1
    elif v < 1:
        return 2
    elif v < 2.5:
        return 3
    elif v < 5:
        return 4
    elif v < 10:
        return 5
    elif v < 20:
        return 5
    else:
        return 6
# 统计每个ship的对应速度等级的个数
def get_v_fea(df):

    df['v_cut'] = df['v'].apply(lambda x: v_cut(x))
    tmp = df.groupby(['ship', 'v_cut'], as_index=False)['v_cut'].agg({'v_cut_count': 'count'})
    # 通过pivot构建透视表
    tmp = tmp.pivot(index='ship', columns='v_cut', values='v_cut_count')

    new_col_nm = ['v_cut_' + str(col) for col in tmp.columns.tolist()]
    tmp.columns = new_col_nm
    tmp = tmp.reset_index()  # 把index恢复成data

    return tmp

c1 = get_v_fea(temp)


In [63]:
# 方位进行16均分
def add_direction(df):
    df['d16'] = df['d'].apply(lambda x: int((x / 22.5) + 0.5) % 16 if not np.isnan(x) else np.nan)
    return df
def get_d_cut_count_fea(df):
    df = add_direction(df)
    tmp = df.groupby(['ship', 'd16'], as_index=False)['d16'].agg({'d16_count': 'count'})
    tmp = tmp.pivot(index='ship', columns='d16', values='d16_count')
    new_col_nm = ['d16_' + str(col) for col in tmp.columns.tolist()]
    tmp.columns = new_col_nm
    tmp = tmp.reset_index()
    return tmp

c2 = get_d_cut_count_fea(temp)

In [64]:
def get_v0_fea(df):
    # 统计速度为0的个数，以及速度不为0的统计量
    df_zero_count = df.query("v==0")[['ship', 'v']].groupby('ship', as_index=False)['v'].agg(
        {'num_zero_v': 'count'})
    df_not_zero_agg = df.query("v!=0")[['ship', 'v']].groupby('ship', as_index=False)['v'].agg(
        {'v_max_drop_0': 'max',
         'v_min_drop_0': 'min',
         'v_mean_drop_0': 'mean',
         'v_std_drop_0': 'std',
         'v_median_drop_0': 'median',
         'v_skew_drop_0': 'skew'})
    tmp = df_zero_count.merge(df_not_zero_agg, on='ship', how='left')

    return tmp

c3 = get_v0_fea(temp)

In [65]:
def get_percentiles_fea(df_raw):
    key = ['x', 'y', 'v', 'd']
    temp = df_raw[['ship']].drop_duplicates('ship')
    for i in range(len(key)):
        # 加入x，v，d，y的中位数和各种位数
        tmp_dscb = df_raw.groupby('ship')[key[i]].describe(
            percentiles=[0.05] + [ii / 1000 for ii in range(125, 1000, 125)] + [0.95])
        raw_col_nm = tmp_dscb.columns.tolist()
        new_col_nm = [key[i] + '_' + col for col in raw_col_nm]
        tmp_dscb.columns = new_col_nm
        tmp_dscb = tmp_dscb.reset_index()
        # 删掉多余的统计特征
        tmp_dscb = tmp_dscb.drop([f'{key[i]}_count', f'{key[i]}_mean', f'{key[i]}_std',
                                  f'{key[i]}_min', f'{key[i]}_max'], axis=1)

        temp = temp.merge(tmp_dscb, on='ship', how='left')
    return temp

c4 = get_percentiles_fea(temp)

In [66]:
def get_d_change_rate_fea(df):
    import math
    import time
    temp = df.copy()
    # 以ship、time为主键进行排序
    temp.sort_values(['ship', 'time'], ascending=True, inplace=True)
    # 通过shift求相邻差异值，注意学习.shift(-1,1)的含义
    # https://www.cnblogs.com/anovana/p/10429284.html
    temp['timenext'] = temp.groupby('ship')['time'].shift(-1)  
    temp['ynext'] = temp.groupby('ship')['y'].shift(-1)        
    temp['xnext'] = temp.groupby('ship')['x'].shift(-1)       
    # 将shift得到的差异量进行填充，为什么会有空值NaN？
    # 因为shift的起始位置是没法比较的，故用空值来代替   ？
    temp['ynext'] = temp['ynext'].fillna(method='ffill')     
    temp['xnext'] = temp['xnext'].fillna(method='ffill')       
    # 这里笔者的理解是ynext/xnext，而不需要减去y和x，因为ynext和xnext本身就是偏移量了
    temp['angle_next'] = (temp['ynext'] - temp['y']) / (temp['xnext'] - temp['x'])
    temp['angle_next'] = np.arctan(temp['angle_next']) / math.pi * 180  # 多少度
    temp['angle_next_next'] = temp['angle_next'].shift(-1)     
    temp['timediff'] = np.abs(temp['timenext'] - temp['time']) 
    temp['timediff'] = temp['timediff'].fillna(method='ffill')  
    temp['hc_xy'] = abs(temp['angle_next_next'] - temp['angle_next'])  
    # 对于hc_xy这列的值>180度的，进行修改成360度求差，仅考虑与水平线的角度  可以使用query匹配  
    # 有点像时间序列分析二阶差分 
    temp.loc[temp['hc_xy'] > 180, 'hc_xy'] = (360 - temp.loc[temp['hc_xy'] > 180, 'hc_xy'])
    temp['hc_xy_s'] = temp.apply(lambda x: x['hc_xy'] / x['timediff'].total_seconds(), axis=1) 

    temp['d_next'] = temp.groupby('ship')['d'].shift(-1)
    temp['hc_d'] = abs(temp['d_next'] - temp['d'])
    temp.loc[temp['hc_d'] > 180, 'hc_d'] = 360 - temp.loc[temp['hc_d'] > 180, 'hc_d']
    temp['hc_d_s'] = temp.apply(lambda x: x['hc_d'] / x['timediff'].total_seconds(), axis=1)     

    temp1 = temp[['ship', 'hc_xy_s', 'hc_d_s']]
    xy_d_rate = temp1.groupby('ship')['hc_xy_s'].agg([('hc_xy_s_max','max')]) 
    xy_d_rate = xy_d_rate.reset_index()
    d_d_rate = temp1.groupby('ship')['hc_d_s'].agg([('hc_d_s_max','max')])  
    d_d_rate = d_d_rate.reset_index()
   
    tmp = xy_d_rate.merge(d_d_rate, on='ship', how='left')                   
    return tmp

c5 = get_d_change_rate_fea(temp)

In [67]:
f1 = temp.merge(c1,on='ship',how='left')
f1 = f1.merge(c2,on='ship',how='left')
f1 = f1.merge(c3,on='ship',how='left')
f1 = f1.merge(c4,on='ship',how='left')
f1 = f1.merge(c5,on='ship',how='left')
f1   # 速度等级 方向等级  等等

Unnamed: 0,ship,x,y,v,d,time,label,date,hour,month,...,d_12.5%,d_25%,d_37.5%,d_50%,d_62.5%,d_75%,d_87.5%,d_95%,hc_xy_s_max,hc_d_s_max
0,0,6.152038e+06,5.124873e+06,2.59,102,1900-11-10 11:58:19,0,1900-11-10,11,11,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.183673,0.188020
1,0,6.151230e+06,5.125218e+06,2.70,113,1900-11-10 11:48:19,0,1900-11-10,11,11,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.183673,0.188020
2,0,6.150421e+06,5.125563e+06,2.70,116,1900-11-10 11:38:19,0,1900-11-10,11,11,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.183673,0.188020
3,0,6.149612e+06,5.125907e+06,3.29,95,1900-11-10 11:28:19,0,1900-11-10,11,11,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.183673,0.188020
4,0,6.148803e+06,5.126252e+06,3.18,108,1900-11-10 11:18:19,0,1900-11-10,11,11,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.183673,0.188020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1996,1001,6.246323e+06,5.241154e+06,0.11,0,1900-11-17 09:43:41,0,1900-11-17,9,11,...,0.0,0.0,10.0,144.0,204.0,271.0,279.0,292.4,0.078424,0.270903
1997,1001,6.246323e+06,5.241154e+06,0.22,10,1900-11-17 09:34:10,0,1900-11-17,9,11,...,0.0,0.0,10.0,144.0,204.0,271.0,279.0,292.4,0.078424,0.270903
1998,1001,6.246323e+06,5.241154e+06,0.11,0,1900-11-17 09:23:39,0,1900-11-17,9,11,...,0.0,0.0,10.0,144.0,204.0,271.0,279.0,292.4,0.078424,0.270903
1999,1001,6.246323e+06,5.241154e+06,0.11,287,1900-11-17 09:13:40,0,1900-11-17,9,11,...,0.0,0.0,10.0,144.0,204.0,271.0,279.0,292.4,0.078424,0.270903


In [68]:
pre_cols = df.columns


df['v_bin'] = pd.qcut(df['v'], 200, duplicates='drop')
df['v_bin'] = df['v_bin'].map(dict(zip(df['v_bin'].unique(), range(df['v_bin'].nunique()))))
for f in ['x', 'y']:
    df[f + '_bin1'] = pd.qcut(df[f], 1000, duplicates='drop')
    df[f + '_bin1'] = df[f + '_bin1'].map(dict(zip(df[f + '_bin1'].unique(), range(df[f + '_bin1'].nunique()))))
    df[f + '_bin2'] = df[f] // 10000 # 取整操作
    df[f + '_bin1_count'] = df[f + '_bin1'].map(df[f + '_bin1'].value_counts()) 
    df[f + '_bin2_count'] = df[f + '_bin2'].map(df[f + '_bin2'].value_counts()) 
    df[f + '_bin1_id_nunique'] = df.groupby(f + '_bin1')['id'].transform('nunique')
    df[f + '_bin2_id_nunique'] = df.groupby(f + '_bin2')['id'].transform('nunique')  
    #  https://www.icode9.com/content-1-493366.html  就是他这个id分组里面有多少个不同的
for i in [1, 2]:
    # 特征交叉x_bin1（2）,y_bin1（2） 形成类别 统计每类数量映射到列  
    df['x_y_bin{}'.format(i)] = df['x_bin{}'.format(i)].astype('str') + '_' + df['y_bin{}'.format(i)].astype('str')
    df['x_y_bin{}'.format(i)] = df['x_y_bin{}'.format(i)].map(
        dict(zip(df['x_y_bin{}'.format(i)].unique(), range(df['x_y_bin{}'.format(i)].nunique())))
    )  # 数量映射编码
    df['x_bin{}_y_bin{}_count'.format(i, i)] = df['x_y_bin{}'.format(i)].map(df['x_y_bin{}'.format(i)].value_counts())  
for stat in ['max', 'min']:
    # 统计x_bin1 y_bin1的最大最小值
    df['x_y_{}'.format(stat)] = df['y'] - df.groupby('x_bin1')['y'].transform(stat)
    df['y_x_{}'.format(stat)] = df['x'] - df.groupby('y_bin1')['x'].transform(stat)

new_cols = [i for i in df.columns if i not in pre_cols]
df[new_cols].head()

Unnamed: 0,v_bin,x_bin1,x_bin2,x_bin1_count,x_bin2_count,x_bin1_id_nunique,x_bin2_id_nunique,y_bin1,y_bin2,y_bin1_count,...,y_bin1_id_nunique,y_bin2_id_nunique,x_y_bin1,x_bin1_y_bin1_count,x_y_bin2,x_bin2_y_bin2_count,x_y_max,y_x_max,x_y_min,y_x_min
0,0.0,0,615.0,116,8,2,2,0,512.0,2,...,2,1,0,1,0,3,-115954.675157,0.0,0.0,49790.10676
1,0.0,1,615.0,2,8,2,2,1,512.0,2,...,1,1,1,1,0,3,0.0,0.0,53070.048324,808.872353
2,0.0,2,615.0,2,8,2,2,1,512.0,2,...,1,1,2,1,0,3,0.0,-808.872353,54707.512092,0.0
3,1.0,3,614.0,2,77,2,2,2,512.0,2,...,1,1,3,1,1,8,0.0,0.0,52951.29312,808.787673
4,2.0,4,614.0,2,77,2,2,2,512.0,2,...,1,1,4,1,1,8,0.0,-808.787673,55461.653028,0.0


In [69]:
def traj_to_bin(traj=None, x_min=12031967.16239096, x_max=14226964.881853,
                y_min=1623579.449434373, y_max=4689471.1780792,
                row_bins=4380, col_bins=3136):

    # Establish bins on x direction and y direction
    x_bins = np.linspace(x_min, x_max, endpoint=True, num=col_bins + 1) 
    y_bins = np.linspace(y_min, y_max, endpoint=True, num=row_bins + 1) 

    # Determine each x coordinate belong to which bin
    traj.sort_values(by='x', inplace=True)  # 排序
    x_res = np.zeros((len(traj), ))         # array([0., 0., 0., ..., 0., 0., 0.])
    j = 0
    for i in range(1, col_bins + 1):        # 1,2,3,4,5,6,,...,col_bins 
        low, high = x_bins[i-1], x_bins[i]
        while( j < len(traj)):
            # low - 0.001 for numeric stable.
            if (traj["x"].iloc[j] <= high) & (traj["x"].iloc[j] > low - 0.001): 
                #  假设有上面那种情况出现 x_res[j] 行= 等差数列中哪一行，要么就是0 因为数据乱套最大值最小值都很大，所以全是0 
                x_res[j] = i   
                j += 1
            else:
                break
    traj["x_grid"] = x_res
    traj["x_grid"] = traj["x_grid"].astype(int)  # 2.5 会变成2
    traj["x_grid"] = traj["x_grid"].apply(str)

    # Determine each y coordinate belong to which bin
    traj.sort_values(by='y', inplace=True)
    y_res = np.zeros((len(traj), ))
    j = 0
    for i in range(1, row_bins + 1):
        low, high = y_bins[i-1], y_bins[i]
        while( j < len(traj)):
            # low - 0.001 for numeric stable.
            if (traj["y"].iloc[j] <= high) & (traj["y"].iloc[j] > low - 0.001):
                y_res[j] = i
                j += 1
            else:
                break
    traj["y_grid"] = y_res
    traj["y_grid"] = traj["y_grid"].astype(int)
    traj["y_grid"] = traj["y_grid"].apply(str)

    # Determine which bin each coordinate belongs to.
    traj["no_bin"] = [i + "_" + j for i, j in zip(
        traj["x_grid"].values.tolist(), traj["y_grid"].values.tolist())]  
    traj.sort_values(by='time', inplace=True)
    return traj

bin_size = 800
col_bins = int((14226964.881853 - 12031967.16239096) / bin_size)
row_bins = int((4689471.1780792 - 1623579.449434373) / bin_size)

In [70]:
pre_cols = df.columns
# 特征x_grid,y_grid,no_bin
df = traj_to_bin(df)

new_cols = [i for i in df.columns if i not in pre_cols]
df[new_cols]     

Unnamed: 0,x_grid,y_grid,no_bin
1606,0,0,0_0
1605,0,0,0_0
1604,0,0,0_0
1603,0,0,0_0
1602,0,0,0_0
...,...,...,...
1988,0,0,0_0
1987,0,0,0_0
1986,0,0,0_0
1985,0,0,0_0


In [71]:
def find_save_visit_count_table(traj_data_df=None, bin_to_coord_df=None):
    """Find and save the visit frequency of each bin."""
    #  ["no_bin"]  x属于等差数列中哪一行 y属于等差数列哪一行新坐标结合 时间排序后
    visit_count_df = traj_data_df.groupby(["no_bin"]).count().reset_index()   
    visit_count_df = visit_count_df[["no_bin", "x"]]
    visit_count_df.rename({"x":"visit_count"}, axis=1, inplace=True)
    return visit_count_df

def find_save_unique_visit_count_table(traj_data_df=None, bin_to_coord_df=None):
    """Find and save the unique boat visit count of each bin."""
    # 构造一张含有信息的表  并不是在原有的表上修改
    unique_boat_count_df = traj_data_df.groupby(["no_bin"])["id"].nunique().reset_index()
    # 重命名
    unique_boat_count_df.rename({"id":"visit_boat_count"}, axis=1, inplace=True)
    # 合并 
    unique_boat_count_df_save = pd.merge(bin_to_coord_df, unique_boat_count_df,
                                         on="no_bin", how="left")
    return unique_boat_count_df

traj_df = df[["id","x", "y",'time',"no_bin"]]
bin_to_coord_df = traj_df.groupby(["no_bin"]).median().reset_index()
bin_to_coord_df   

Unnamed: 0,no_bin,x,y
0,0_0,6124951.0,5130672.0


In [72]:
pre_cols = df.columns

# DataFrame tmp for finding POIs
visit_count_df = find_save_visit_count_table(
    traj_df, bin_to_coord_df)   
unique_boat_count_df = find_save_unique_visit_count_table(
    traj_df, bin_to_coord_df)

# # 特征'visit_count','visit_boat_count'   把前2个函数所得东西结合起来
df = df.merge(visit_count_df,on='no_bin',how='left')
df = df.merge(unique_boat_count_df,on='no_bin',how='left')

new_cols = [i for i in df.columns if i not in pre_cols]  
df[new_cols].head()

Unnamed: 0,visit_count,visit_boat_count
0,2001,6
1,2001,6
2,2001,6
3,2001,6
4,2001,6


In [78]:
pre_cols = df.columns

g = df.groupby('id')   
for f in ['x', 'y']:
    #对x,y坐标进行时间平移 1 -1 2
    df[f + '_prev_diff'] = df[f] - g[f].shift(1)
    df[f + '_next_diff'] = df[f] - g[f].shift(-1)
    df[f + '_prev_next_diff'] = g[f].shift(1) - g[f].shift(-1)
    ## 三角形求解上时刻1距离  下时刻-1距离 2距离   上一时间  和下一时间距离
df['dist_move_prev'] = np.sqrt(np.square(df['x_prev_diff']) + np.square(df['y_prev_diff']))
df['dist_move_next'] = np.sqrt(np.square(df['x_next_diff']) + np.square(df['y_next_diff']))
# 相隔2个时间段的距离
df['dist_move_prev_next'] = np.sqrt(np.square(df['x_prev_next_diff']) + np.square(df['y_prev_next_diff']))
df['dist_move_prev_bin'] = pd.qcut(df['dist_move_prev'], 50, duplicates='drop')# 2时刻距离等频分箱50  
df['dist_move_prev_bin'] = df['dist_move_prev_bin'].map(
    dict(zip(df['dist_move_prev_bin'].unique(), range(df['dist_move_prev_bin'].nunique())))    
) 
new_cols = [i for i in df.columns if i not in pre_cols]
df[new_cols].head()

0
1
2
3
4


In [79]:
pre_cols = df.columns

def start(x):
    try:
        return x[0]
    except:
        return None

def end(x):
    try:
        return x[-1]
    except:
        return None


def mode(x):
    try:
        return pd.Series(x).value_counts().index[0]  
    except:
        return None

for f in ['dist_move_prev_bin', 'v_bin']:
    # 上一时刻类别 速度类别映射处理
    df[f + '_sen'] = df['id'].map(df.groupby('id')[f].agg(lambda x: ','.join(x.astype(str))))
    
    # 一系列基本统计量特征 每列执行相应的操作
g = df.groupby('id').agg({
    # numpy.ptp()函数计算数组中元素最大值与最小值的差（最大值 - 最小值）。
    # https://www.runoob.com/numpy/numpy-statistical-functions.html
    'id': ['count'], 'x_bin1': [mode], 'y_bin1': [mode], 'x_bin2': [mode], 'y_bin2': [mode], 'x_y_bin1': [mode],
    'x': ['mean', 'max', 'min', 'std', np.ptp, start, end],
    'y': ['mean', 'max', 'min', 'std', np.ptp, start, end],
    'v': ['mean', 'max', 'min', 'std', np.ptp], 'dir': ['mean'],
    'x_bin1_count': ['mean'], 'y_bin1_count': ['mean', 'max', 'min'],
    'x_bin2_count': ['mean', 'max', 'min'], 'y_bin2_count': ['mean', 'max', 'min'],
    'x_bin1_y_bin1_count': ['mean', 'max', 'min'],
    'dist_move_prev': ['mean', 'max', 'std', 'min', 'sum'],
    'x_y_min': ['mean', 'min'], 'y_x_min': ['mean', 'min'],
    'x_y_max': ['mean', 'min'], 'y_x_max': ['mean', 'min'],
}).reset_index()
g.columns = ['_'.join(col).strip() for col in g.columns] 
g.rename(columns={'id_': 'id'}, inplace=True) 
cols = [f for f in g.keys() if f != 'id'] 

In [80]:
# g是一系列基本统计量特征 每列执行相应的操作
df = df.merge(g,on='id',how='left')

new_cols = [i for i in df.columns if i not in pre_cols]   # 统计出新添的列
df[new_cols].head()

Unnamed: 0,dist_move_prev_bin_sen,v_bin_sen,id_count,x_bin1_mode,y_bin1_mode,x_bin2_mode,y_bin2_mode,x_y_bin1_mode,x_mean,x_max,...,dist_move_prev_min,dist_move_prev_sum,x_y_min_mean,x_y_min_min,y_x_min_mean,y_x_min_min,x_y_max_mean,x_y_max_min,y_x_max_mean,y_x_max_min
0,"nan,1.0,1.0,2.0,3.0,4.0,2.0,5.0,3.0,5.0,5.0,5....","19.0,26.0,19.0,2.0,16.0,0.0,30.0,19.0,19.0,0.0...",411,145,88,611.0,508.0,252,6123711.0,6151439.0,...,0.0,381420.840554,2458.92664,0.0,4603.814472,0.0,-5075.500661,-57432.286364,-3493.862248,-32066.348374
1,"nan,1.0,1.0,2.0,3.0,4.0,2.0,5.0,3.0,5.0,5.0,5....","19.0,26.0,19.0,2.0,16.0,0.0,30.0,19.0,19.0,0.0...",411,145,88,611.0,508.0,252,6123711.0,6151439.0,...,0.0,381420.840554,2458.92664,0.0,4603.814472,0.0,-5075.500661,-57432.286364,-3493.862248,-32066.348374
2,"nan,1.0,1.0,2.0,3.0,4.0,2.0,5.0,3.0,5.0,5.0,5....","19.0,26.0,19.0,2.0,16.0,0.0,30.0,19.0,19.0,0.0...",411,145,88,611.0,508.0,252,6123711.0,6151439.0,...,0.0,381420.840554,2458.92664,0.0,4603.814472,0.0,-5075.500661,-57432.286364,-3493.862248,-32066.348374
3,"nan,1.0,1.0,2.0,3.0,4.0,2.0,5.0,3.0,5.0,5.0,5....","19.0,26.0,19.0,2.0,16.0,0.0,30.0,19.0,19.0,0.0...",411,145,88,611.0,508.0,252,6123711.0,6151439.0,...,0.0,381420.840554,2458.92664,0.0,4603.814472,0.0,-5075.500661,-57432.286364,-3493.862248,-32066.348374
4,"nan,1.0,1.0,2.0,3.0,4.0,2.0,5.0,3.0,5.0,5.0,5....","19.0,26.0,19.0,2.0,16.0,0.0,30.0,19.0,19.0,0.0...",411,145,88,611.0,508.0,252,6123711.0,6151439.0,...,0.0,381420.840554,2458.92664,0.0,4603.814472,0.0,-5075.500661,-57432.286364,-3493.862248,-32066.348374


In [81]:
def group_feature(df, key, target, aggs,flag):  
#     agg_dict = {}
    agg_list = []
    for ag in aggs:
#         agg_dict['{}_{}_{}'.format(target,ag,flag)] = ag   direction_max,mean,std,_0或1
#     print(agg_dict)
#     t = df.groupby(key)[target].agg([(agg_dict)]).reset_index()
        agg_list.append(['{}_{}_{}'.format(target,ag,flag),ag])
    t = df.groupby(key)[target].agg(agg_list).reset_index()     
    return t                      # [[name,function],[name1,function1]....]

def extract_feature(df, train, flag):    
    '''
    统计feature
    注意理解group_feature的使用和效果
    '''
    
    if (flag == 'on_night') or (flag == 'on_day'): 
        t = group_feature(df, 'ship','speed',['max','mean','median','std','skew'],flag)
        train = pd.merge(train, t, on='ship', how='left')
        # return train
    
    if flag == "0":
        t = group_feature(df, 'ship','direction',['max','median','mean','std','skew'],flag)
        # 合并的是去重的后的data  ['max','median','mean','std','skew'] 表的合并 df.groupby(key)['direction']  这个是黑夜的数据  
        train = pd.merge(train, t, on='ship', how='left')  
    elif flag == "1":
        t = group_feature(df, 'ship','speed',['max','mean','median','std','skew'],flag)
        # 合并的是去重的后的data  ['max','median','mean','std','skew'] 表的合并 df.groupby(key)['speed']      这个是白天的数据  
        train = pd.merge(train, t, on='ship', how='left')   
        t = group_feature(df, 'ship','direction',['max','median','mean','std','skew'],flag)
        # 合并的是去重的后的data  ['max','median','mean','std','skew'] 表的合并 df.groupby(key)['direction']  这个是白天的数据  
        train = pd.merge(train, t, on='ship', how='left') 
        
        # .nunique().to_dict() 将nunique得到的对应唯一值统计量做成字典
        # to_dict() 与 map的使用可以很方便地构建一些统计量映射特征，如CTR（分类）问题中的转化率
        # 提问： 如果根据训练集给定的label(0,1)来构建训练集+测试集的转化率特征，注：测试集与训练集存在部分id相同
        # 答 遇到这种问题  立即推，立马放弃   
        # https://blog.csdn.net/weixin_39791387/article/details/87627235
        hour_nunique = df.groupby('ship')['speed'].nunique().to_dict()  # 不理解 做成list的字典 还是series字典  亦或是dict字典
        # flag是01白天黑夜 这个数有多少个 之前是dict(zip( ..unique(),range(..nunique()) ))   数量映射  没有构造新表 直接替换掉了数据
        train['speed_nunique_{}'.format(flag)] = train['ship'].map(hour_nunique)  
        
        hour_nunique = df.groupby('ship')['direction'].nunique().to_dict()   # 同上
        train['direction_nunique_{}'.format(flag)] = train['ship'].map(hour_nunique)  #       数量映射  没有构造新表 直接替换掉了数据
    
    # 合并的是去重的后的data  ['max','median','mean','std','skew'] 表的合并 df.groupby(key)['direction']  白天黑夜  x y 
    # 合并speed direction x y base_dis_diff  这些表   有种感觉好像前面干过这事
    t = group_feature(df, 'ship','x',['max','min','mean','median','std','skew'],flag)
    train = pd.merge(train, t, on='ship', how='left')
    t = group_feature(df, 'ship','y',['max','min','mean','median','std','skew'],flag)
    train = pd.merge(train, t, on='ship', how='left')
    t = group_feature(df, 'ship','base_dis_diff',['max','min','mean','std','skew'],flag)
    train = pd.merge(train, t, on='ship', how='left')

    #  flag 是 01 白天黑夜  
    train['x_max_x_min_{}'.format(flag)] = train['x_max_{}'.format(flag)] - train['x_min_{}'.format(flag)]
    train['y_max_y_min_{}'.format(flag)] = train['y_max_{}'.format(flag)] - train['y_min_{}'.format(flag)]
    train['y_max_x_min_{}'.format(flag)] = train['y_max_{}'.format(flag)] - train['x_min_{}'.format(flag)]
    train['x_max_y_min_{}'.format(flag)] = train['x_max_{}'.format(flag)] - train['y_min_{}'.format(flag)]
    
   
    train['slope_{}'.format(flag)] = train['y_max_y_min_{}'.format(flag)] / np.where(train['x_max_x_min_{}'.format(flag)]==0, 0.001, train['x_max_x_min_{}'.format(flag)])
    # 面积  可以说是渔船的活动区域
    train['area_{}'.format(flag)] = train['x_max_x_min_{}'.format(flag)] * train['y_max_y_min_{}'.format(flag)] 
    
    mode_hour = df.groupby('ship')['hour'].agg(lambda x:x.value_counts().index[0]).to_dict()  # 数量映射的字典  index(0) 是所有value_counts()值
    train['mode_hour_{}'.format(flag)] = train['ship'].map(mode_hour)     # 数量映射    下面是中位数 斜率slope
    train['slope_median_{}'.format(flag)] = train['y_median_{}'.format(flag)] / np.where(train['x_median_{}'.format(flag)]==0, 0.001, train['x_median_{}'.format(flag)])

    return train

In [82]:
data  = df.copy()

data.rename(columns={
    'id':'ship',
    'v':'speed',
    'dir':'direction'
},inplace=True)

data_label = data.drop_duplicates(['ship'],keep = 'first')


data_1 = data[data['speed']==0]
data_2 = data[data['speed']!=0]

data_label = extract_feature(data_1, data_label,"0")   
data_label = extract_feature(data_2, data_label,"1")   

data_1 = data[data['day_nig'] == 0]
data_2 = data[data['day_nig'] == 1]
data_label = extract_feature(data_1, data_label,"on_night")
data_label = extract_feature(data_2, data_label,"on_day")    
data_label.rename(columns={'ship':'id','speed':'v','direction':'dir'},inplace=True)
data.head()  

Unnamed: 0,ship,x,y,speed,direction,time,label,date,hour,month,...,dist_move_prev_min,dist_move_prev_sum,x_y_min_mean,x_y_min_min,y_x_min_mean,y_x_min_min,x_y_max_mean,x_y_max_min,y_x_max_mean,y_x_max_min
0,100,6142732.0,5077130.0,3.78,117,1900-10-28 00:01:18,0,1900-10-28,0,10,...,0.0,381420.840554,2458.92664,0.0,4603.814472,0.0,-5075.500661,-57432.286364,-3493.862248,-32066.348374
1,100,6143644.0,5076674.0,3.45,114,1900-10-28 00:11:18,0,1900-10-28,0,10,...,0.0,381420.840554,2458.92664,0.0,4603.814472,0.0,-5075.500661,-57432.286364,-3493.862248,-32066.348374
2,100,6144556.0,5076218.0,3.78,122,1900-10-28 00:21:19,0,1900-10-28,0,10,...,0.0,381420.840554,2458.92664,0.0,4603.814472,0.0,-5075.500661,-57432.286364,-3493.862248,-32066.348374
3,100,6145474.0,5076198.0,3.18,115,1900-10-28 00:31:18,0,1900-10-28,0,10,...,0.0,381420.840554,2458.92664,0.0,4603.814472,0.0,-5075.500661,-57432.286364,-3493.862248,-32066.348374
4,100,6146072.0,5075205.0,3.99,142,1900-10-28 00:41:19,0,1900-10-28,0,10,...,0.0,381420.840554,2458.92664,0.0,4603.814472,0.0,-5075.500661,-57432.286364,-3493.862248,-32066.348374


In [83]:
new_cols = [i for i in data_label.columns if i not in df.columns]
df = df.merge(data_label[new_cols+['id']],on='id',how='left')

df[new_cols].head()

Unnamed: 0,direction_max_0,direction_median_0,direction_mean_0,direction_std_0,direction_skew_0,x_max_0,x_min_0,x_mean_0,x_median_0,x_std_0,...,base_dis_diff_std_on_day,base_dis_diff_skew_on_day,x_max_x_min_on_day,y_max_y_min_on_day,y_max_x_min_on_day,x_max_y_min_on_day,slope_on_day,area_on_day,mode_hour_on_day,slope_median_on_day
0,0,0.0,0.0,0.0,0.0,6102751.0,6102751.0,6102751.0,6102751.0,0.0,...,9650.263589,-0.389598,45396.666092,43135.705758,-989573.982047,1078106.0,0.950195,1958217000.0,19,0.831333
1,0,0.0,0.0,0.0,0.0,6102751.0,6102751.0,6102751.0,6102751.0,0.0,...,9650.263589,-0.389598,45396.666092,43135.705758,-989573.982047,1078106.0,0.950195,1958217000.0,19,0.831333
2,0,0.0,0.0,0.0,0.0,6102751.0,6102751.0,6102751.0,6102751.0,0.0,...,9650.263589,-0.389598,45396.666092,43135.705758,-989573.982047,1078106.0,0.950195,1958217000.0,19,0.831333
3,0,0.0,0.0,0.0,0.0,6102751.0,6102751.0,6102751.0,6102751.0,0.0,...,9650.263589,-0.389598,45396.666092,43135.705758,-989573.982047,1078106.0,0.950195,1958217000.0,19,0.831333
4,0,0.0,0.0,0.0,0.0,6102751.0,6102751.0,6102751.0,6102751.0,0.0,...,9650.263589,-0.389598,45396.666092,43135.705758,-989573.982047,1078106.0,0.950195,1958217000.0,19,0.831333


In [84]:
temp = df.copy()
temp.rename(columns={'id':'ship','dir':'d'},inplace=True)

def coefficient_of_variation(x):
    x = x.values
    if np.mean(x) == 0:
        return 0
    return np.std(x) / np.mean(x)

def max_2(x):
    x = list(x.values)
    x.sort(reverse=True)
    return x[1]

def max_3(x):
    x = list(x.values)
    x.sort(reverse=True)
    return x[2]

def diff_abs_mean(x):  # 统计特征 deta绝对值均值
    return np.mean(np.abs(np.diff(x)))

f1 = pd.DataFrame()
for col in ['x', 'y', 'v', 'd']:
    features = temp.groupby('ship', as_index=False)[col].agg({
        '{}_min'.format(col): 'min',
        '{}_max'.format(col): 'max',
        '{}_mean'.format(col): 'mean',
        '{}_median'.format(col): 'median',
        '{}_std'.format(col): 'std',
        '{}_skew'.format(col): 'skew',
        '{}_sum'.format(col): 'sum',
        '{}_diff_abs_mean'.format(col): diff_abs_mean,
        '{}_mode'.format(col): lambda x: x.value_counts().index[0],
        '{}_coefficient_of_variation'.format(col): coefficient_of_variation,
        '{}_max2'.format(col): max_2,  # 排序后 返回最大值
        '{}_max3'.format(col): max_3   # 排序后 返回第二大的值
    })
    if f1.shape[0] == 0:
        f1 = features
    else:
        f1 = f1.merge(features, on='ship', how='left')
   
f1['x_max_x_min'] = f1['x_max'] - f1['x_min']
f1['y_max_y_min'] = f1['y_max'] - f1['y_min']
f1['y_max_x_min'] = f1['y_max'] - f1['x_min']
f1['x_max_y_min'] = f1['x_max'] - f1['y_min']
f1['slope'] = f1['y_max_y_min'] / np.where(f1['x_max_x_min'] == 0, 0.001, f1['x_max_x_min'])
f1['area'] = f1['x_max_x_min'] * f1['y_max_y_min']
f1['dis_max_min'] = (f1['x_max_x_min'] ** 2 + f1['y_max_y_min'] ** 2) ** 0.5   
f1['dis_mean'] = (f1['x_mean'] ** 2 + f1['y_mean'] ** 2) ** 0.5               
f1['area_d_dis_max_min'] = f1['area'] / f1['dis_max_min']                     

.  
temp.sort_values(['ship', 'time'], ascending=True, inplace=True)
temp['ynext'] = temp.groupby('ship')['y'].shift(-1)
temp['xnext'] = temp.groupby('ship')['x'].shift(-1)
temp['ynext'] = temp['ynext'].fillna(method='ffill')
temp['xnext'] = temp['xnext'].fillna(method='ffill')
temp['timenext'] = temp.groupby('ship')['time'].shift(-1)
temp['timediff'] = np.abs(temp['timenext'] - temp['time'])
temp['a_y'] = temp.apply(lambda x: (x['ynext'] - x['y']) / x['timediff'].total_seconds(), axis=1)# y上的速度
temp['a_x'] = temp.apply(lambda x: (x['xnext'] - x['x']) / x['timediff'].total_seconds(), axis=1)# x 上的速度
for col in ['a_y', 'a_x']:
    f2 = temp.groupby('ship', as_index=False)[col].agg({
        '{}_max'.format(col): 'max',
        '{}_mean'.format(col): 'mean',
        '{}_min'.format(col): 'min',
        '{}_median'.format(col): 'median',
        '{}_std'.format(col): 'std'})
    f1 = f1.merge(f2, on='ship', how='left')

# 曲率  
temp['y_pre'] = temp.groupby('ship')['y'].shift(1)
temp['x_pre'] = temp.groupby('ship')['x'].shift(1)
temp['y_pre'] = temp['y_pre'].fillna(method='bfill')
temp['x_pre'] = temp['x_pre'].fillna(method='bfill')   # 向前插值 
temp['d_pre'] = ((temp['x'] - temp['x_pre']) ** 2 + (temp['y'] - temp['y_pre']) ** 2) ** 0.5
temp['d_next'] = ((temp['xnext'] - temp['x']) ** 2 + (temp['ynext'] - temp['y']) ** 2) ** 0.5
temp['d_pre_next'] = ((temp['xnext'] - temp['x_pre']) ** 2 + (temp['ynext'] - temp['y_pre']) ** 2) ** 0.5
temp['curvature'] = (temp['d_pre'] + temp['d_next']) / temp['d_pre_next']

f2 = temp.groupby('ship', as_index=False)['curvature'].agg({
    'curvature_max': 'max',
    'curvature_mean': 'mean',
    'curvature_min': 'min',
    'curvature_median': 'median',
    'curvature_std': 'std'})
f1 = f1.merge(f2, on='ship', how='left')

SyntaxError: invalid syntax (<ipython-input-84-14183564f6d8>, line 54)

In [85]:
def traj_cbow_embedding(traj_data_corpus=None, embedding_size=70,   # traj_data_corpus =df
                        iters=40, min_count=3, window_size=25,
                        seed=9012, num_runs=5, word_feat="no_bin"):
    """CBOW embedding for trajectory data."""
    boat_id = traj_data_corpus['id'].unique()  # df.['id'].unique()
    sentences, embedding_df_list, embedding_model_list = [], [], []
    for i in boat_id:
        traj = traj_data_corpus[traj_data_corpus['id']==i]  
        sentences.append(traj[word_feat].values.tolist())   
    print("\n@Start CBOW word embedding at {}".format(datetime.now()))
    print("-------------------------------------------")
    for i in tqdm(range(num_runs)):     # 为什么要循环？ 
        model = Word2Vec(sentences, vector_size=embedding_size,
                                  min_count=min_count,
                                  workers=mp.cpu_count(),
                                  window=window_size,
                                  seed=seed, epochs=iters, sg=0)   # 初始化模型  调用接口

        # Sentance vector
#         print(str(model))
        embedding_vec = []
    # https://www.runoob.com/python/python-func-enumerate.html 与猜测一至  类似于olist()
        for ind, seq in enumerate(sentences):  
            seq_vec, word_count = 0, 0
            for word in seq:                  
                if word not in model.wv:      
                    continue                 
                else:                        
                    seq_vec += model.wv[word] 
                    word_count += 1          
            if word_count == 0:            
                embedding_vec.append(embedding_size * [0])  
            else:
                embedding_vec.append(seq_vec / word_count)  
        embedding_vec = np.array(embedding_vec)             
        embedding_cbow_df = pd.DataFrame(embedding_vec, 
            columns=["embedding_cbow_{}_{}".format(word_feat, i) for i in range(embedding_size)])  # 弄成pd.DataFrame格式
        embedding_cbow_df["id"] = boat_id        
        embedding_df_list.append(embedding_cbow_df)   
        embedding_model_list.append(model)            
    print("-------------------------------------------")
    print("@End CBOW word embedding at {}".format(datetime.now()))
    return embedding_df_list, embedding_model_list
 

In [86]:
embedding_size=70
iters=70
min_count=3
window_size=25
num_runs=1

df_list, model_list = traj_cbow_embedding(df,
                                          embedding_size=embedding_size,
                                          iters=iters, min_count=min_count,
                                          window_size=window_size,
                                          seed=9012,
                                          num_runs=num_runs,
                                          word_feat="no_bin")

train_embedding_df_list = [d.reset_index(drop=True) for d in df_list]  
fea = pd.DataFrame(fea)  

  0%|                                                                                            | 0/1 [00:00<?, ?it/s]


@Start CBOW word embedding at 2021-04-20 23:40:24.118562
-------------------------------------------


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.96it/s]

-------------------------------------------
@End CBOW word embedding at 2021-04-20 23:40:24.630196





In [87]:
pre_cols = df.columns
df = df.merge(fea,on='id',how='left')


new_cols = [i for i in df.columns if i not in pre_cols]
df[new_cols].head()   

Unnamed: 0,embedding_cbow_no_bin_0,embedding_cbow_no_bin_1,embedding_cbow_no_bin_2,embedding_cbow_no_bin_3,embedding_cbow_no_bin_4,embedding_cbow_no_bin_5,embedding_cbow_no_bin_6,embedding_cbow_no_bin_7,embedding_cbow_no_bin_8,embedding_cbow_no_bin_9,...,embedding_cbow_no_bin_60,embedding_cbow_no_bin_61,embedding_cbow_no_bin_62,embedding_cbow_no_bin_63,embedding_cbow_no_bin_64,embedding_cbow_no_bin_65,embedding_cbow_no_bin_66,embedding_cbow_no_bin_67,embedding_cbow_no_bin_68,embedding_cbow_no_bin_69
0,-0.683458,0.37352,0.49113,0.505543,0.689314,-0.214121,-0.05189,0.639845,0.51848,-0.002312,...,0.584187,-0.771213,0.680816,0.550508,0.275442,0.154472,-0.88218,-0.295531,-0.38618,-0.169952
1,-0.683458,0.37352,0.49113,0.505543,0.689314,-0.214121,-0.05189,0.639845,0.51848,-0.002312,...,0.584187,-0.771213,0.680816,0.550508,0.275442,0.154472,-0.88218,-0.295531,-0.38618,-0.169952
2,-0.683458,0.37352,0.49113,0.505543,0.689314,-0.214121,-0.05189,0.639845,0.51848,-0.002312,...,0.584187,-0.771213,0.680816,0.550508,0.275442,0.154472,-0.88218,-0.295531,-0.38618,-0.169952
3,-0.683458,0.37352,0.49113,0.505543,0.689314,-0.214121,-0.05189,0.639845,0.51848,-0.002312,...,0.584187,-0.771213,0.680816,0.550508,0.275442,0.154472,-0.88218,-0.295531,-0.38618,-0.169952
4,-0.683458,0.37352,0.49113,0.505543,0.689314,-0.214121,-0.05189,0.639845,0.51848,-0.002312,...,0.584187,-0.771213,0.680816,0.550508,0.275442,0.154472,-0.88218,-0.295531,-0.38618,-0.169952


In [88]:
boat_id = df['id'].unique()
total_embedding = pd.DataFrame(boat_id, columns=["id"])
traj_data = df[['v','dir','id']].rename(columns = {'v':'speed','dir':'direction'})  
# Step 1: Construct the words
traj_data_corpus = []
traj_data["speed_str"]     = traj_data["speed"].apply(lambda x: str(int(x*100))) 
traj_data["direction_str"] = traj_data["direction"].apply(str)  
traj_data["speed_dir_str"] = traj_data["speed_str"] + "_" + traj_data["direction_str"] 
traj_data_corpus = traj_data[["id", "speed_str",
                                  "direction_str", "speed_dir_str"]]   
print("\n@Round 2 speed embedding:")
df_list, model_list = traj_cbow_embedding(traj_data_corpus,
                                          embedding_size=10,
                                          iters=40, min_count=3,
                                          window_size=25, seed=9102,
                                          num_runs=1, word_feat="speed_str")  
speed_embedding = df_list[0].reset_index(drop=True)   
total_embedding = pd.merge(total_embedding, speed_embedding,
                           on="id", how="left")     


print("\n@Round 2 direction embedding:")
df_list, model_list = traj_cbow_embedding(traj_data_corpus,
                                          embedding_size=12,
                                          iters=70, min_count=3,
                                          window_size=25, seed=9102,
                                          num_runs=1, word_feat="speed_dir_str")
speed_dir_embedding = df_list[0].reset_index(drop=True)
total_embedding = pd.merge(total_embedding, speed_dir_embedding,
                           on="id", how="left")

  0%|                                                                                            | 0/1 [00:00<?, ?it/s]


@Round 2 speed embedding:

@Start CBOW word embedding at 2021-04-20 23:40:25.663431
-------------------------------------------


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.44it/s]
  0%|                                                                                            | 0/1 [00:00<?, ?it/s]

-------------------------------------------
@End CBOW word embedding at 2021-04-20 23:40:26.076328

@Round 2 direction embedding:

@Start CBOW word embedding at 2021-04-20 23:40:26.089291
-------------------------------------------


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.61it/s]

-------------------------------------------
@End CBOW word embedding at 2021-04-20 23:40:26.712626





In [89]:
pre_cols = df.columns
df = df.merge(total_embedding,on='id',how='left')

new_cols = [i for i in df.columns if i not in pre_cols]
df[new_cols].head()

Unnamed: 0,embedding_cbow_speed_str_0,embedding_cbow_speed_str_1,embedding_cbow_speed_str_2,embedding_cbow_speed_str_3,embedding_cbow_speed_str_4,embedding_cbow_speed_str_5,embedding_cbow_speed_str_6,embedding_cbow_speed_str_7,embedding_cbow_speed_str_8,embedding_cbow_speed_str_9,...,embedding_cbow_speed_dir_str_2,embedding_cbow_speed_dir_str_3,embedding_cbow_speed_dir_str_4,embedding_cbow_speed_dir_str_5,embedding_cbow_speed_dir_str_6,embedding_cbow_speed_dir_str_7,embedding_cbow_speed_dir_str_8,embedding_cbow_speed_dir_str_9,embedding_cbow_speed_dir_str_10,embedding_cbow_speed_dir_str_11
0,0.336223,-0.174119,-0.080473,-1.693435,1.324486,2.001523,-0.155504,-3.078485,1.406705,1.961885,...,-1.660313,1.619857,-1.981154,2.068269,-2.446373,-2.371182,-1.59563,0.49088,2.649754,1.255642
1,0.336223,-0.174119,-0.080473,-1.693435,1.324486,2.001523,-0.155504,-3.078485,1.406705,1.961885,...,-1.660313,1.619857,-1.981154,2.068269,-2.446373,-2.371182,-1.59563,0.49088,2.649754,1.255642
2,0.336223,-0.174119,-0.080473,-1.693435,1.324486,2.001523,-0.155504,-3.078485,1.406705,1.961885,...,-1.660313,1.619857,-1.981154,2.068269,-2.446373,-2.371182,-1.59563,0.49088,2.649754,1.255642
3,0.336223,-0.174119,-0.080473,-1.693435,1.324486,2.001523,-0.155504,-3.078485,1.406705,1.961885,...,-1.660313,1.619857,-1.981154,2.068269,-2.446373,-2.371182,-1.59563,0.49088,2.649754,1.255642
4,0.336223,-0.174119,-0.080473,-1.693435,1.324486,2.001523,-0.155504,-3.078485,1.406705,1.961885,...,-1.660313,1.619857,-1.981154,2.068269,-2.446373,-2.371182,-1.59563,0.49088,2.649754,1.255642


In [90]:
class nmf_list(object):
    def __init__(self,data,by_name,to_list,nmf_n,top_n):
        self.data = data
        self.by_name = by_name
        self.to_list = to_list
        self.nmf_n = nmf_n
        self.top_n = top_n

    def run(self,tf_n):
        df_all = self.data.groupby(self.by_name)[self.to_list].apply(lambda x :'|'.join(x)).reset_index()
        self.data =df_all.copy()

        print('bulid word_fre')
        # 词频的构建
        def word_fre(x):
            word_dict = []
            x = x.split('|')
            docs = []
            for doc in x:
                doc = doc.split()
                docs.append(doc)
                word_dict.extend(doc)
            word_dict = Counter(word_dict)
            new_word_dict = {}
            for key,value in word_dict.items():
                new_word_dict[key] = [value,0]
            del word_dict  
            del x
            for doc in docs:
                doc = Counter(doc)
                for word in doc.keys():
                    new_word_dict[word][1] += 1
            return new_word_dict 
        self.data['word_fre'] = self.data[self.to_list].apply(word_fre)

        print('bulid top_' + str(self.top_n))
        # 设定100个高频词
        def top_100(word_dict):
            return sorted(word_dict.items(),key = lambda x:(x[1][1],x[1][0]),reverse = True)[:self.top_n]
        self.data['top_'+str(self.top_n)] = self.data['word_fre'].apply(top_100)
        def top_100_word(word_list):
            words = []
            for i in word_list:
                i = list(i)
                words.append(i[0])
            return words 
        self.data['top_'+str(self.top_n)+'_word'] = self.data['top_' + str(self.top_n)].apply(top_100_word)
        # print('top_'+str(self.top_n)+'_word的shape')
        print(self.data.shape)

        word_list = []
        for i in self.data['top_'+str(self.top_n)+'_word'].values:
            word_list.extend(i)
        word_list = Counter(word_list)
        word_list = sorted(word_list.items(),key = lambda x:x[1],reverse = True)
        user_fre = []
        for i in word_list:
            i = list(i)
            user_fre.append(i[1]/self.data[self.by_name].nunique())
        stop_words = []
        for i,j in zip(word_list,user_fre):
            if j>0.5:
                i = list(i)
                stop_words.append(i[0])

        print('start title_feature')
        # 讲融合后的taglist当作一句话进行文本处理
        self.data['title_feature'] = self.data[self.to_list].apply(lambda x: x.split('|'))
        self.data['title_feature'] = self.data['title_feature'].apply(lambda line: [w for w in line if w not in stop_words])
        self.data['title_feature'] = self.data['title_feature'].apply(lambda x: ' '.join(x))

        print('start NMF')
        # 使用tfidf对元素进行处理
        tfidf_vectorizer = TfidfVectorizer(ngram_range=(tf_n,tf_n))
        tfidf = tfidf_vectorizer.fit_transform(self.data['title_feature'].values)
        #使用nmf算法，提取文本的主题分布
        text_nmf = NMF(n_components=self.nmf_n).fit_transform(tfidf)


        # 整理并输出文件
        name = [str(tf_n) + self.to_list + '_' +str(x) for x in range(1,self.nmf_n+1)]
        tag_list = pd.DataFrame(text_nmf)
        print(tag_list.shape)
        tag_list.columns = name
        tag_list[self.by_name] = self.data[self.by_name]
        column_name = [self.by_name] + name
        tag_list = tag_list[column_name]
        return tag_list

In [91]:
data = df.copy()
data.rename(columns={'v':'speed','id':'ship'},inplace=True)
for j in range(1,4):
    print('********* {} *******'.format(j))
    for i in ['speed','x','y']:
        data[i + '_str'] = data[i].astype(str)
        nmf = nmf_list(data,'ship',i + '_str',8,2)
        nmf_a = nmf.run(j)
        nmf_a.rename(columns={'ship':'id'},inplace=True)
        data_label = data_label.merge(nmf_a,on = 'id',how = 'left')

********* 1 *******
bulid word_fre
bulid top_2
(6, 5)
start title_feature
start NMF
(6, 8)
bulid word_fre
bulid top_2
(6, 5)
start title_feature
start NMF
(6, 8)
bulid word_fre
bulid top_2
(6, 5)
start title_feature
start NMF
(6, 8)
********* 2 *******
bulid word_fre
bulid top_2
(6, 5)
start title_feature
start NMF
(6, 8)
bulid word_fre
bulid top_2
(6, 5)
start title_feature
start NMF
(6, 8)
bulid word_fre
bulid top_2
(6, 5)
start title_feature
start NMF
(6, 8)
********* 3 *******
bulid word_fre
bulid top_2
(6, 5)
start title_feature
start NMF
(6, 8)
bulid word_fre
bulid top_2
(6, 5)
start title_feature
start NMF
(6, 8)
bulid word_fre
bulid top_2
(6, 5)
start title_feature
start NMF
(6, 8)


In [92]:
new_cols = [i for i in data_label.columns if i not in df.columns]
df = df.merge(data_label[new_cols+['id']],on='id',how='left')

df[new_cols].head()

Unnamed: 0,1speed_str_1,1speed_str_2,1speed_str_3,1speed_str_4,1speed_str_5,1speed_str_6,1speed_str_7,1speed_str_8,1x_str_1,1x_str_2,...,3x_str_7,3x_str_8,3y_str_1,3y_str_2,3y_str_3,3y_str_4,3y_str_5,3y_str_6,3y_str_7,3y_str_8
0,0.305834,0.0,0.000135,0.0,0.0,0.000602,0.710835,0.0,0.0,1.401298e-45,...,0.0,0.0,0.028499,0.0,0.0,0.0,0.0,0.225937,0.0,0.0
1,0.305834,0.0,0.000135,0.0,0.0,0.000602,0.710835,0.0,0.0,1.401298e-45,...,0.0,0.0,0.028499,0.0,0.0,0.0,0.0,0.225937,0.0,0.0
2,0.305834,0.0,0.000135,0.0,0.0,0.000602,0.710835,0.0,0.0,1.401298e-45,...,0.0,0.0,0.028499,0.0,0.0,0.0,0.0,0.225937,0.0,0.0
3,0.305834,0.0,0.000135,0.0,0.0,0.000602,0.710835,0.0,0.0,1.401298e-45,...,0.0,0.0,0.028499,0.0,0.0,0.0,0.0,0.225937,0.0,0.0
4,0.305834,0.0,0.000135,0.0,0.0,0.000602,0.710835,0.0,0.0,1.401298e-45,...,0.0,0.0,0.028499,0.0,0.0,0.0,0.0,0.225937,0.0,0.0
