In [30]:
import os
import pandas as pd
from tqdm import tqdm
import math

In [31]:
def get_dist(a, b, data):
    dist = data.apply(lambda x: 
                            abs(x['x'] * b - x['y'] + a) / math.sqrt(pow(a,2) + 1), axis=1)
    return dist

In [32]:
def is_in_tuowang_field(x, y):
    a_high = 511313.8459492484     
    b_high = 0.7873931030524742
    
    a_low = 511313.8459492484     
    b_low = 0.7373931030524742
    y_low = a_low + b_low * x
    y_high = a_high + b_high * x
    if y >= y_low and y <= y_high:
        return 1
    return 0
def is_in_weiwang_field(x, y):
    a_left =  -7351080.045174466   
    b_left =  2.0843575670393215
                      
    a_low = 461313.8459492484          
    b_low =   0.7373931030524742
    
    a_right =  17428106.622326583
    b_right =  -1.6036148716094172
    
    y_left = a_left + b_left * x
    y_right = a_right + b_right * x
    y_low = a_low + b_low * x
    
    if y <= y_left and y <= y_right and y >= y_low:
        return 1
    return 0

In [60]:
def get_feature(folder, train=True):
    file_list = os.listdir(folder)
    x = 'x'
    y='y' 
    v = '速度'
    d = '方向'
    t = 'type'
    feature = []
    # 线性回归方程的 a , b. ==> 直线为 b*x - y + a = 0
    tuo = [611313.8459492484, 0.7373931030524742]
    wei = [225281.54478179757, 0.809302650345197]
    ci = []
    for file_path in tqdm(file_list):
        df = pd.read_csv(os.path.join(folder, file_path))
        # 线性回归方程的 a , b. ==> 直线为 b*x - y + a = 0
        tuo_dist = get_dist(tuo[0], tuo[1], df)
        wei_dist = get_dist(wei[0], wei[1], df)
        
        x_max = df[x].max()
        x_min = df[x].min()
        
        y_max = df[y].max()
        y_min = df[y].min()
        
        s = (x_max - x_min) * (y_max - y_min)
        # 计算区域内的点数
        p_weiwang_field = 'weiwangpercent'
        p_tuowang_field = 'tuowangpercent'
        
        df[p_weiwang_field] = df.apply(lambda data: is_in_weiwang_field(data[x], data[y]), axis=1)
        df[p_tuowang_field] = df.apply(lambda data: is_in_tuowang_field(data[x], data[y]), axis=1)
        
        p_weiwang = df[p_weiwang_field].mean()
        p_tuowang = df[p_tuowang_field].mean()
        
        if train:
            if df[t][1] == '刺网':
                df = df[df[v] < 18]
            elif df[t][1] == '围网':
                df = df[df[v] < 40]
            elif df[t][1] == '拖网':
                df = df[df[v] < 25]
            feature.append([
                df[x].mean(), df[x].std(),df[x].quantile(0.1), df[x].median(),df[x].kurt(),df[x].skew(), 
                df[y].mean(), df[y].std(),df[y].quantile(0.1), df[y].median(),df[y].kurt(),df[y].skew(), 
                df[v].mean(),df[v].std(),  df[v].quantile(0.1), df[v].median(),df[v].kurt(),df[v].skew(),
                df[d].mean(), df[d].std(),df[d].quantile(0.1), df[d].median(),df[d].kurt(),df[d].skew(), 
                tuo_dist.mean(), tuo_dist.std(),tuo_dist.quantile(0.1), tuo_dist.median(),tuo_dist.kurt(),tuo_dist.skew(), 
                wei_dist.mean(), wei_dist.std(),wei_dist.quantile(0.1), wei_dist.median(),wei_dist.kurt(), wei_dist.skew(), 
                s,
                p_weiwang,
                p_tuowang,
                df[t][1]
            ])
        else:
            df = df[df[v] < 40]
            feature.append([
                df[x].mean(), df[x].std(),df[x].quantile(0.1), df[x].median(),df[x].kurt(),df[x].skew(),
                df[y].mean(), df[y].std(),df[y].quantile(0.1), df[y].median(),df[y].kurt(),df[y].skew(),
                df[v].mean(), df[v].std(),  df[v].quantile(0.1), df[v].median(),df[v].kurt(),df[v].skew(),
                df[d].mean(), df[d].std(),df[d].quantile(0.1), df[d].median(),df[d].kurt(),df[d].skew(),
                tuo_dist.mean(), tuo_dist.std(),tuo_dist.quantile(0.1), tuo_dist.median(),tuo_dist.kurt(),tuo_dist.skew(), 
                wei_dist.mean(), wei_dist.std(),wei_dist.quantile(0.1), wei_dist.median(),wei_dist.kurt(), wei_dist.skew(), 
                s,
                p_weiwang,
                p_tuowang,
                df['渔船ID'][1]
            ])
       
    return feature

In [34]:
def gen_header(train=True):
    list_name = ['x', 'y', 'v', 'd', 'tuo_dist', 'wei_dist']
    list_statistic_type = ['mean', 'std', 'quantile', 'median', 'kurt', 'skew']
    header_list = []
    for name in list_name:
        for sta_type in list_statistic_type:
            header_list.append(name + '_' + sta_type)
    header_list.append('s')
    header_list.append('p_weiwang')
    header_list.append('p_tuowang')
    if train:
        header_list.append('type')
    else:
        header_list.append('id')
    return header_list
# gen_header(False)

In [35]:
train_folder = './data/train'
train_list = os.listdir(train_folder)
file_path = os.path.join(train_folder, train_list[3])
# 线性回归方程的 a , b. ==> 直线为 b*x - y + a = 0
tuo = [611313.8459492484, 0.7373931030524742]
wei = [225281.54478179757, 0.809302650345197]
ci = []

In [36]:
train_feature = './data/train_feature.csv'
train_list = os.listdir(train_folder)

df = pd.read_csv(os.path.join(train_folder, train_list[5]))
print(len(df))
if df['type'][1] == '刺网':
    df = df[df['速度'] < 18]
    
elif df['type'][1] == '围网':
    df = df[df['速度'] < 40]
elif df['type'][1] == '拖网':
    df = df[df['速度'] < 25]
print(len(df))
df.head()

418
418


Unnamed: 0,渔船ID,x,y,速度,方向,time,type
0,2431,6150677.0,5206254.0,0.11,255,1120 23:57:38,拖网
1,2431,6150677.0,5206254.0,0.0,71,1120 23:47:33,拖网
2,2431,6150677.0,5206254.0,0.22,0,1120 23:37:26,拖网
3,2431,6150677.0,5206254.0,0.0,0,1120 23:27:30,拖网
4,2431,6150677.0,5206254.0,0.11,343,1120 23:17:37,拖网


In [37]:
tuo_dist = get_dist(tuo[0], tuo[1], df)
tuo_dist.describe()

count    418.000000
mean       0.082116
std        0.032689
min        0.001563
25%        0.097288
50%        0.097288
75%        0.097288
max        0.097465
dtype: float64

df.describe()

In [38]:
df['x'].describe()

count    4.180000e+02
mean     6.156782e+06
std      1.291076e+04
min      6.150677e+06
25%      6.150677e+06
50%      6.150677e+06
75%      6.150678e+06
max      6.191520e+06
Name: x, dtype: float64

In [39]:
df['x'].quantile(0.1)

6150676.55684983

In [40]:
df['x'].mode().max()

6150676.55684983

In [41]:
df['x'].skew()

1.7411518066701612

In [42]:
df['x'].kurt()

1.2108824542111307

In [43]:
df['x'].median()

6150676.55684983

In [44]:
feature = []
x = 'x'
y='y' 
v = '速度'
d = '方向'
feature.append(
[
    df[x].mean(), df[x].std(),df[x].quantile(0.1), df[x].median(),df[x].kurt(),df[x].skew(),
    df[y].mean(), df[y].std(),df[y].quantile(0.1), df[y].median(),df[y].kurt(),df[y].skew(), 
    df[v].mean(),df[v].std(),  df[v].quantile(0.1), df[v].median(),df[v].kurt(),df[v].skew(),
    df[d].mean(), df[d].std(),df[d].quantile(0.1), df[d].median(),df[d].kurt(),df[d].skew(), 
    tuo_dist.mean(), tuo_dist.std(),tuo_dist.quantile(0.1), tuo_dist.median(),tuo_dist.kurt(),tuo_dist.skew(), 
    df['type'][1],
]
)
feature

[[6156781.764826261,
  12910.755055464404,
  6150676.55684983,
  6150676.55684983,
  1.2108824542111307,
  1.7411518066701612,
  5200307.124997652,
  13549.697217824958,
  5171316.541708397,
  5206253.9928918965,
  2.1229692779215226,
  -1.9807332532472857,
  0.9477033492822966,
  2.0673264951252355,
  0.0,
  0.11,
  8.036296442669583,
  2.934759622087104,
  121.95215311004785,
  127.13193965045075,
  0.0,
  71.0,
  -1.3456328792245018,
  0.489220535276196,
  0.08211620512101836,
  0.032688632862351255,
  0.008546248547938184,
  0.09728828360269579,
  1.3202516955390724,
  -1.789281013905053,
  '拖网']]

In [45]:
dd = pd.DataFrame(feature) #header=['x_mean', 'x_std', 'y_mean', 'y_std', 'v_mean', 'v_std', 'd_mean', 'd_std'])
dd

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,30
0,6156782.0,12910.755055,6150677.0,6150677.0,1.210882,1.741152,5200307.0,13549.697218,5171317.0,5206254.0,...,71.0,-1.345633,0.489221,0.082116,0.032689,0.008546,0.097288,1.320252,-1.789281,拖网


In [46]:
train_feature = get_feature('./data/train')

100%|██████████| 7000/7000 [08:16<00:00, 14.11it/s]


In [47]:
train_feature[1]

[6173433.03569141,
 12250.695539090997,
 6165485.157781342,
 6165586.352808776,
 -0.35974635746453165,
 1.1529518256108688,
 5197880.794459517,
 6039.038082236932,
 5187878.841721196,
 5201785.646861269,
 -0.4047012331869104,
 -1.09196409086351,
 0.7575621890547264,
 1.7276194185135572,
 0.0,
 0.22,
 10.266954715247856,
 3.249271551411719,
 117.06716417910448,
 110.20607749858377,
 0.0,
 108.5,
 -0.9397648811064441,
 0.5176541559645483,
 0.056141384760857095,
 0.024441779257173703,
 0.011927990502712737,
 0.07199403116409227,
 -0.5357443264780306,
 -1.0869200997306896,
 0.10465334769672091,
 0.07021310020456525,
 0.058759007351248234,
 0.05913167096585557,
 -0.5321135572976532,
 1.0890427497379833,
 709815272.3386025,
 1.0,
 1.0,
 '刺网']

In [48]:
fea_pd = pd.DataFrame(train_feature)
fea_pd

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
0,6.247895e+06,4438.069089,6.246627e+06,6.246627e+06,13.235139,3.712603,5.239679e+06,5939.181795,5.241151e+06,5.241260e+06,...,0.186612,0.040645,0.175043,0.175043,11.122571,3.532651,8.723267e+08,1.000000,1.000000,拖网
1,6.173433e+06,12250.695539,6.165485e+06,6.165586e+06,-0.359746,1.152952,5.197881e+06,6039.038082,5.187879e+06,5.201786e+06,...,0.104653,0.070213,0.058759,0.059132,-0.532114,1.089043,7.098153e+08,1.000000,1.000000,刺网
2,7.053152e+06,12171.034972,7.028056e+06,7.057604e+06,0.108375,-1.241603,6.089065e+06,19760.522701,6.061634e+06,6.089506e+06,...,0.690907,0.094144,0.566603,0.739652,-0.163807,-0.662110,3.587348e+09,0.911846,0.129477,围网
3,6.711096e+06,60764.037193,6.658979e+06,6.663841e+06,-0.927983,0.631317,5.459586e+06,25339.521511,5.433949e+06,5.453345e+06,...,0.874476,0.142671,0.754016,0.804881,0.763413,1.095577,1.604409e+10,0.911765,0.651961,围网
4,6.194958e+06,12947.725471,6.178915e+06,6.199598e+06,-0.727048,-0.612727,5.134912e+06,24345.461517,5.113110e+06,5.128339e+06,...,0.461491,0.134213,0.162758,0.528028,0.299261,-1.231053,4.470275e+09,1.000000,1.000000,拖网
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6995,6.246219e+06,8.464273,6.246219e+06,6.246220e+06,136.327897,-11.712629,5.240884e+06,54.735139,5.240827e+06,5.240936e+06,...,0.175245,0.000246,0.175017,0.175017,-1.824149,0.015315,1.117086e+04,1.000000,1.000000,拖网
6996,6.182688e+06,31.795471,6.182677e+06,6.182677e+06,4.213629,2.488457,5.193143e+06,16.211066,5.193144e+06,5.193146e+06,...,0.158931,0.000146,0.158880,0.158880,9.312170,2.976058,1.142019e+04,1.000000,1.000000,拖网
6997,6.253173e+06,17.807910,6.253173e+06,6.253174e+06,29.649854,-0.933848,5.251121e+06,42.348460,5.251032e+06,5.251141e+06,...,0.154784,0.000189,0.154698,0.154698,1.719538,1.758882,2.266244e+04,1.000000,1.000000,刺网
6998,6.439924e+06,73751.742806,6.393008e+06,6.393108e+06,-0.411900,1.160379,5.507950e+06,52480.096897,5.474771e+06,5.474991e+06,...,0.314366,0.033708,0.259144,0.335273,-0.383506,-1.127126,2.725277e+10,1.000000,1.000000,围网


In [49]:
train_header = gen_header(train=True)
fea_pd.to_csv('./data/train_feature.csv', header=train_header, index=False)
print('done!')

done!


In [51]:
train = pd.read_csv('./data/train_feature.csv')

In [52]:
train.head()

Unnamed: 0,x_mean,x_std,x_quantile,x_median,x_kurt,x_skew,y_mean,y_std,y_quantile,y_median,...,wei_dist_mean,wei_dist_std,wei_dist_quantile,wei_dist_median,wei_dist_kurt,wei_dist_skew,s,p_weiwang,p_tuowang,type
0,6247895.0,4438.069089,6246627.0,6246627.0,13.235139,3.712603,5239679.0,5939.181795,5241151.0,5241260.0,...,0.186612,0.040645,0.175043,0.175043,11.122571,3.532651,872326700.0,1.0,1.0,拖网
1,6173433.0,12250.695539,6165485.0,6165586.0,-0.359746,1.152952,5197881.0,6039.038082,5187879.0,5201786.0,...,0.104653,0.070213,0.058759,0.059132,-0.532114,1.089043,709815300.0,1.0,1.0,刺网
2,7053152.0,12171.034972,7028056.0,7057604.0,0.108375,-1.241603,6089065.0,19760.522701,6061634.0,6089506.0,...,0.690907,0.094144,0.566603,0.739652,-0.163807,-0.66211,3587348000.0,0.911846,0.129477,围网
3,6711096.0,60764.037193,6658979.0,6663841.0,-0.927983,0.631317,5459586.0,25339.521511,5433949.0,5453345.0,...,0.874476,0.142671,0.754016,0.804881,0.763413,1.095577,16044090000.0,0.911765,0.651961,围网
4,6194958.0,12947.725471,6178915.0,6199598.0,-0.727048,-0.612727,5134912.0,24345.461517,5113110.0,5128339.0,...,0.461491,0.134213,0.162758,0.528028,0.299261,-1.231053,4470275000.0,1.0,1.0,拖网


In [53]:
test_feature = get_feature('./data/test', False)

100%|██████████| 2000/2000 [02:14<00:00, 14.83it/s]


In [54]:
test_feature

[[6070564.371428865,
  22312.849844347467,
  6045547.739336461,
  6063273.490824191,
  -1.207989654503558,
  0.41677410340449383,
  5056962.798502237,
  37910.562888120476,
  5015706.683231451,
  5041639.285481537,
  -1.2981016286874831,
  0.5609631235095911,
  3.18987714987715,
  2.3566551262086386,
  0.0,
  3.78,
  0.40790842265165583,
  0.35584994305549056,
  117.51597051597052,
  106.58868097411697,
  0.0,
  96.0,
  -1.237003379897983,
  0.32169412383618784,
  0.05151207931485474,
  0.038182292651769936,
  0.0022183947161225,
  0.05554445591274365,
  -1.355561272134165,
  -0.013937041647089123,
  0.36062688244594865,
  0.10294928175080725,
  0.2283504647564063,
  0.3720529359469755,
  -1.3636833551193592,
  0.0048410649500290875,
  7806696660.761701,
  1.0,
  1.0,
  8031],
 [6028782.033753841,
  16224.556857702504,
  6003303.243534199,
  6035423.698097376,
  4.984860846750333,
  -2.4307360078839437,
  5067789.6068957,
  45625.22531810108,
  4967409.449987763,
  5090446.067647391,
 

In [55]:
test_fea_pd = pd.DataFrame(test_feature)
test_fea_pd

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
0,6.070564e+06,22312.849844,6.045548e+06,6.063273e+06,-1.207990,0.416774,5.056963e+06,37910.562888,5.015707e+06,5.041639e+06,...,0.360627,0.102949,0.228350,0.372053,-1.363683,0.004841,7.806697e+09,1.0,1.000000,8031
1,6.028782e+06,16224.556858,6.003303e+06,6.035424e+06,4.984861,-2.430736,5.067790e+06,45625.225318,4.967409e+06,5.090446e+06,...,0.162469,0.149606,0.085759,0.085759,0.655856,1.566754,1.107011e+10,1.0,1.000000,8325
2,6.055753e+06,15157.384678,6.037212e+06,6.054560e+06,1.205639,0.781970,5.033935e+06,21864.626641,5.009458e+06,5.035167e+06,...,0.409637,0.075868,0.307247,0.435439,-0.931043,-0.488762,8.322459e+09,1.0,1.000000,7660
3,6.402757e+06,106.714784,6.402502e+06,6.402799e+06,2.545992,-2.097589,5.445714e+06,75.933083,5.445523e+06,5.445744e+06,...,0.171626,0.000086,0.171607,0.171607,68.796099,7.228130,1.645395e+05,1.0,1.000000,8408
4,6.908297e+06,40990.096129,6.845605e+06,6.929448e+06,-1.223294,-0.376982,5.720763e+06,43777.382740,5.655709e+06,5.721919e+06,...,0.423563,0.162224,0.165268,0.484435,0.207470,-0.378403,2.108038e+10,1.0,1.000000,7743
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,6.616165e+06,9461.075654,6.604791e+06,6.613218e+06,-0.446165,0.586986,5.709167e+06,55302.980540,5.636474e+06,5.699424e+06,...,0.574415,0.218869,0.305563,0.544038,-1.058879,0.403696,6.856166e+09,1.0,0.696658,8141
1996,6.205525e+06,8678.835100,6.194464e+06,6.205432e+06,0.692123,-0.387223,5.156691e+06,10767.051801,5.142127e+06,5.155810e+06,...,0.402779,0.022116,0.361927,0.407039,-0.256057,-0.802239,2.066108e+09,1.0,1.000000,8121
1997,6.330084e+06,75348.734658,6.246326e+06,6.361016e+06,-1.873226,-0.087818,5.262775e+06,22770.186452,5.241481e+06,5.257791e+06,...,0.379350,0.182781,0.172974,0.488357,-1.874917,-0.132352,1.280690e+10,1.0,1.000000,8575
1998,6.246525e+06,7.312907,6.246525e+06,6.246525e+06,187.478794,-13.729242,5.241151e+06,0.090316,5.241151e+06,5.241151e+06,...,0.175155,0.000027,0.175157,0.175157,187.478794,-13.729242,1.258218e+02,1.0,1.000000,8717


In [56]:
test_header = gen_header(train=False)
test_fea_pd.to_csv('./data/test_feature.csv', header=test_header, index=False)
print('done!')

done!


In [57]:
test_read = pd.read_csv('./data/test_feature.csv')

In [58]:
print(len(test_read['p_weiwang'] == 1))

2000


In [59]:
print(len(test_read['p_tuowang'] == 1))

2000
