In [9]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_text as tft

In [2]:
!ls datasets/ -alh

总用量 257M
drwxrwxr-x  2 zyt zyt 4.0K Nov  2 15:36 .
drwxrwxr-x 10 zyt zyt 4.0K Nov  9 22:11 ..
-rw-rw-r--  1 zyt zyt 5.9M Sep 28 11:01 df_test_corpus.tar.bz2
-rw-rw-r--  1 zyt zyt 6.5M Nov  9 17:43 df_test_line.tar.bz2
-rw-rw-r--  1 zyt zyt 106M Sep 28 11:01 df_train_corpus.tar.bz2
-rw-rw-r--  1 zyt zyt 126M Nov  9 17:43 df_train_line.tar.bz2
-rw-rw-r--  1 zyt zyt 6.4M Sep 28 11:01 df_valid_corpus.tar.bz2
-rw-rw-r--  1 zyt zyt 7.3M Nov  9 17:43 df_valid_line.tar.bz2


### 导入

In [3]:
save_path = 'datasets/'
df_train = pd.read_pickle(os.path.join(save_path, 'df_train_line.tar.bz2'))
df_test = pd.read_pickle(os.path.join(save_path, 'df_test_line.tar.bz2'))
df_valid = pd.read_pickle(os.path.join(save_path, 'df_valid_line.tar.bz2'))

### 编码

验证集编码

In [4]:
from utils.CodeTokenizer import CodeSplitTokenizer
tokenizer = CodeSplitTokenizer('./vocabs/split_keyword_vocab50000.txt')
valid_data = tokenizer.from_lines_to_token_input(df_valid['data'])
valid_data

array([[ 248,  448,   69, ...,    0,    0,    0],
       [  17,   14,  115, ...,    0,    0,    0],
       [ 341, 4332,    9, ...,    0,    0,    0],
       ...,
       [2282,  955,   11, ...,    0,    0,    0],
       [ 269,   11, 1379, ...,    0,    0,    0],
       [1162, 1234,    5, ...,    0,    0,    0]], dtype=int32)

In [6]:
len(df_valid), len(valid_data)
# token的过程可能减少数据

(557254, 494331)

In [7]:
def make_ds(lines, labels):
    ds = tf.data.Dataset.from_tensor_slices((lines, labels))
    ds = ds.shuffle(1000000).repeat()
    return ds

In [10]:
val_code = tokenizer.from_lines_to_token_input(df_valid[df_valid['label']==0]['data'])
val_docs = tokenizer.from_lines_to_token_input(df_valid[df_valid['label']==1]['data'])

In [11]:
len(val_code), len(val_docs)

(315229, 179102)

In [14]:
BATCH_SIZE=32

In [27]:
val_code_ds = make_ds(val_code, np.zeros((len(val_code))))
val_docs_ds = make_ds(val_docs, np.ones((len(val_docs))))
val_ds = tf.data.experimental.sample_from_datasets([val_code_ds, val_docs_ds], weights=[0.5,0.5])
val_ds = val_ds.batch(BATCH_SIZE).prefetch(2)

In [28]:
# val_ds = tf.data.Dataset.from_tensor_slices((valid_data, df_valid['label']))
for line, label in val_ds.take(1):
    print(line.numpy())
    print(label.numpy())
    print(label.numpy().mean())

[[ 1974  1589    11  1589    10   337    82    11     3    10    80  1046
     11     3    10    96   834    11     3    10     0     0     0     0
      0     0     0     0     0     0]
 [    1   132   131  3708   369   132    29    49  7279   839    32  1081
   4913     9     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0]
 [   12  3431    12   293     7    56     8     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0]
 [   60   569  2763 43797   674     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0]
 [ 6659    32    32   131    32    32  1332   448 12181 29477  1999  6039
    729   581    19     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0]
 [  631  1332   415  3638  1332   190  6370  1999 12328     9    

其他集合编码

In [29]:
train_data = tokenizer.from_lines_to_token_input(df_train['data'])
print(len(df_train), len(train_data))
test_data = tokenizer.from_lines_to_token_input(df_test['data'])
print(len(df_test), len(test_data))

KeyboardInterrupt: 

In [30]:
train_code = tokenizer.from_lines_to_token_input(df_train[df_train['label']==0]['data'])
train_docs = tokenizer.from_lines_to_token_input(df_train[df_train['label']==1]['data'])
test_code = tokenizer.from_lines_to_token_input(df_test[df_test['label']==0]['data'])
test_docs = tokenizer.from_lines_to_token_input(df_test[df_test['label']==1]['data'])

In [31]:
len(train_code), len(train_docs), len(test_code), len(test_docs)

(5398611, 2756143, 287562, 154148)

In [32]:
train_code_ds = make_ds(train_code, np.zeros((len(train_code))))
train_docs_ds = make_ds(train_docs, np.ones((len(train_docs))))
train_ds = tf.data.experimental.sample_from_datasets([train_code_ds, train_docs_ds], weights=[0.5,0.5])
train_ds = train_ds.batch(BATCH_SIZE).prefetch(2)

# val_ds = tf.data.Dataset.from_tensor_slices((valid_data, df_valid['label']))
for line, label in train_ds.take(1):
    print(line.numpy())
    print(label.numpy())
    print(label.numpy().mean())

[[ 3468    43  1402  7279    24    39  2494  6365    35    28   691    19
   1332  1419     9     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0]
 [    4    49    50     9   493     7   166   560     9   454    14    45
     15     8   147     1    33    50     9   493     7   166   560     9
    454    14    45    15     8     8]
 [   16   634    24    19   129    49   303   691   107     4    12     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0]
 [   12   292   143    12  1186   369  1332   686   997     3    32     3
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0]
 [    1   714     7  2870  1512  1372     8     1    19   325    60  1118
   9238     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0]
 [   16    97     9   143   150   107     3    12     0     0    

In [33]:
test_code_ds = make_ds(test_code, np.zeros((len(test_code))))
test_docs_ds = make_ds(test_docs, np.ones((len(test_docs))))
test_ds = tf.data.experimental.sample_from_datasets([test_code_ds, test_docs_ds], weights=[0.5,0.5])
test_ds = test_ds.batch(BATCH_SIZE).prefetch(2)

# val_ds = tf.data.Dataset.from_tensor_slices((valid_data, df_valid['label']))
for line, label in train_ds.take(1):
    print(line.numpy())
    print(label.numpy())
    print(label.numpy().mean())

[[ 1313  3137     9 14814 18408     7   166    29  1726    10   104   907
      8     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0]
 [   61   159    12     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0]
 [   17    13     9   367   134  2717   122   458    69     7   134    31
     10    55    47     8     0     0     0     0     0     0     0     0
      0     0     0     0     0     0]
 [   65 29477  1999  1295    60   163   132   902    49    53 29477  1999
   1295    60     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0]
 [    5   287    26     4  1736     5     5   287    26     4   438     6
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0]
 [ 6960   170     1   132  5656  3254    90   227   140     9   4

### 转化

In [12]:
ds_valid = tf.data.Dataset.from_tensor_slices((df_valid['data'], df_valid['label']))
for line, label in ds_valid.take(5):
    print('line: {}, label: {}'.format(line, label))

line: b'remove directory info only', label: 1.0
line: b'return [cls.of_structs(x.value, y.value)', label: 0.0
line: b'function below.', label: 1.0
line: b'elif I_ord == 2:', label: 0.0
line: b'for t in range(len(self.agent))]', label: 0.0


In [14]:
ds_train = tf.data.Dataset.from_tensor_slices((df_train['data'], df_train['label']))
ds_test = tf.data.Dataset.from_tensor_slices((df_test['data'], df_test['label']))

### 批

In [21]:
BATCH_SIZE = 32

### 过采样

In [None]:
code_df = df_train[df_train['label']==0]
docs_df = df_train[df_train['label']==1]
len(code_df), len(docs_df)

In [20]:
def make_ds(lines, labels):
    ds = tf.data.Dataset.from_tensor_slices((lines, labels))
    ds = ds.shuffle(1000000).repeat()
    return ds

code_ds = make_ds(code_df['data'], code_df['label'])
docs_ds = make_ds(docs_df['data'], docs_df['label'])
for line, label in code_ds.take(1):
    print(line.numpy())
    print(label.numpy())
for line, label in docs_ds.take(1):
    print(line.numpy())
    print(label.numpy())

b'else:'
0.0
b'- redis'
1.0


In [23]:
resampled_ds = tf.data.experimental.sample_from_datasets([code_ds, docs_ds], weights=[0.5,0.5])
resampled_ds = resampled_ds.batch(BATCH_SIZE).prefetch(2)
for line, label in resampled_ds.take(1):
    print(line.numpy())
    print(label.numpy())

[b"'self' and 'other'" b'if idx:'
 b'Redirect call to our setup() tap function.'
 b'Since we need to use this sometimes'
 b'dst_dir   -- Optional destination directory to write files to.  If not'
 b'status_delta[x] -= 1' b'define MAVLINK_MSG_ID_${id}_CRC ${crc_extra}'
 b'----------' b'excs.append(straceback())' b'build payload'
 b"'equipotential field lines, in turn alters '" b'else:'
 b'self._in_queue.put(inPacket)' b'err_logfile_writer.close()'
 b'Create the SCOOP module arguments parser.'
 b'tail = [x for x in services if x is not service]'
 b"all cases. Finally, datagrepper turns the 'timestamp' field into a float, but it"
 b"raise TypeError('Impossible to initialise the object from an object of type {}'.format(type(f)))"
 b'.. note:: If ``package_name`` is passed and refers to a namespace'
 b'A parallel algorithm for computing the thickness of 3D objects,'
 b'if mmi_mean[i] <= 7.0:' b'A block to declare self variables'
 b'TODO: maybe we need an explicit marker for "end of stream"'

In [26]:
for _, label in resampled_ds.take(1):
    print(label.numpy().mean())

0.53125


### 训

In [37]:
len(tokenizer.vocab)

50007

In [38]:
def get_model():
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(input_dim=50007, input_length=30, output_dim=200),
        tf.keras.layers.LSTM(200),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(20, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam',
                 loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                 metrics=['accuracy'])
    return model

In [None]:
model = get_model()
model.fit(train_ds, epochs=5, validation_data=val_ds)

Epoch 1/5
 205333/Unknown - 38419s 187ms/step - loss: 0.5648 - accuracy: 0.8683