In [2]:
import pandas as pd 
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

In [3]:
imdb = pd.read_csv("./data/IMDB_dataset.csv")
imdb

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


### word-level tokenize

In [4]:
tokenizer = Tokenizer()

tokenizer.fit_on_texts(list(imdb["review"]))
temp = tokenizer.texts_to_sequences(list(imdb["review"]))
word_engineered = pad_sequences(temp, maxlen=200, padding='post')

In [5]:
word_engineered

array([[   32,  4636,  2468, ...,   125,  4103,   486],
       [    3,   393,   120, ...,     0,     0,     0],
       [   10,   190,    11, ...,     0,     0,     0],
       ...,
       [    1, 17184,   519, ..., 22840,     2,  6050],
       [   16, 11491,    20, ...,    67,   739,    42],
       [   54,    27,  5892, ...,     0,     0,     0]], dtype=int32)

In [6]:
word_engineered[2]

array([   10,   190,    11,    13,     3,   393,    95,     5,  1155,
          55,    20,     3,    96,   879,  1494,  2657,  1241,     8,
           1,   903, 16061,   769,     2,   147,     3,   649,  2333,
         202,     1,   111,     6,  4086,    18,     1,   410,     6,
        1915,     2,     1,   102,    23,  1485,    57,     1,    69,
        6126,  6686,  1574,   494,   136,    47,   200,    26,   680,
          50,    33,   959,    11,     6,    21,  1026,   220,   230,
        2917,  5196,    10,   190,     9,    13,  3059,    12,  2971,
        2050,     6,   130,  1403,     8,  1139,     4,     1,   396,
         106,     4,   176,    25,  2079,     5,   112,     7,     7,
          11,    13,     1,    88,   482,  1451,    30,    27,     4,
       19396,  1317,     8,   153,  3024,    10,   131,     3,  2121,
         136,   198,   110,    75,  1508,    16,  8927, 38711,     8,
          11,    59,  1323,     5,  1260,   175,    40,  1257,  1428,
           2,  5078,

### char-level tokenize 

In [7]:
char_tokenizer = Tokenizer(char_level=True, oov_token="<UNK>")
char_tokenizer.fit_on_texts(list(imdb["review"]))
temp = char_tokenizer.texts_to_sequences(list(imdb["review"]))
char_engineered = pad_sequences(temp, maxlen=200, padding="post")

### write into tfrecord format 

tfrecord 라는 것은 데이터(=tensor)를 저장하기 위한 고유의 텐서 흐름 이진형식이다.       


In [39]:

tfsample_path = "./data/number.tfrecord"
with tf.io.TFRecordWriter(tfsample_path) as file_writer:
    for _ in range(4):
        x, y = np.random.random(), np.random.random()
        record_bytes = tf.train.Example(features=tf.train.Features(feature={
            "x": tf.train.Feature(float_list=tf.train.FloatList(value=[x])),
            "y": tf.train.Feature(float_list=tf.train.FloatList(value=[y])),
        })).SerializeToString()
    
        file_writer.write(record_bytes)

In [44]:
# tfrecord 읽어보기 
# novdov.github.io 참고 


num_sample = tf.data.TFRecordDataset(tfsample_path)
for aRecord in num_sample:
    example = tf.train.Example()
    example.ParseFromString(aRecord.numpy())
    print(example)

features {
  feature {
    key: "x"
    value {
      float_list {
        value: 0.6362333297729492
      }
    }
  }
  feature {
    key: "y"
    value {
      float_list {
        value: 0.4968804121017456
      }
    }
  }
}

features {
  feature {
    key: "x"
    value {
      float_list {
        value: 0.0618320032954216
      }
    }
  }
  feature {
    key: "y"
    value {
      float_list {
        value: 0.9925841093063354
      }
    }
  }
}

features {
  feature {
    key: "x"
    value {
      float_list {
        value: 0.3288733661174774
      }
    }
  }
  feature {
    key: "y"
    value {
      float_list {
        value: 0.5355156660079956
      }
    }
  }
}

features {
  feature {
    key: "x"
    value {
      float_list {
        value: 0.5774037837982178
      }
    }
  }
  feature {
    key: "y"
    value {
      float_list {
        value: 0.5665009021759033
      }
    }
  }
}



In [9]:
labels = [1 if x == "positive" else 0 for x in imdb["sentiment"]]

example_path = "./data/imdb_tfrecord.tfrecord"
with tf.io.TFRecordWriter(example_path) as file_writer:
    for idx in range(word_engineered.shape[0]):
        wordSeq = word_engineered[idx]
        sentence = imdb["review"][idx]
        label = labels[idx]
        
        record_byte = tf.train.Example(features = tf.train.Features(feature={
            "wordSeq": tf.train.Feature(int64_list = tf.train.Int64List(value=wordSeq)), 
            "sentence": tf.train.Feature(bytes_list = tf.train.BytesList(value=[sentence])), 
            "label": tf.train.Feature(int64_list = tf.train.Int64List(value=[label]))
        })).SerializeToString()
        file_writer.write(record_byte)
        

TypeError: "One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. Th has type str, but expected one of: bytes

In [12]:
type(imdb["review"][2])

str