# Convert SemEval-2017 Dataset from JSONLines to TFRecord

In [7]:
import pandas as pd
import tensorflow as tf

## Loading Data to Pandas
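This code block loads the data from a JSONLines file into a pandas DataFrame, filters out data points that have no text (rows whose `error` field is not null), and splits the remaining rows into train, validation (`dev`), and test partitions.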

In [8]:
# Read the JSONLines file (one JSON object per line) and keep only the rows
# whose 'error' column is null.
data = pd.read_json('../data/SemEval2017.json', lines=True).loc[
    lambda frame: frame['error'].isna()
]

# Split the filtered frame by the value of its 'partition' column.
train_data, val_data, test_data = (
    data[data['partition'] == split_name]
    for split_name in ("train", "dev", "test")
)

## Creating TFRecord ProtoBuf

In [9]:
def record_to_example(record) -> tf.train.Example:
    """Build a ``tf.train.Example`` from a single record.

    Args:
        record: Object indexable by the keys ``'text'`` and ``'sentiment'``
            (e.g. a DataFrame row); both values must provide ``.encode()``.

    Returns:
        An Example holding two single-element bytes features: ``'text'``
        (from ``record['text']``) and ``'label'`` (from
        ``record['sentiment']``), each encoded with ``str.encode()``'s
        default encoding.
    """
    def _bytes_feature(value) -> tf.train.Feature:
        # Wrap one encoded string in a single-element BytesList feature.
        return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value.encode()]))

    feature_map = {
        'text': _bytes_feature(record['text']),
        'label': _bytes_feature(record['sentiment']),
    }
    return tf.train.Example(features=tf.train.Features(feature=feature_map))

## Converting Rows in DataFrame to TF Example and Writing to TFRecord File
### Train Data

In [10]:
# Serialize every training row to a tf.train.Example and append it to the
# train TFRecord file; rows are converted lazily, one at a time, as written.
with tf.io.TFRecordWriter(path="../data/SemEval2017/train.tfrecords") as writer:
    serialized_rows = (
        record_to_example(row).SerializeToString()
        for _row_index, row in train_data.iterrows()
    )
    for payload in serialized_rows:
        writer.write(payload)

### Validation Data

In [11]:
# Serialize every validation row to a tf.train.Example and append it to the
# validation TFRecord file; rows are converted lazily, one at a time, as written.
with tf.io.TFRecordWriter(path="../data/SemEval2017/validation.tfrecords") as writer:
    serialized_rows = (
        record_to_example(row).SerializeToString()
        for _row_index, row in val_data.iterrows()
    )
    for payload in serialized_rows:
        writer.write(payload)

### Test Data

In [12]:
# Serialize every test row to a tf.train.Example and append it to the
# test TFRecord file; rows are converted lazily, one at a time, as written.
with tf.io.TFRecordWriter(path="../data/SemEval2017/test.tfrecords") as writer:
    serialized_rows = (
        record_to_example(row).SerializeToString()
        for _row_index, row in test_data.iterrows()
    )
    for payload in serialized_rows:
        writer.write(payload)