In [1]:
import tensorflow as tf
from tensorflow.core.example import example_pb2
import struct

In [2]:
import pandas as pd

In [3]:
whole_dataset = pd.read_csv('../wh_data/wikihowAll.csv')

In [4]:
split = 0.7
num_train = int(len(whole_dataset) * split)

In [5]:
train = whole_dataset.loc[:num_train]
val = whole_dataset.loc[num_train:]

In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150756 entries, 0 to 150755
Data columns (total 3 columns):
headline    150126 non-null object
title       150755 non-null object
text        149930 non-null object
dtypes: object(3)
memory usage: 3.5+ MB


In [7]:
val.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64610 entries, 150755 to 215364
Data columns (total 3 columns):
headline    64422 non-null object
title       64610 non-null object
text        64365 non-null object
dtypes: object(3)
memory usage: 1.5+ MB


In [8]:
val.head()

Unnamed: 0,headline,title,text
150755,"\nGo to the AKC marketplace website.,\nSearch ...",How to Find an AKC Puppy1,The AKC marketplace is an online site that co...
150756,"\nInquire at local dog shelters.,\nAsk your ve...",How to Get Help Housebreaking a Dog on a Limit...,Dog shelters are generally kept running by vo...
150757,"\nUse reward-based training.,\nStart training ...",How to Potty Train a Chihuahua1,Dogs make a positive effort to repeat behavio...
150758,\nConsider your lifestyle when choosing a pupp...,How to Find an AKC Puppy2,Many people will get any dog because they are...
150759,"\nContact breeders for future litters.,\nConta...",How to Find an AKC Puppy3,The AKC marketplace provides info for breeder...


In [9]:
TRAIN_PREFIX = "../wh_data/train/train_"
VAL_PREFIX = "../wh_data/val/val_"
CHUNK_SIZE = 1000

SENTENCE_START = '<s>'
SENTENCE_END = '</s>'

def pre_process_frame(frame):
    frame['headline'] = frame['headline'].str.replace('\n', '')
    frame['headline'] = frame['headline'].str.replace('.', ' . ')
    frame['headline'] = frame['headline'].str.replace(',', ' , ')
    frame['headline'] = frame['headline'].str.replace('?', ' ? ')
    frame['headline'] = frame['headline'].str.replace('!', ' ! ')
    frame['headline'] = SENTENCE_START + frame['headline'].str.lower() + SENTENCE_END
    
    frame['text'] = frame['text'].str.replace('\n', '')
    frame['text'] = frame['text'].str.replace('.', ' . ')
    frame['text'] = frame['text'].str.replace(',', ' , ')
    frame['text'] = frame['text'].str.replace('?', ' ? ')
    frame['text'] = frame['text'].str.replace('!', ' ! ')
    frame['text'] = frame['text'].str.lower()
    return frame

def row_to_ex(text, headline):
    tf_example = example_pb2.Example()
    tf_example.features.feature['article'].bytes_list.value.extend([text.encode()])
    tf_example.features.feature['abstract'].bytes_list.value.extend([headline.encode()])
    tf_example_str = tf_example.SerializeToString()
    str_len = len(tf_example_str)
    return tf_example_str, str_len

def write_chunk_to_file(chunk, file):
    with open(file, 'wb') as writer:
        assert len(chunk) != 0
        for idx, row in chunk.iterrows():
            tf_example_str, str_len = row_to_ex(str(row['text']), str(row['headline']))
            writer.write(struct.pack('q', str_len))
            writer.write(struct.pack('%ds' % str_len, tf_example_str))
    
def write_frame_to_file(frame, prefix):
    frame = pre_process_frame(frame)
    start = 0
    end = CHUNK_SIZE
    count = 1
    assert len(frame != 0)
    while start < len(frame):
        end = min(end, len(frame))
        assert start != end
        chunk = frame[start:end]
        assert len(chunk) != 0
        write_chunk_to_file(chunk, prefix + str(count) + ".bin")
        count += 1
        start = end
        end += CHUNK_SIZE
                   

In [10]:
write_frame_to_file(val, VAL_PREFIX)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instea

In [11]:
val[:10]

Unnamed: 0,headline,title,text
150755,"<s>go to the akc marketplace website . , sear...",How to Find an AKC Puppy1,the akc marketplace is an online site that co...
150756,"<s>inquire at local dog shelters . , ask your...",How to Get Help Housebreaking a Dog on a Limit...,dog shelters are generally kept running by vo...
150757,"<s>use reward-based training . , start traini...",How to Potty Train a Chihuahua1,dogs make a positive effort to repeat behavio...
150758,<s>consider your lifestyle when choosing a pup...,How to Find an AKC Puppy2,many people will get any dog because they are...
150759,"<s>contact breeders for future litters . , co...",How to Find an AKC Puppy3,the akc marketplace provides info for breeder...
150760,"<s>introduce the collar . , try putting the c...",How to Lead Train a Dog or Puppy,"if the dog sniffs it or shows any curiosity ,..."
150761,<s>cut the velcro in half so that you have 2 l...,"How to Make a Belly Band for a Male Dog That ""...","; , , , , , ,"
150762,"<s>make it a family effort . , ask responsibl...",How to Get Help Housebreaking a Dog on a Limit...,"if this is a family dog , make sure everyone..."
150763,"<s>take the dog out often . , take the dog ou...",How to Potty Train a Chihuahua2,give the chihuahua (adult or puppy) plenty of...
150764,"<s>keep an eye on chihuahua in the house . , ...",How to Potty Train a Chihuahua3,"for speedy training , be vigilant and spot s..."
