In [2]:
from __future__ import absolute_import, division, print_function, unicode_literals

import os
import sys
import pickle

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [3]:
def fetch_data_train(path='DATA', n_per_class=80000, test_size=0.2,
                     random_state=None):
    """Load `train.csv` and return a class-balanced, shuffled train/test split.

    Up to `n_per_class` rows with ``target == 1`` and up to `n_per_class`
    rows with ``target == 0`` are taken (in file order), concatenated,
    shuffled, and then split.

    Args:
        path: Directory that contains `train.csv`.
        n_per_class: Maximum number of rows kept for each target value.
        test_size: Forwarded to `train_test_split` as the held-out share.
        random_state: Seed used for both the shuffle and the split. The
            default `None` keeps the original non-deterministic behaviour;
            pass an int to make the result reproducible.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``. The `X_*` values are
        DataFrames holding every column except `qid` and `target`; the
        `y_*` values are the matching `target` Series.
    """
    data = pd.read_csv(os.path.join(path, 'train.csv'))
    data_positive = data.loc[data['target'] == 1][:n_per_class]
    data_negative = data.loc[data['target'] == 0][:n_per_class]

    # `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the
    # supported equivalent and works on older pandas versions as well.
    data = pd.concat([data_positive, data_negative],
                     ignore_index=True, sort=False)
    # Shuffle so the two classes are interleaved before splitting.
    data = data.sample(frac=1, random_state=random_state).reset_index(drop=True)

    X = data.drop(['qid', 'target'], axis=1)
    y = data['target']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    return X_train, X_test, y_train, y_test

def fetch_data_test(path='DATA'):
    """Load the full training set and the test set as NumPy arrays.

    Args:
        path: Directory that contains `train.csv` and `test.csv`.

    Returns:
        Tuple ``(X_train, X_test, y_train)``:
        `X_train` holds every `train.csv` column except `qid` and `target`,
        `X_test` holds every `test.csv` column except `qid`, and
        `y_train` holds the `target` column of `train.csv`.
    """
    train = pd.read_csv(os.path.join(path, 'train.csv'))
    test = pd.read_csv(os.path.join(path, 'test.csv'))

    X_train = train.drop(['qid', 'target'], axis=1).values
    y_train = train['target'].values
    # Build the test features from `test`, not `train`: the previous version
    # returned the training rows (including their `target` column) here.
    X_test = test.drop(['qid'], axis=1).values

    return X_train, X_test, y_train

In [4]:
# Load the balanced, shuffled split that the rest of the notebook works on.
(X_train, X_test,
 y_train, y_test) = fetch_data_train()

In [5]:
# Peek at the first rows of the training features.
X_train.head(n=5)

Unnamed: 0,question_text
38442,How can many Floridians deny the existence of ...
7256,Where is the π?
7041,How can I compose piano music?
52765,Why can't all people who go to space be called...
35343,What taxes are paid when dining at restaurants...


## Text Generation using RNN

In [6]:
# Concatenate every training question into one corpus, each question
# terminated by a newline. A single join over the column replaces the
# row-wise `iterrows()` loop with repeated `+=`, which rebuilt the growing
# string on every row; the resulting `text` is identical.
text = ''.join(question + '\n' for question in X_train['question_text'])

In [7]:
# Show the first question, i.e. everything before the first newline.
text.partition('\n')[0]

'How can many Floridians deny the existence of climate change when Miami is literally flooding because of it?'

In [8]:
# The vocabulary is the sorted list of distinct characters in the corpus.
vocab = list(set(text))
vocab.sort()
print("There are {} unique symbols in the text".format(len(vocab)))

There are 501 unique symbols in the text


In [11]:
# Character -> integer id lookup; ids follow the order of `vocab`.
char2idx = dict(zip(vocab, range(len(vocab))))
# Inverse lookup: position k holds the character whose id is k.
idx2char = np.array(vocab)

# Encode the whole corpus as an array of character ids.
text_as_int = np.array(list(map(char2idx.__getitem__, text)))

In [12]:
# Pretty-print the first 20 entries of the character -> id table.
print('{')
for char in list(char2idx)[:20]:
    print('  {:4s}: {:3d},'.format(repr(char), char2idx[char]))
print('  ...\n}')

{
  '\n':   0,
  '\x10':   1,
  ' ' :   2,
  '!' :   3,
  '"' :   4,
  '#' :   5,
  '$' :   6,
  '%' :   7,
  '&' :   8,
  "'" :   9,
  '(' :  10,
  ')' :  11,
  '*' :  12,
  '+' :  13,
  ',' :  14,
  '-' :  15,
  '.' :  16,
  '/' :  17,
  '0' :  18,
  '1' :  19,
  ...
}


In [14]:
# Show the first 12 characters next to their integer encoding.
print(
    '{} ---- characters mapped to int ---- > {}'.format(
        repr(text[:12]),
        text_as_int[:12],
    )
)

'How can many' ---- characters mapped to int ---- > [42 81 89  2 69 67 80  2 79 67 80 91]


In [27]:
# Number of input characters per training example; each chunk cut from the
# corpus is one character longer than this.
seq_length = 16
examples_per_epoch = len(text) // (seq_length + 1)

# Stream the encoded corpus one character id at a time.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Sanity check: decode the first 12 ids back into characters.
for char_id in char_dataset.take(12):
    print(idx2char[char_id.numpy()])

H
o
w
 
c
a
n
 
m
a
n
y


In [28]:
# Cut the character stream into chunks of seq_length + 1 ids; a trailing
# chunk that would be shorter is dropped.
sequences = char_dataset.batch(batch_size=seq_length + 1, drop_remainder=True)

# Decode the first five chunks to check the chunking.
for chunk in sequences.take(5):
    decoded = ''.join(idx2char[chunk.numpy()])
    print(repr(decoded))

'How can many Flor'
'idians deny the e'
'xistence of clima'
'te change when Mi'
'ami is literally '


In [29]:
def split_input_target(chunk):
    """Split a chunk into an (input, target) pair offset by one position.

    The input is `chunk` without its last element and the target is `chunk`
    without its first element, so target[k] is the element that follows
    input[k] in the original chunk.
    """
    return chunk[:-1], chunk[1:]

# Turn every chunk into an (input, target) pair.
dataset = sequences.map(map_func=split_input_target)

In [30]:
# Decode the first (input, target) pair. `input_example` and
# `target_example` stay bound after the loop and are reused by the next cell.
for input_example, target_example in dataset.take(1):
    for label, example in (('Input data: ', input_example),
                           ('Target data:', target_example)):
        print(label, repr(''.join(idx2char[example.numpy()])))

Input data:  'How can many Flo'
Target data: 'ow can many Flor'


In [31]:
# Walk through the first five positions of the example pair, showing which
# character the model is given and which one it is expected to produce.
for i, input_idx, target_idx in zip(range(5), input_example[:5], target_example[:5]):
    print("Step {:4d}".format(i))
    input_char = repr(idx2char[input_idx])
    print("  input: {} ({:s})".format(input_idx, input_char))
    target_char = repr(idx2char[target_idx])
    print("  expected output: {} ({:s})".format(target_idx, target_char))

Step    0
  input: 42 ('H')
  expected output: 81 ('o')
Step    1
  input: 81 ('o')
  expected output: 89 ('w')
Step    2
  input: 89 ('w')
  expected output: 2 (' ')
Step    3
  input: 2 (' ')
  expected output: 69 ('c')
Step    4
  input: 69 ('c')
  expected output: 67 ('a')
