In [5]:
import numpy as np
import pandas as pd
import os

from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [4]:
dataset_path = r'datasets/stanford_qa'

train_file = 'train-v1.1.json'
dev_file = 'dev-v1.1.json'

In [7]:
def squad_json_to_dataframe_train(input_file_path, record_path = ['data','paragraphs','qas','answers'],
                           verbose = 1):
    """
    input_file_path: path to the squad json file.
    record_path: path to deepest level in json file default value is
    ['data','paragraphs','qas','answers']
    verbose: 0 to suppress it default is 1
    """
    if verbose:
        print("Reading the json file")    
    file = json.loads(open(input_file_path).read())
    if verbose:
        print("processing...")
    # parsing different level's in the json file
    js = pd.io.json.json_normalize(file , record_path )
    m = pd.io.json.json_normalize(file, record_path[:-1] )
    r = pd.io.json.json_normalize(file,record_path[:-2])
    
    #combining it into single dataframe
    idx = np.repeat(r['context'].values, r.qas.str.len())
    ndx  = np.repeat(m['id'].values,m['answers'].str.len())
    m['context'] = idx
    js['q_idx'] = ndx
    main = pd.concat([ m[['id','question','context']].set_index('id'),js.set_index('q_idx')],1,sort=False).reset_index()
    main['c_id'] = main['context'].factorize()[0]
    if verbose:
        print("shape of the dataframe is {}".format(main.shape))
        print("Done")
    return main

def squad_json_to_dataframe_dev(input_file_path, record_path = ['data','paragraphs','qas','answers'],
                           verbose = 1):
    """
    input_file_path: path to the squad json file.
    record_path: path to deepest level in json file default value is
    ['data','paragraphs','qas','answers']
    verbose: 0 to suppress it default is 1
    """
    if verbose:
        print("Reading the json file")    
    file = json.loads(open(input_file_path).read())
    if verbose:
        print("processing...")
    # parsing different level's in the json file
    js = pd.io.json.json_normalize(file , record_path )
    m = pd.io.json.json_normalize(file, record_path[:-1] )
    r = pd.io.json.json_normalize(file,record_path[:-2])
    
    #combining it into single dataframe
    idx = np.repeat(r['context'].values, r.qas.str.len())
#     ndx  = np.repeat(m['id'].values,m['answers'].str.len())
    m['context'] = idx
#     js['q_idx'] = ndx
    main = m[['id','question','context','answers']].set_index('id').reset_index()
    main['c_id'] = main['context'].factorize()[0]
    if verbose:
        print("shape of the dataframe is {}".format(main.shape))
        print("Done")
    return main

In [10]:
input_file_path = os.path.join(dataset_path, train_file)
record_path = ['data','paragraphs','qas','answers']
train = squad_json_to_dataframe_train(input_file_path=input_file_path,record_path=record_path)

input_file_path = os.path.join(dataset_path, dev_file)
record_path = ['data','paragraphs','qas','answers']
verbose = 0
dev = squad_json_to_dataframe_dev(input_file_path=input_file_path,record_path=record_path)

Reading the json file
processing...


  from ipykernel import kernelapp as app
  app.launch_new_instance()


shape of the dataframe is (87599, 6)
Done
Reading the json file
processing...




shape of the dataframe is (10570, 5)
Done




In [11]:
train.head()

Unnamed: 0,index,question,context,answer_start,text,c_id
0,5733be284776f41900661182,To whom did the Virgin Mary allegedly appear i...,"Architecturally, the school has a Catholic cha...",515,Saint Bernadette Soubirous,0
1,5733be284776f4190066117f,What is in front of the Notre Dame Main Building?,"Architecturally, the school has a Catholic cha...",188,a copper statue of Christ,0
2,5733be284776f41900661180,The Basilica of the Sacred heart at Notre Dame...,"Architecturally, the school has a Catholic cha...",279,the Main Building,0
3,5733be284776f41900661181,What is the Grotto at Notre Dame?,"Architecturally, the school has a Catholic cha...",381,a Marian place of prayer and reflection,0
4,5733be284776f4190066117e,What sits on top of the Main Building at Notre...,"Architecturally, the school has a Catholic cha...",92,a golden statue of the Virgin Mary,0


In [12]:
train.shape

(87599, 6)

In [13]:
import random

In [20]:
def get_random_qa_pair(df):
    rand_idx = random.randint(0, df.shape[0])
    question = df['question'].values[rand_idx]
    answer = df['text'].values[rand_idx]
    return question, answer
    

In [21]:
def print_qa(question, answer):
    print("The question is:")
    print(question)
    print()
    print("And the answer is:")
    print(answer)
    print()

In [22]:
q, a = get_random_qa_pair(train)

In [23]:
print_qa(q, a)

The question is:
What was the name of the 360's video store service?

And the answer is:
Xbox Video Marketplace



In [None]:
X, y = 

In [24]:
tokenizer = Tokenizer()

In [25]:
tokenizer.fit_on_texts(x_train)

NameError: name 'toknizer' is not defined

In [29]:
dev['answers']

0        [{'answer_start': 177, 'text': 'Denver Broncos...
1        [{'answer_start': 249, 'text': 'Carolina Panth...
2        [{'answer_start': 403, 'text': 'Santa Clara, C...
3        [{'answer_start': 177, 'text': 'Denver Broncos...
4        [{'answer_start': 488, 'text': 'gold'}, {'answ...
                               ...                        
10565    [{'answer_start': 82, 'text': 'kilogram-force'...
10566    [{'answer_start': 114, 'text': 'kilopond'}, {'...
10567    [{'answer_start': 274, 'text': 'slug'}, {'answ...
10568    [{'answer_start': 712, 'text': 'kip'}, {'answe...
10569    [{'answer_start': 665, 'text': 'sthène'}, {'an...
Name: answers, Length: 10570, dtype: object

In [None]:
x_train, x_test, y_train, y_test = train_test_split()

In [34]:
dev['answers'].values[25]

[{'answer_start': 116, 'text': '2015'},
 {'answer_start': 112, 'text': 'the 2015 season'},
 {'answer_start': 116, 'text': '2015'}]

In [None]:
tokenizer.fit_on_texts()