In [31]:
import json
import pandas as pd
from nltk.tokenize import word_tokenize

In [32]:
# Path to the JSONL dataset file (one JSON object per line)
jsonl_file_path = "NQ-open.dev.jsonl"

# Parsed records, in file order
dataset = []

with open(jsonl_file_path, "r", encoding="utf-8") as file:
    for line in file:
        # Skip whitespace-only lines (e.g. a trailing empty line at the end of
        # the file); json.loads would raise JSONDecodeError on them.
        if not line.strip():
            continue
        record = json.loads(line)
        dataset.append(record)

In [33]:
# Peek at the first five parsed records to check the structure of each entry
print(dataset[:5])

[{'question': 'when was the last time anyone was on the moon', 'answer': ['14 December 1972 UTC', 'December 1972']}, {'question': "who wrote he ain't heavy he's my brother lyrics", 'answer': ['Bobby Scott', 'Bob Russell']}, {'question': 'how many seasons of the bastard executioner are there', 'answer': ['one', 'one season']}, {'question': 'when did the eagles win last super bowl', 'answer': ['2017']}, {'question': "who won last year's ncaa women's basketball", 'answer': ['South Carolina']}]


In [34]:
# Build a DataFrame from the list of parsed records: one row per record,
# with the record keys as columns
df = pd.DataFrame(dataset)

In [35]:
# Display the first 10 rows
df.head(10)

Unnamed: 0,question,answer
0,when was the last time anyone was on the moon,"[14 December 1972 UTC, December 1972]"
1,who wrote he ain't heavy he's my brother lyrics,"[Bobby Scott, Bob Russell]"
2,how many seasons of the bastard executioner ar...,"[one, one season]"
3,when did the eagles win last super bowl,[2017]
4,who won last year's ncaa women's basketball,[South Carolina]
5,when did the isle of wight become an island,[During the last Ice Age]
6,love yourself by justin bieber is about who,[Rihanna]
7,who was the ruler of england in 1616,[James I]
8,what is the hot coffee mod in san andreas,[a normally inaccessible mini-game]
9,what is the maximum data rate for the 802.11a ...,[54 Mbit/s]


In [44]:
# (number of rows, number of columns)
df.shape

(3610, 2)

In [45]:
# Column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3610 entries, 0 to 3609
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   question  3610 non-null   object
 1   answer    3610 non-null   object
dtypes: object(2)
memory usage: 56.5+ KB


In [46]:
# Summary statistics per column (count / unique / top / freq for these object columns)
df.describe()

Unnamed: 0,question,answer
count,3610,3610
unique,3610,3403
top,when was the last time anyone was on the moon,[2018]
freq,1,13


In [47]:
# Number of occurrences of each distinct question
df["question"].value_counts()

question
when was the last time anyone was on the moon      1
what is the name of a camel with 2 humps           1
when did the first pokemon game come out           1
who plays jill bigelow in line of duty             1
what do you say when you win bingo                 1
                                                  ..
who wrote the theme song for mission impossible    1
who wrote the theme to last of the mohicans        1
when did day light savings start in the us         1
what does the m number mean on a pint glass        1
what is the meaning of the name comanche           1
Name: count, Length: 3610, dtype: int64

In [48]:
# Number of occurrences of each distinct answer list
df["answer"].value_counts()

answer
[2018]                                         13
[2017]                                          8
[1989]                                          6
[10]                                            6
[four]                                          6
                                               ..
[in the 1970s]                                  1
[Return of the Jedi, In Return of the Jedi]     1
[Since 1940, 1940]                              1
[southern Anatolia, in southern Anatolia]       1
[enemy]                                         1
Name: count, Length: 3403, dtype: int64

In [49]:
# Exploring the Question and Answer columns
def _length_summary(lengths):
    """Return ``(minimum, maximum, mean)`` of a list of token counts, or zeros if it is empty."""
    if not lengths:
        # min()/max() raise ValueError on an empty sequence; report 0 instead,
        # matching the fallback used for the average.
        return 0, 0, 0
    return min(lengths), max(lengths), sum(lengths) / len(lengths)


def questions_answers_stats(df, tokenizer=None):
    """Print token-length statistics for the ``question`` and ``answer`` columns.

    Question lengths are measured per row. Answer lengths are measured per
    individual answer string after flattening every row's answer collection;
    empty (falsy) answers are ignored.

    Args:
        df: DataFrame with a ``question`` column of strings and an ``answer``
            column whose cells are iterables of answer strings.
        tokenizer: Optional callable mapping a string to a sequence of tokens.
            When omitted, the ``word_tokenize`` function imported by this
            notebook is used.

    Returns:
        None. The statistics are written to standard output. When there are
        no questions (or no non-empty answers), the corresponding minimum,
        maximum and average are reported as 0 instead of raising.
    """
    tokenize = word_tokenize if tokenizer is None else tokenizer

    # Number of examples
    num_examples = len(df)

    # Question statistics: one token count per row
    question_lengths = [len(tokenize(question)) for question in df['question']]
    min_question_length, max_question_length, avg_question_length = _length_summary(question_lengths)

    # Answer statistics: flatten the per-row answer collections, skipping empty answers
    answer_lengths = [
        len(tokenize(answer))
        for answers in df['answer']
        for answer in answers
        if answer
    ]
    min_answer_length, max_answer_length, avg_answer_length = _length_summary(answer_lengths)

    # Print statistics
    print("Number of examples:", num_examples)
    print("Question Statistics:")
    print("  Minimum question length:", min_question_length)
    print("  Maximum question length:", max_question_length)
    print("  Average question length:", avg_question_length)
    print("Answer Statistics:")
    print("  Minimum answer length:", min_answer_length)
    print("  Maximum answer length:", max_answer_length)
    print("  Average answer length:", avg_answer_length)

In [50]:
# Compute and print basic length statistics for the questions and answers in df
print("Basic Statistics:")
questions_answers_stats(df)

Basic Statistics:
Number of examples: 3610
Question Statistics:
  Minimum question length: 4
  Maximum question length: 22
  Average question length: 9.229085872576178
Answer Statistics:
  Minimum answer length: 1
  Maximum answer length: 6
  Average answer length: 2.280893682588598
