# NLP Project Q/A System

Author: Taniya Riar 

In [1]:
#importing necessary libraries
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.tag import pos_tag
from nltk import ne_chunk
import glob
import math
import os
from nltk.stem import WordNetLemmatizer
import json
import codecs
from nltk.tree import Tree
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import spacy
from heapq import nlargest 
from nltk.corpus import wordnet as wn
import string
import re
nlp = spacy.load('en_core_web_lg')

# TASK 1

In [2]:
def extract_words(text,fname=""):
    stop_words = set(stopwords.words('english'))
    words = []
    doc = nlp(text)
    for token in doc:
        words.append(token.text)
    non_stop_sentence = [w for w in words if not w in stop_words] 
    return(non_stop_sentence)

In [3]:
def extract_lemma(text):
    lemma_words =[]
    doc = nlp(text)
    for token in doc:
        lemma_words.append(token.lemma_)
    return(lemma_words)

In [4]:
def extract_pos(text):
    POS_list = []
    doc = nlp(text)
    for token in doc:
        POS_list.append(str(token.text+'_'+token.tag_))
    return POS_list

In [5]:
def extract_ne(text):
    NER_list = []
    doc = nlp(text)
    for token in doc.ents:
            NER_list.append(token.text+"_"+token.label_)
    return(NER_list)

In [6]:
def extract_parse_tree(text):
    parse_tree = []
    doc =nlp(text)
    for token in doc:
        parse_tree.append(str(token.text+"_"+token.dep_))
    return(parse_tree)

In [7]:
#implement relation extraction from wordnet
def extract_relations(word_score):
    word_relations_feature = {}
    for key in word_score:
        syn = wn.synsets(key)
        r_j = []
        if len(syn) != 0:
            for s in syn:
                synonyms = s.lemma_names()
                definition = s.definition()
                hyponyms = sorted([lemma.name() for synset in s.hyponyms() for lemma in synset.lemmas()])
                hypernyms = [lemma.name() for synset in  s.hypernyms() for lemma in synset.lemmas()]
                meronyms = s.part_meronyms()
                holonyms = s.part_holonyms()
                r_j.append({'synset':s,'relations':{'def':definition,'syn':synonyms,'hypo':hyponyms,'hyper':hypernyms,'mero':meronyms,'holo':holonyms}})
        else:
            word_relations_feature[key] = []
        word_relations_feature[key] = r_j         
    return word_relations_feature    
    

In [8]:
from nltk.tokenize import sent_tokenize
import time
start = time.time()
path = input("Enter Wikipedia Doc directory")
if path == "":
    path = "<give some default path name>"
    
os.chdir(path)

data_text_1 = []
corpus_lemma =[]
for filename in glob.glob("*.txt"):
    sentences=[]
    doc = {}
    if filename != "MelindaGates.txt":
        f = codecs.open(filename,'r','utf-8-sig') 
    else:
        f = codecs.open(filename,'r','ISO-8859-1')
    file = f.read()
    print("File processing -------> ",filename)
    for l in sent_tokenize(file):
        if l.strip():
            
            w_j_obj= extract_words(l,filename)
            
            l_j_obj= extract_lemma(l)
            
            p_j_obj= extract_pos(l)
            
            ne_j_obj= extract_ne(l)
            
            dp_j_obj = extract_parse_tree(l)
            
            sentences.append({"sentence":l,"lemma":l_j_obj,"pos":p_j_obj,"ner": ne_j_obj,"parse_tree":dp_j_obj})
           
            corpus_lemma.extend(l_j_obj)
    doc = {'doc':filename,'sentences':sentences}
    print("Number of Lines in file ----> ",len(sent_tokenize(file)))
    data_text_1.append(doc)
end = time.time()
print(end - start)

Enter Wikipedia Doc directory
File processing ------->  AbrahamLincoln.txt
Number of Lines in file ---->  651
File processing ------->  Amazon_com.txt
Number of Lines in file ---->  252
File processing ------->  AppleInc.txt
Number of Lines in file ---->  490
File processing ------->  AT_T.txt
Number of Lines in file ---->  161
File processing ------->  Berkshire_Hathaway.txt
Number of Lines in file ---->  234
File processing ------->  China.txt
Number of Lines in file ---->  657
File processing ------->  CitiGroup.txt
Number of Lines in file ---->  331
File processing ------->  Dallas.txt
Number of Lines in file ---->  567
File processing ------->  ElonMusk.txt
Number of Lines in file ---->  166
File processing ------->  Europe.txt
Number of Lines in file ---->  471
File processing ------->  ExxonMobil.txt
Number of Lines in file ---->  307
File processing ------->  GeorgeWashington.txt
Number of Lines in file ---->  630
File processing ------->  IBM.txt
Number of Lines in file ----> 

In [9]:
try:
    os.mkdir('result_files')
except:
    print("Directory Already Exists")
    
with open('result_files/corpus.json', 'w') as corpusfile:  
    json.dump(data_text_1, corpusfile)
print("Corpus File path -> "+path+'/result_files/corpus.json')

Directory Already Exists
Corpus File path -> C:/Users/taniy/Desktop/Spring19/CS6320-NLP/Project/WikipediaArticles/WikipediaArticles/result_files/corpus.json


<b>Fetching relations for all the lemmas in the corpus

In [10]:
unique_lemmas = set(corpus_lemma)
print(len(unique_lemmas))
unique_word_relations = extract_relations(unique_lemmas)
print(len(unique_word_relations))

21949
21949


In [11]:
#TO show to TA
print(unique_word_relations['headquarters'])

[{'synset': Synset('headquarters.n.01'), 'relations': {'hypo': ['mukataa'], 'syn': ['headquarters', 'central_office', 'main_office', 'home_office', 'home_base'], 'hyper': ['office', 'business_office'], 'mero': [], 'def': '(usually plural) the office that serves as the administrative center of an enterprise', 'holo': []}}, {'synset': Synset('headquarters.n.02'), 'relations': {'hypo': ['GHQ', 'command_post', 'general_headquarters', 'guardhouse'], 'syn': ['headquarters', 'HQ', 'military_headquarters'], 'hyper': ['military_installation'], 'mero': [], 'def': 'the military installation from which a commander performs the functions of command', 'holo': []}}, {'synset': Synset('headquarters.n.03'), 'relations': {'hypo': ['ACE', 'ACLANT', 'Allied_Command_Atlantic', 'Allied_Command_Europe'], 'syn': ['headquarters'], 'hyper': ['military_unit', 'military_force', 'military_group', 'force'], 'mero': [Synset('headquarters_staff.n.01')], 'def': '(plural) a military unit consisting of a commander and t

In [22]:
len(data_text_1)

30

In [23]:
print(data_text_1[0]['sentences'][0])

{'sentence': 'Abraham Thomas Lincoln (February 12, 1809 – April 15, 1865) was an American statesman, politician, and lawyer who served as the 16th president of the United States from 1861 until his assassination in April 1865.', 'ner': ['Abraham Thomas Lincoln_PERSON', 'February 12, 1809_DATE', '– April 15, 1865_DATE', 'American_NORP', '16th_ORDINAL', 'the United States_GPE', '1861_DATE', 'April 1865_DATE'], 'pos': ['Abraham_NNP', 'Thomas_NNP', 'Lincoln_NNP', '(_-LRB-', 'February_NNP', '12_CD', ',_,', '1809_CD', '–_:', 'April_NNP', '15_CD', ',_,', '1865_CD', ')_-RRB-', 'was_VBD', 'an_DT', 'American_JJ', 'statesman_NN', ',_,', 'politician_NN', ',_,', 'and_CC', 'lawyer_NN', 'who_WP', 'served_VBD', 'as_IN', 'the_DT', '16th_JJ', 'president_NN', 'of_IN', 'the_DT', 'United_NNP', 'States_NNP', 'from_IN', '1861_CD', 'until_IN', 'his_PRP$', 'assassination_NN', 'in_IN', 'April_NNP', '1865_CD', '._.'], 'parse_tree': ['Abraham_compound', 'Thomas_compound', 'Lincoln_nsubj', '(_punct', 'February_npa

In [12]:
len(data_text_1[0]['sentences'])

651

# TF- IDF 
The idea of tf-df was taken from:

<b> S. Sareen and S. Sareen, “Process Text using TFIDF in Python,” Towards Data Science, 07-Aug-2018. [Online]. Available: https://towardsdatascience.com/tfidf-for-piece-of-text-in-python-43feccaa74f8. [Accessed: 08-May-2019]. <b>

tf = (frequency of term in the doc/total number of terms in the doc) 

idf = ln(total number of docs/number of docs with term in it)



In [1]:
def count_words(text):
    w = extract_words(text)
    return (len(w))

In [14]:
corpus_words = []
path = input("Enter Wikipedia Doc directory")
if path == "":
    path = "<give some default path name>"
    
os.chdir(path)

for filename in glob.glob("*.txt"):
    if filename != "MelindaGates.txt":
        f = codecs.open(filename,'r','utf-8-sig')
    else:
        f = codecs.open(filename,'r','ISO-8859-1')
    file = f.read()
    len_w = count_words(file)
    corpus_words.append({'doc':filename,'count':len_w,'text':file})

Enter Wikipedia Doc directory


In [15]:
def word_frequency(text,doc):
    freq_dict={}
    words = extract_words(text)
    for word in words:
        word = word.lower()
        if word in freq_dict:
            freq_dict[word] += 1
        else:
            freq_dict[word] = 1
    temp ={'doc' : doc , 'freq_dict': freq_dict}
    return temp

In [16]:
word_count_list = []
for i in corpus_words:
    l = word_frequency(i['text'],i['doc'])
    word_count_list.append(l)

In [17]:
def calculate_TF(corpus_words,word_count_list):
    TF_scores =[]
    for i in range(0,len(word_count_list)):
        id = word_count_list[i]['doc']
        for k in word_count_list[i]['freq_dict']:
            temp = {'doc': id,
                   'TF_score': word_count_list[i]['freq_dict'][k]/corpus_words[i]['count'],
                   'key':k}
            TF_scores.append(temp)
        i += 1
    return TF_scores

In [18]:
def calculate_IDF(corpus_words,word_count_list):
    IDF_scores =[]
    for dict in word_count_list:
        id = dict['doc']
        for k in dict['freq_dict'].keys():
            count = sum([k in tempDict['freq_dict'] for tempDict in word_count_list])
            temp = {'doc': id,'IDF_score': math.log(len(corpus_words)/count),'key' : k}
            
            IDF_scores.append(temp)
    return IDF_scores

In [19]:
tf = calculate_TF(corpus_words,word_count_list)

In [20]:
idf = calculate_IDF(corpus_words,word_count_list)

In [21]:
import pandas as pd
tf_dataframe = pd.DataFrame(tf)
idf_dataframe = pd.DataFrame(idf)

In [22]:
tf_dataframe.head(10)

Unnamed: 0,TF_score,doc,key
0,0.001553,AbrahamLincoln.txt,army
1,9.1e-05,AbrahamLincoln.txt,unsolicited
2,0.000274,AbrahamLincoln.txt,positions
3,9.1e-05,AbrahamLincoln.txt,keep
4,9.1e-05,AbrahamLincoln.txt,oppressed
5,0.000183,AbrahamLincoln.txt,prospects
6,9.1e-05,AbrahamLincoln.txt,borne
7,9.1e-05,AbrahamLincoln.txt,resigned
8,0.000365,AbrahamLincoln.txt,human
9,9.1e-05,AbrahamLincoln.txt,physical


In [23]:
idf_dataframe.head(10)

Unnamed: 0,IDF_score,doc,key
0,1.098612,AbrahamLincoln.txt,army
1,3.401197,AbrahamLincoln.txt,unsolicited
2,1.455287,AbrahamLincoln.txt,positions
3,0.836248,AbrahamLincoln.txt,keep
4,2.70805,AbrahamLincoln.txt,oppressed
5,2.70805,AbrahamLincoln.txt,prospects
6,2.302585,AbrahamLincoln.txt,borne
7,1.791759,AbrahamLincoln.txt,resigned
8,0.456758,AbrahamLincoln.txt,human
9,0.76214,AbrahamLincoln.txt,physical


In [24]:
tfidf = pd.merge(tf_dataframe, idf_dataframe)

In [25]:
tfidf['TFIDF_score'] = tfidf['TF_score']*tfidf['IDF_score']

In [26]:
tfidf.head()

Unnamed: 0,TF_score,doc,key,IDF_score,TFIDF_score
0,0.001553,AbrahamLincoln.txt,army,1.098612,0.001706
1,9.1e-05,AbrahamLincoln.txt,unsolicited,3.401197,0.000311
2,0.000274,AbrahamLincoln.txt,positions,1.455287,0.000399
3,9.1e-05,AbrahamLincoln.txt,keep,0.836248,7.6e-05
4,9.1e-05,AbrahamLincoln.txt,oppressed,2.70805,0.000247


In [27]:
tfidf.shape

(72981, 5)

In [28]:
tfidf.set_index("key", inplace=True)

In [29]:
tfidf.head()

Unnamed: 0_level_0,TF_score,doc,IDF_score,TFIDF_score
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
army,0.001553,AbrahamLincoln.txt,1.098612,0.001706
unsolicited,9.1e-05,AbrahamLincoln.txt,3.401197,0.000311
positions,0.000274,AbrahamLincoln.txt,1.455287,0.000399
keep,9.1e-05,AbrahamLincoln.txt,0.836248,7.6e-05
oppressed,9.1e-05,AbrahamLincoln.txt,2.70805,0.000247


In [30]:
#testing for the TF-IDF
m = "at&t"
if m in tfidf.index:
    print(tfidf.loc[m])

      TF_score                  doc  IDF_score  TFIDF_score
key                                                        
at&t  0.000384         AppleInc.txt   1.609438     0.000617
at&t  0.046663             AT_T.txt   1.609438     0.075102
at&t  0.000688           Dallas.txt   1.609438     0.001107
at&t  0.000417              IBM.txt   1.609438     0.000672
at&t  0.000261  Rchardson_Texas.txt   1.609438     0.000420
at&t  0.000074            Texas.txt   1.609438     0.000118


# TASK 2

In [31]:
corpus_words = {}
for filename in glob.glob("*.txt"):
    w = []
    if filename != "MelindaGates.txt":
        f = codecs.open(filename,'r','utf-8-sig')
    else:
        f = codecs.open(filename,'r','ISO-8859-1')
    file = f.read()
    corpus_words[filename] = file.lower()

In [32]:
def classify_questions(q):
    question_type= ""
    if q != "":
        if re.search('who',q,re.IGNORECASE):
            question_type = 'who'
        elif re.search('when',q,re.IGNORECASE):
            question_type = 'when'
        elif re.search('where',q,re.IGNORECASE):
            question_type = 'where'
        else :
            return(0)
    return(question_type)        

In [33]:
#implement task 1 on the question
def get_nlp_features(q):
    words = extract_words(q)
    lemmas = extract_lemma(q)
    pos = extract_pos(q)
    ner = extract_ne(q)
    tree = extract_parse_tree(q)
    return ({"sentence":q,"lemma":lemmas,"pos":pos,"ner": ner,"parse_tree":tree,"words":words})    

In [34]:
def get_tfidf_score(word_list):
    ques_list = ['who','when','where']
    score_data ={}
    for i in word_list:
        m = i.lower()
        if m in tfidf.index and m not in ques_list:
            l = tfidf.loc[m]
            try:
                if l.dtype == 'object':
                    l = pd.DataFrame(tfidf.loc[m]).T
            except:
                l = l.sort_values(by=['TFIDF_score'],ascending=False)
            
            files = list(l['doc'].iloc[:3])
            scores = list(l['TFIDF_score'].iloc[:3])
            score_data[i] =  [files,scores]
    return score_data

In [35]:
def rule1_filter(corpus,lemma_list):
    filter1 = []
    for i in corpus:
        common_terms = list(set(i['lemma']) & set(lemma_list)) 
        if len(common_terms) > 0:
            filter1.append(i)
            
    print("Lines Filtered from rule1_filter------> ",len(filter1))
    return filter1

In [36]:
def rule2_filter(corpus,pos):
    filter2 = []
    verb = []
    for i in pos:
        if i.split("_")[1] in ['VB','VBD','VBG']:
            verb.append(i.split("_")[0])
        if i.split("_")[1] in ['VBP','JJ','NN']:
            verb.append(i.split("_")[0])
    
    if "purchase" in verb:
        verb.append("buy")
    
    for i in corpus:  
        for p in i['lemma']:
            for h in verb:
                if p.split("_")[0] == h:
                    filter2.append(i)
    
    print("Lines Filtered from rule2_filter------> ",len(filter2))
    return filter2    
    

In [37]:
def rule3_filter(corpus,qt):
    ans_list = []
    sentences = []
    if qt == "who":
        max_score = 0
        for i in corpus:
            answers= []
            for n in i['ner']:
                if n.split("_")[1].lower() in ['person','org'] :
                    answers.append(n.split("_")[0])
            if len(answers) > 0:
                obj = {'sentence':i['sentence'],'answer':answers}
                ans_list.append(obj)
            
    if qt == "when":
        for i in corpus:
            answers=[]
            for n in i['ner']:
                    if n.split("_")[1].lower() == "date":
                        answers.append(n.split("_")[0])
            
            if len(answers) > 0:
                obj = {'sentence':i['sentence'],'answer':answers}
                ans_list.append(obj)
                
    if qt == "where":
        for i in corpus:
            answers=[]
            for n in i['ner']:
                    if n.split("_")[1].lower() in ['gpe','loc']:
                        answers.append(n.split("_")[0])
            
            if len(answers) > 0:
                obj = {'sentence':i['sentence'],'answer':answers}
                ans_list.append(obj)
    
    print("Lines Filtered from rule3_filter------> ",len(ans_list))
    return(ans_list)

In [38]:
def rule4_filter(corpus,q_obj):
    head_lines =[]
    for i in q_obj['parse_tree']:
        if i.split("_")[1] == "ROOT":
            head_of_question = i.split("_")[0]
    print("Head of Question ------> "+head_of_question)
    
    for i in corpus:
        for r in i['parse_tree']:
            if r.split("_")[1] == "ROOT" and r.split("_")[0] == head_of_question:
                head_lines.append(i)
                
    print("Lines Filtered from rule4_filter------> ",len(head_lines))
    return(head_lines)                

In [39]:
def rule5_filter(corpus,lemma,pos):
    filter4 = []
    lemmatizer = WordNetLemmatizer()
    
    syn_list = []
    verb_root_list = []
    
    for i in pos:
        if i.split("_")[1] in ['VBD','VBP','VBG','VB']:
            verb_lemma = lemmatizer.lemmatize(i.split("_")[0],pos="v")
            verb_root_list.append(verb_lemma)
        if i.split("_")[1] in ['NN']:
            verb_root_list.append(i.split("_")[0])
            
    u_verb_list = list(set(verb_root_list))
    
    for i in u_verb_list:
        if i in unique_word_relations:
                m = unique_word_relations[i]
                for l in m:
                        syn_list.extend(l['relations']['syn'])
                        syn_list.extend(l['relations']['hyper'])
                        syn_list.extend(l['relations']['hypo'])
            
    unique_syn_list = list(set(syn_list))
    
    for i in corpus:
        for k in unique_syn_list:
            if k in i['lemma']:
                filter4.append(i)
                
    
    print("Lines Filtered from rule5_filter------> ",len(filter4))
    return filter4

In [40]:
def rule6_filter(corpus,ques):
    filter_list =[]
    filter5 = []
 
    for i in corpus:
        common_terms = list((set(list(i['lemma'])) & set(ques)))
        filter5.append(len(common_terms))

    
    max_answers =  sorted(range(len(filter5)), key=lambda i: filter5[i], reverse=True)[:2]
    
    for i in max_answers:
        filter_list.append(corpus[i])
        
    print("Lines Filtered from rule6_filter------> ",len(filter_list))
    return filter_list  

In [41]:
def most_probable_file(score,ner):
    max_file = []
    max_score = 0
    agg_score = {}
    for i in ner:
        j = i.split("_")[0].split(" ")
        for k in j:
            if k in score:
                for l in range(len(score[k][0])):
                    if score[k][0][l] in agg_score:
                        agg_score[score[k][0][l]] += score[k][1][l]
                    else:
                        agg_score[score[k][0][l]] = score[k][1][l]
                        
    max_file =  nlargest(1, agg_score, key = agg_score.get)
    return(max_file)
    

In [42]:
def get_answer(q_obj,files):
    corpus = []
    answers = []
    
    print("Most Probable Files ---------> ",files)
    
    for i in files:
        for j in data_text_1:
            if j['doc'] == i:
                corpus = j['sentences']
    
        filter1 = rule1_filter(corpus,q_obj['obj']['lemma'])
        if filter1 != None:
            filter4 = rule5_filter(filter1,q_obj['obj']['lemma'],q_obj['obj']['pos'])
            filter2 = rule4_filter(filter4,q_obj['obj'])
            if len(filter4) == 0:
                filter2 = rule4_filter(filter1,q_obj['obj'])
            if len(filter2) == 0:
                filter2 = rule2_filter(filter4,q_obj['obj']['pos'])  
            if filter2 != None:
                filter5 = rule6_filter(filter2,q_obj['obj']['lemma'])
                filter3 = rule3_filter(filter5,q_obj['qt'])
                obj = {'document':i,'info':filter3}
                answers.append(obj)
    return(answers)            

In [43]:
def adding_semantic_rules(obj):
    for i in obj['ner']:
        if i.split("_")[0] == "UTD":
            i.split("_")[0].replace("UTD","UT Dallas")
            obj['ner'].append("UT Dallas_"+i.split("_")[1])
            obj['ner'].append("University of Texas at Dallas_"+i.split("_")[1])
        if i.split("_")[0] == "Abraham Lincoln":
            obj['ner'].append("Lincoln_"+i.split("_")[1])
        if i.split("_")[0] == "ExxonMobile":
            obj['ner'].append("ExxonMobil_"+i.split("_")[1])
    return obj

# TASK 3

In [44]:
def create_json_string(ans_dict,q,doc):
    sentences = list(ans_dict.keys())
    answers = list(ans_dict.values())
    sent_dict = {}
    ans_dict ={}
    doc_dict ={}
    for i in range(len(sentences)):
        sent_dict[i+1] = sentences[i]
        ans_dict[i+1] = str(" ".join(answers[i]))
        doc_dict[i+1] = doc
        
    obj = json.dumps({"Question":q,"answers":ans_dict,"sentences":sent_dict,"documents":doc_dict},sort_keys=False)
    return(obj)    

In [47]:
input_file = input("Enter the question file path --->")
if input_file == "":
    input_file = "<give some default path name>"

Enter the question file path --->C:/Users/taniy/Desktop/Spring19/CS6320-NLP/Project/que1.txt


In [48]:
f= open(input_file,'r')
l = f.read().split('\n')
final_answer_list =[]
for i in l:
    if len(i)>=1:
        question = str(i.strip())
        print(question)
        print("--------------------------------------------------------------------------------")
        print("Processing Details")
        print("------------------")
        
        qt = classify_questions(question)
    
        obj = get_nlp_features(question)
        
        score = get_tfidf_score(obj['words'])
       
        semantic_obj= adding_semantic_rules(obj)
     
        file = most_probable_file(score,obj['ner'])
        
        relations = extract_relations(obj['lemma'])
        
        question_obj = {'q':question,'qt':qt,'obj':semantic_obj,'relation':relations,'score':score}
        
        answer = get_answer(question_obj,file)
        
        for i in answer:
            unique_question = {}
            for s in i['info']:
                if s['sentence'] not in unique_question:
                    unique_question[s['sentence']] = s['answer']
            final_answer_list.append((create_json_string(unique_question,question,i['document'])))
            for f in unique_question:
                print("--------------------------------------------------------------------------------")
                print("Answer : "," ".join(unique_question[f]))
                print("Sentence : ",f)
                print("Document : ",i['document'])
                print("--------------------------------------------------------------------------------")
        print("********************************************************************************")
        
print("Final Answers fetched ---------> ",len(final_answer_list))
try:
    os.mkdir('result_files')
except:
    print("Directory Already Exists")

with open('result_files/Max_Entropy_S19_NLP_Project.json', 'w') as outfile:  
    json.dump(final_answer_list, outfile)
print("Result file created at Location "+path+"/result_files/Max_Entropy_S19_NLP_Project.json")

Who was 16th president of the United States?
--------------------------------------------------------------------------------
Processing Details
------------------
Most Probable Files --------->  ['UnitedStates.txt']
Lines Filtered from rule1_filter------>  773
Lines Filtered from rule5_filter------>  907
Head of Question ------> was
Lines Filtered from rule4_filter------>  62
Lines Filtered from rule6_filter------>  2
Lines Filtered from rule3_filter------>  1
--------------------------------------------------------------------------------
Answer :  Virginia Gazette
Sentence :  The first known publication of the phrase "United States of America" was in an anonymous essay in The Virginia Gazette newspaper in Williamsburg, Virginia, on April 6, 1776.
Document :  UnitedStates.txt
--------------------------------------------------------------------------------
********************************************************************************
When did Amazon surpass Walmart as the most valua