# Title: Search Engine Alpha 2.0
<p><b>Abstract:</b> Search engine built using the Reuters data set</p>
<p><b>Authors:</b> Uriel Antonio & Ernesto Louie Cortez</p>
<p><b>Date:</b>    04/30/2016</p>

In [None]:
from xml.etree import ElementTree
import re
from StringIO import StringIO
from bs4 import BeautifulSoup
import os 
import pandas as pd


## Data Cleaning

<p><b>Abstract: </b>Reuters data is organized in a generalized markup format.  Using BeautifulSoup, articles are scraped for their title, content, date, and place according to their respective markup tags.</p>

<p><b>Challenges: </b>1) We initially made the assumption that all articles would have a date, place, title, and body.  During the alpha 1 stage, it was found that titles and content became mismatched approximately 30 articles in.  Conditional statements were created to insert placeholders when content was missing.  2) The cleaned data was also littered with markup tags for special symbols that also needed to be cleaned.</p>


<p><b></b></p>

In [None]:
# Read the raw SGML one line at a time, scrubbing each line before the
# pieces are stitched back together into a single document string.
cleaned_lines = []
with open("Data/reut2-000.sgm",'r') as sgm_file:
    for raw_line in sgm_file:
        # Remove everything from an '&' up to the next '>'.
        without_entities = re.sub("&.*?>","",raw_line,flags=re.UNICODE)
        # Newlines become spaces so the whole file ends up on one line.
        single_line = re.sub("\n"," ",without_entities,flags=re.UNICODE)
        # Lower-case, then drop every character outside the whitelist.
        cleaned_lines.append(
            re.sub("[^0-9a-zA-Z<>/\s=!-\"\"]+","",single_line.lower()))
totstring = "".join(cleaned_lines)

soup = BeautifulSoup(totstring)

# Parallel lists: position k in each list describes the k-th <reuters> element.
items_date = []
items_places = []
items_title = []
items_body = []

for article in soup.findAll("reuters"):
    # A placeholder is stored whenever a tag is missing so that the four
    # lists never drift out of step with one another.
    items_date.append(
        "N/D" if article.date is None else article.date.getText())
    items_places.append(
        "N/L" if article.places is None else article.places.getText())
    items_title.append(
        "Untitled" if article.title is None else article.title.getText())
    items_body.append(
        "No Content." if article.content is None else article.content.getText())

# Small sample of titles used by the first term-frequency experiments.
corpus = items_title[:25]
print(corpus)


In [None]:
"""
tf = {}
for doc in corpus:
    for word in doc.split():
        # << COMPUTE ERM FREQUENCY DICTIONARY >> CODE HERE
        ## HIDE
        if word in tf:
            tf[word] += 1
        else:
            tf[word] = 1
        ## HIDE

print(tf)
"""

In [None]:
from collections import Counter

def get_tf(corpus):
    """Count how often each whitespace-separated token occurs in *corpus*.

    corpus: iterable of document strings.
    Returns a Counter mapping token -> total occurrences over all documents.
    """
    totals = Counter()
    for text in corpus:
        # Counter.update adds one per element of the token list.
        totals.update(text.split())
    return totals

# Term frequencies over the sample of (at most 25) titles held in `corpus`.
tf = get_tf(corpus)
print(tf)

In [None]:
# Recompute term frequencies over every article title and spot-check three terms.
tf = get_tf(items_title)
#print(tf)
# get_tf returns a Counter, so a term that never occurs prints 0 rather than raising.
print(tf['oil'])
print(tf['national'])
print(tf['japan'])


In [None]:
def get_tfd(corpus):
    """Build per-document term counts.

    corpus: iterable of document strings.
    Returns a dict keyed by each document's position in *corpus*; every value
    is a plain dict mapping token -> occurrences within that one document.
    """
    per_document = {}
    for doc_id, text in enumerate(corpus):
        counts = {}
        for token in text.split():
            counts[token] = counts.get(token, 0) + 1
        per_document[doc_id] = counts
    return per_document
            
    
# Per-document term counts for the titles; the bare expression below is the
# cell's displayed value (the counts for the title at position 234).
tfd = get_tfd(items_title)
tfd[234]

In [None]:
# Same inspection as for the titles, this time over the article bodies.
tfd = get_tfd(items_body)
tfd[234]

In [None]:
def get_tfm(corpus):
    """Build a dense term-frequency matrix for *corpus*.

    corpus: sequence of document strings; it is iterated twice, once to
        collect the vocabulary and once to count terms.
    Returns (tfm, lexicon): lexicon is the sorted list of distinct
    whitespace-separated tokens, and tfm holds one row per document where
    row[j] is the number of times lexicon[j] occurs in that document.
    """

    def get_lexicon(corpus):
        """Return the distinct tokens of *corpus* as a sorted list."""
        lexicon = set()
        for doc in corpus:
            lexicon.update(doc.split())
        # Sorted so that the column order is reproducible between runs.
        return sorted(lexicon)

    lexicon = get_lexicon(corpus)
    # Term -> column lookup; list.index() would rescan the whole lexicon
    # for every single token.
    column_of = {term: col for col, term in enumerate(lexicon)}

    tfm = []
    for doc in corpus:
        tfv = [0] * len(lexicon)
        for term in doc.split():
            tfv[column_of[term]] += 1
        tfm.append(tfv)

    return tfm, lexicon

In [None]:
def get_results_tf(qry, tfm, lexicon):
    """Score every row of *tfm* against a binary query vector.

    qry: whitespace-separated query string.
    tfm: list of term-frequency rows, one per document.
    lexicon: list of terms giving the column order of each row.
    Returns a list of [score, row_position] pairs, highest score first; rows
    with equal scores keep their original relative order.
    """
    # Binary query vector: 1 in the column of each query term the lexicon knows.
    qrv = [0] * len(lexicon)
    for column in (lexicon.index(t) for t in qry.split() if t in lexicon):
        qrv[column] = 1

    # Dot product of the query vector with each document row.
    scored = [
        [sum(q * f for q, f in zip(qrv, tfv)), doc_id]
        for doc_id, tfv in enumerate(tfm)
    ]
    return sorted(scored, key=lambda pair: pair[0], reverse=True)


def print_results(results,n, head=True, titles=None):
    ''' Helper function to print results

    results: list of [score, doc_id] pairs, assumed already sorted.
    n: number of entries to show; zero or a negative value shows none.
    head: True prints the first n entries, False prints the last n.
    titles: sequence indexed by doc_id giving the text printed for each
        entry; defaults to the module-level items_title list.
    '''
    if titles is None:
        titles = items_title

    count = max(n, 0)
    if head:
        label = 'Top'
        shown = results[:count]
    else:
        label = 'Bottom'
        # results[-0:] would be the whole list, so zero is handled explicitly.
        shown = results[-count:] if count else []

    print('\n%s %d from recall set of %d items:' % (label, n, len(results)))
    for r in shown:
        print('\t%0.2f - %s' % (r[0], titles[r[1]]))
    

# Build the title term-frequency matrix, rank the titles against a sample
# query, and print the ten best-scoring entries.
tfm, lexicon = get_tfm(items_title)
results = get_results_tf('led bike light', tfm , lexicon)
print_results(results,10)

In [None]:
def create_inverted_index(corpus):
    """Map each token to the list of document positions where it occurs.

    corpus: iterable of document strings.
    A position is appended once per occurrence, so a token repeated inside
    one document yields repeated entries for that document.
    """
    postings = {}
    for doc_id, text in enumerate(corpus):
        for token in text.split():
            postings.setdefault(token, []).append(doc_id)
    return postings

# Index the titles, then print the title stored at position 50.
idx = create_inverted_index(items_title)
print(items_title[50])

In [None]:
def get_results_tf(qry, idx):
    """Rank documents by the number of query-term postings they contain.

    qry: whitespace-separated query string.
    idx: inverted index mapping term -> iterable of document ids.
    Returns a list of [score, doc_id] pairs, highest score first. Query
    terms that are not in *idx* are ignored.
    """
    score = Counter()
    for term in qry.split():
        # .get() so an unknown query term contributes nothing instead of
        # raising KeyError.
        for doc in idx.get(term, ()):
            score[doc] += 1

    results = [[hits, doc] for doc, hits in score.items() if hits > 0]
    return sorted(results, key=lambda t: t[0], reverse=True)


# Re-index on the article bodies, rank them for a three-term query, and
# print the ten best-scoring entries.
idx = create_inverted_index(items_body)
results = get_results_tf('national japan oil ', idx)
print_results(results,10)

In [None]:
import math

def create_inverted_index(corpus):
    """Map each token to a dict of {document position: occurrence count}.

    corpus: iterable of document strings.
    Returns a dict whose values are themselves dicts, keyed by the position
    of each document containing the token.
    """
    postings = {}
    for doc_id, text in enumerate(corpus):
        for token in text.split():
            per_doc = postings.setdefault(token, {})
            per_doc[doc_id] = per_doc.get(doc_id, 0) + 1
    return postings

def get_results_tfidf(qry, idx, n):
    """Rank documents by summed TF-IDF weight over the query terms.

    qry: whitespace-separated query string.
    idx: inverted index mapping term -> {doc_id: term count in that doc}.
    n: total number of documents, used as the numerator of the IDF ratio.
    Returns a list of [score, doc_id] pairs, highest score first. Only
    documents with a strictly positive score are returned, and query terms
    missing from *idx* are ignored.
    """
    score = Counter()
    for term in qry.split():
        if term in idx:
            postings = idx[term]
            # The +1 in the denominator means a term found in n - 1 or more
            # documents gets a zero or negative weight.
            idf = math.log(float(n) / (1 + len(postings)))
            for doc in postings:
                score[doc] += postings[doc] * idf

    results = [[weight, doc] for doc, weight in score.items() if weight > 0]
    return sorted(results, key=lambda t: t[0], reverse=True)

# Index the article bodies for the TF-IDF query below.
idx_body = create_inverted_index(items_body)



# Alternative queries kept for experimentation.
# NOTE(review): the disabled calls pass `idx` rather than `idx_body` -- confirm
# which index is intended before re-enabling any of them.
#results = get_results_tfidf('this is the end of the file', idx, len(items_body))
#results = get_results_tfidf('japan the oil bankrupt this', idx, len(items_body))
# NOTE(review): the cleaning step lower-cases all article text, so the
# capitalised 'The' presumably never matches an index term -- confirm.
results = get_results_tfidf('The japan country', idx_body, len(items_body))
#results = get_results_tfidf('japan oil bankrupt', idx, len(items_body))
# Print the title at position 319, then the ten best-scoring entries.
print(items_title[319])
print_results(results,10)