In [None]:
import csv
import html
import os
import re
from datetime import datetime

from elasticsearch import Elasticsearch
from flask import Flask, render_template, request


In [None]:

# Connection settings can be overridden through environment variables so that
# real credentials do not have to be written into the notebook. The defaults
# are the values this notebook previously hard-coded.
ES_HOST = os.environ.get("ES_HOST", "localhost")
ES_PORT = int(os.environ.get("ES_PORT", "9200"))
ES_SCHEME = os.environ.get("ES_SCHEME", "http")
ES_USER = os.environ.get("ES_USER", "vijay")
ES_PASSWORD = os.environ.get("ES_PASSWORD", "password")

es = Elasticsearch(
    [{'host': ES_HOST, 'port': ES_PORT, 'scheme': ES_SCHEME}],
    http_auth=(ES_USER, ES_PASSWORD),
)
# Last expression of the cell: shows the cluster info returned by the client.
es.info()

In [None]:
def load_headlines(csv_path, organization):
    """Read a headlines CSV into a list of dicts tagged with its organization.

    The first row of the file is used as the column names. Every following
    row becomes one dict that maps ``'News Organization'`` to
    ``organization`` and each column name to that row's value.

    Args:
        csv_path: Path of the UTF-8 encoded CSV file to read.
        organization: Value stored under the ``'News Organization'`` key of
            every returned document.

    Returns:
        A list of dicts, one per non-blank data row, in file order. An empty
        file yields an empty list.
    """
    documents = []

    # newline='' lets the csv module handle line endings itself, which is
    # what the csv documentation asks for when opening files for the reader.
    with open(csv_path, "r", encoding='utf-8', newline='') as file:
        reader = csv.reader(file)

        headers = next(reader, None)
        if headers is None:
            # Completely empty file: no header row, so nothing to load.
            return documents

        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines; they carry no data.
                continue

            document = {'News Organization': organization}
            # zip stops at the shorter sequence, so a row with fewer fields
            # than the header no longer raises IndexError.
            document.update(zip(headers, row))
            documents.append(document)

    return documents


cnbc_documents = load_headlines("cnbc_headlines.csv", 'CNBC')
guardian_documents = load_headlines("guardian_headlines.csv", 'The Guardian')
reuters_documents = load_headlines("reuters_headlines.csv", 'Reuters')

# Preview the first ten documents of each dataset.
for documents in (cnbc_documents, guardian_documents, reuters_documents):
    print(documents[:10])
    print()

In [None]:
def index_documents(index, id, document):
    """Index a single document through the module-level ``es`` client.

    Args:
        index: Name of the index to write to.
        id: Identifier to store the document under.
        document: The document body to index.

    Returns:
        Whatever ``es.index`` returns for the request. Earlier versions of
        this function discarded it and returned ``None``.
    """
    return es.index(index=index, id=id, document=document)

In [None]:
# Index every loaded document, one request per document, using 1-based
# positions within each dataset as the document ids.
for index_name, documents in (
    ("cnbc_index", cnbc_documents),
    ("guardian_index", guardian_documents),
    ("reuters_index", reuters_documents),
):
    for doc_id, document in enumerate(documents, start=1):
        es.index(index=index_name, id=doc_id, document=document)

In [None]:
# Sanity check: fetch the document with id 1 from each index and print its
# stored source.
for index_name in ("cnbc_index", "guardian_index", "reuters_index"):
    resp = es.get(index=index_name, id=1)
    print(resp['_source'])
    print()

In [None]:

def search_query(query, datasets_to_search, index_names):
    """Search the given indices for ``query`` and format each hit as HTML text.

    Every index is queried with a ``dis_max`` over two ``match`` clauses (on
    the ``Headlines`` and ``Description`` fields, tie breaker 0.3) and a
    result size of 20.

    Args:
        query: Text to match against the two fields.
        datasets_to_search: Sequence parallel to ``index_names``. Its items
            are not read; one search is run per (dataset, index name) pair.
        index_names: Names of the indices to search, in order.

    Returns:
        A list of strings, one per hit, ordered by index and then by the
        order in which the hits were returned. Each string lists the news
        organization, headline, time and description separated by ``<br>``.
    """
    body = {
        "size": 20,
        "query": {
            "dis_max": {
                "queries": [
                    {"match": {"Headlines": query}},
                    {"match": {"Description": query}}
                ],
                "tie_breaker": 0.3
            }
        }
    }

    results = []

    # Pair the two sequences with zip instead of indexing index_names by
    # position, so a length mismatch cannot raise IndexError.
    for _, index_name in zip(datasets_to_search, index_names):

        resp = es.search(index=index_name, body=body)

        for hit in resp['hits']['hits']:

            source = hit["_source"]

            # The result string carries literal <br> markup, so the field
            # values are escaped to keep indexed text from injecting HTML.
            # str() keeps a missing field rendering as "None", as before.
            # NOTE(review): assumes the template renders these strings
            # unescaped (e.g. with `|safe`) -- confirm against test.html.
            news_organization = html.escape(str(source.get('News Organization')))
            headline = html.escape(str(source.get('Headlines')))
            time = html.escape(str(source.get('Time')))
            description = html.escape(str(source.get('Description')))

            formatted_result = f"'News Organization': {news_organization}\nHeadline: {headline}\nTime: {time}\nDescription: {description}\n\n"

            # Done after interpolation so newlines inside field values are
            # converted to <br> as well.
            formatted_result = formatted_result.replace('\n', '<br>')

            results.append(formatted_result)

    return results


In [None]:
app = Flask(__name__)

@app.route('/', methods=['GET', 'POST'])
def homepage():
    """Render the search page and, on POST, the results of the submitted search.

    On POST the ``search_query`` form field supplies the query text, and the
    ``cnbc``, ``the guardian`` and ``reuters`` form fields select which
    datasets are searched. On GET, or when no query text was submitted, the
    page is rendered with an empty result list.

    Returns:
        The rendered ``test.html`` template with ``results`` set to the list
        of result strings.
    """
    results = []

    if request.method == 'POST':

        query = request.form.get('search_query')

        # (form field name, loaded documents, index name) for each dataset
        # that can be selected on the form.
        selectable_sources = (
            ('cnbc', cnbc_documents, "cnbc_index"),
            ('the guardian', guardian_documents, "guardian_index"),
            ('reuters', reuters_documents, "reuters_index"),
        )

        datasets_to_search = list()
        index_names = list()

        for field_name, documents, index_name in selectable_sources:

            if request.form.get(field_name):

                datasets_to_search.append(documents)
                index_names.append(index_name)

        # request.form.get returns None when the field is absent; skip the
        # search for a missing or blank query rather than passing it on.
        if query and query.strip():

            results = search(query, datasets_to_search, index_names)

    return render_template('test.html', results=results)

def search(query, datasets_to_search, index_names):
    """Run the search for ``query`` by delegating to ``search_query``.

    Takes the same arguments as ``search_query`` and returns its result
    unchanged.
    """
    found = search_query(
        query=query,
        datasets_to_search=datasets_to_search,
        index_names=index_names,
    )
    return found


# Start serving the Flask app defined above, with debug mode turned off.
app.run(debug=False)