In [124]:
import numpy as np
import json
import pandas as pd
from elasticsearch import Elasticsearch
from elasticsearch import helpers
from text_processing.text_preprocessing import df_to_json

In [125]:
# Select a subset of fields from one dataframe row (or any mapping-like object).
def filterKeys(document_gen, vars2use):
    """
    Overview:
        build a plain dict holding only the requested keys of a row

    Inputs:
        document_gen : row to read from; anything supporting document_gen[key]
                       and .keys() (e.g. a pandas Series or a dict)
        vars2use : iterable of keys to keep; None keeps every key of the row

    Returns:
        dict mapping each selected key to its value in document_gen
    """
    keys = document_gen.keys() if vars2use is None else vars2use
    return {key: document_gen[key] for key in keys}


# Generate documents in the Elastic bulk format.
# yield is used so records are handed to the Elastic API only when it asks for them.
def doc_generator(df, index_name, docIdVar, vars2use=None):
    """
    Overview:
        provide a dict with specified values to elastic bulk api

    Inputs:
        df : dataframe to push to elastic
        index_name : name of index you want to push data to (database)
        docIdVar : id variable in input dataframe
        vars2use : list of variables from input df we want to push to elastic;
                   None (the default) pushes every column

    Yield parameters:
        _index: the database name
        _type: the table name (this is now _doc for 7.1 and cannot be any other value)
        _id: elastic unique ID (not the same as 'id' field from df); built from the
             row's docIdVar value followed by the row's dataframe index label
        _source: the document to be saved
    """
    for index, document in df.iterrows():
        yield {
            "_index": index_name,
            "_type": "_doc",
            # Formatting both parts (instead of `value + str(index)`) also works
            # when the id column holds non-string values.
            "_id": f"{document[docIdVar]}{index}",
            "_source": filterKeys(document, vars2use),
        }
    # No explicit `raise StopIteration`: a generator finishes by returning.
    # Raising StopIteration inside a generator body is converted into a
    # RuntimeError (PEP 479), which would abort the bulk upload at the last row.
    
#QUERY OPS
def search(es_object, index_name, search):
    """
    Overview:
        run a query against one index and hand the response back to the caller

    Inputs:
        es_object : client object exposing search(index=..., body=...)
        index_name : name of the index to query
        search : query body, passed through unchanged as `body`

    Returns:
        whatever es_object.search returns for the request
    """
    # The response used to be bound to a local and dropped, so callers always got None.
    return es_object.search(index=index_name, body=search)

#### STEP 1: Load df to push to index 

In [126]:
# Load the pickled webhose news dataframe that is pushed to the index below.
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook
# will not run elsewhere until it is pointed at a local copy of the file.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
df = pd.read_pickle("C:\\Users\\zjc10\\Desktop\\Projects\\data\\news\\webhose_news\\webhose_df.pickle")

In [222]:
# List the column labels of the loaded dataframe.
df.columns

Index(['key', 'date', 'title', 'author', 'link', 'text'], dtype='object')

#### STEP 2: MISSING DATA CHECK (ELASTIC CANNOT HANDLE NAN)

In [128]:
# Warn when any cell of df is null (this notebook's notes say Elastic cannot
# handle NaN). This only prints a message; it does not halt execution.
if df.isnull().to_numpy().any():
    print("STOP AND FIX MISSING DATA")

#### STEP 3: Push docs to elastic 

In [43]:
# Columns of df that are sent to the index as each document's _source.
vars2use = ["key", "date", "title", "author"]

# No host is given, so the client's own default connection settings apply;
# http_compress=True is the only option set here.
es_client = Elasticsearch(http_compress=True)

# Stream every row of df into the "webhose" index, using the "key" column for ids.
helpers.bulk(
    es_client,
    doc_generator(df, "webhose", "key", vars2use=vars2use),
)

KeyboardInterrupt: 

#### Query Operations through Python

##### Search results contain the following useful metadata
 - max_score: the score of the most relevant document found 
 - hits.total.value: how many matching documents were found 
 - hits._score: the document's relevance score (not applicable when using match_all)
 
#### boosting within match query 
 - individual fields can be boosted with the caret ^ notation (ex. "fields":['title^2','author'])
 

In [221]:
es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

# All titles containing "Nintendo", newest first ('desc' order on the date field).
# The sort clause is part of the request body: passed as a keyword argument, the
# 7.x client treats `sort` as a URL parameter expecting "field:direction" strings,
# so a dict there is not interpreted as a sort specification.
# TODO: confirm `date` is mapped as a date field (not a string) in the index,
# otherwise the ordering is lexical rather than chronological.
match_response = es.search(
    index='webhose',
    body={
        'query': {
            'match': {'title': 'Nintendo'},
        },
        'sort': [
            {'date': {'order': 'desc'}},
        ],
    },
)

# Zero-terms match query on title: both words are required (operator "and") and
# zero_terms_query is set to "all".
# auto_generate_synonyms_phrase_query is passed explicitly (it defaults to true).
zeroterm_response = es.search(
    index="webhose",
    body={
        "query": {
            "match": {
                "title": {
                    "query": "Launch ny",
                    "operator": "and",
                    "zero_terms_query": "all",
                    "auto_generate_synonyms_phrase_query": True,
                }
            }
        }
    },
)


# Fuzzy match on title with fuzziness 2. Results are ranked by relevance score,
# so a title containing only one of the two words lands lower in the results.
# auto_generate_synonyms_phrase_query is left at its default (true).
fuzzy_response = es.search(
    index="webhose",
    body={
        "query": {
            "match": {
                "title": {
                    "query": "Launch ny",
                    "fuzziness": 2,
                }
            }
        }
    },
)

# Multi-match query: one query string searched across several fields.
# "title^2" boosts the title field relative to author.
mm_results = es.search(
    index="webhose",
    body={
        "query": {
            "multi_match": {
                "query": "i hate amazon",
                "fields": ["title^2", "author"],
            }
        }
    },
)

# Bool queries
# match_bool_prefix (matches the terms of a phrase in any position in the searched
# text, good for words in different orders): analyzes the input and constructs a
# bool query from the terms. Each term EXCEPT the last is used in a term query;
# the last term is used in a prefix query.
# match_phrase_prefix: matches its terms as a phrase.
#
# The analyzer used by bool queries can be configured with the analyzer parameter.
# Below, the equivalent bool query is spelled out by hand as three `should` clauses.
match_bool = es.search(
    index="webhose",
    body={
        "query": {
            "bool": {
                "should": [
                    {"term": {"title": "nintendo"}},
                    {"term": {"title": "launch"}},
                    {"prefix": {"title": "ny"}},
                ]
            }
        }
    },
)

# Bool query with distance_feature boosting.
# `must`: fuzzy match (fuzziness 1) of "amazon" on title.
# `should`: distance_feature on `date` with origin "now-1700d" and pivot "10d",
# intended to boost results by how close their date is to the origin.
match_bool_dist = es.search(
    index="webhose",
    body={
        "query": {
            "bool": {
                "must": {
                    "match": {
                        "title": {
                            "query": "amazon",
                            "fuzziness": 1,
                        }
                    }
                },
                "should": {
                    "distance_feature": {
                        "field": "date",
                        "pivot": "10d",
                        "origin": "now-1700d",
                    }
                },
            }
        }
    },
)
 

# match_bool_prefix query with fuzziness 2 on title.
# The analyzer can be configured with the "analyzer" parameter; it is left unset
# here (note: the keyword analyzer is very strict).
match_bool_analyze = es.search(
    index="webhose",
    body={
        "query": {
            "match_bool_prefix": {
                "title": {
                    "query": "quick nintendo f",
                    # "analyzer": "keyword",
                    "fuzziness": 2,
                }
            }
        }
    },
)

# Show the multi-match result as the cell output.
mm_results

{'took': 4,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 10000, 'relation': 'gte'},
  'max_score': 29.519335,
  'hits': [{'_index': 'webhose',
    '_type': '_doc',
    '_id': '6726f5aeab38ecfe8ab1aba7db6f8a12c5e3cc860',
    '_score': 29.519335,
    '_source': {'key': '6726f5aeab38ecfe8ab1aba7db6f8a12c5e3cc86',
     'date': '2017-03-25T01:09:00.000+03:00',
     'title': 'I hate Fridays in Lent..',
     'author': 'Billybob (noreply@blogger.com)'}},
   {'_index': 'webhose',
    '_type': '_doc',
    '_id': '7af3f8186071961cf9b599678280e03c9221f5f00',
    '_score': 29.519335,
    '_source': {'key': '7af3f8186071961cf9b599678280e03c9221f5f0',
     'date': '2017-03-13T04:54:00.000+02:00',
     'title': 'I hate this planet and ..',
     'author': ''}},
   {'_index': 'webhose',
    '_type': '_doc',
    '_id': '8a7c9188b4f827147e5fa8f389eab6d6413eb14e0',
    '_score': 28.098354,
    '_source': {'key': '8a7c9188b4f827147e5

### More Like This (MLT) query syntax
 - finds documents that are like a given set of documents
 - 3 types of parameters 
     - document input 
     - term selection 
     - query formation 

#### Document Input Parameters 
 - like: the ONLY required parameter of the MLT query 
     - to provide documents not necessarily present in the index, **ARTIFICIAL DOCUMENTS** are also supported 
 - unlike: the exact opposite of like; tells the search to return only documents that are like something and not like something else 
 - fields: a list of fields to fetch and analyze the text from 
 
#### Term Selection Parameters 
- max_query_terms: max # of query terms to be selected
    - Increasing this value gives greater accuracy at the expense of execution speed 
- min_term_freq: the minimum term frequency below which a term from the input doc is ignored (default 2)
- min_doc_freq: the minimum doc frequency below which the term is ignored (default 5) 
- max_doc_freq: the maximum doc frequency above which the term is ignored; great for filtering out common words from the input string (default: unbounded)
- min_word_length: the minimum length below which a term is ignored 
- max_word_length: the maximum length of a word to be considered from the input string 
- stop_words: an array of stopwords; any word in this set is dubbed 'uninteresting' and ignored

#### Query Formation Parameters 
 - minimum_should_match: controls the number of terms that must match (defaults to 30%)
 - fail_on_unsupported_field: specifies whether the query should fail if any specified field is not of a supported type (text or keyword). Defaults to TRUE.
     - set to FALSE to ignore the field and continue processing 
 - boost_terms (default = 0): sets the boost factor to use based on each term's tf-idf score. Any positive value activates term boosting with the given boost factor 
 - include (default = False): specifies whether the input docs should also be included in the search results
 
##### NOTE: Term Vectors API provides a good preprocess for MLT query 

In [185]:
# Return documents that are similar to a provided piece of free text
# (given in the "like" query parameter), compared on the title field.
mlt_result1 = es.search(
    index="webhose",
    body={
        "query": {
            "more_like_this": {
                "fields": ["title"],
                "like": "Amazon cloud failure",
                "min_term_freq": 1,
                "min_doc_freq": 1,
                "max_query_terms": 24,
            }
        }
    },
)

# "like" can also reference documents that already exist in the index,
# e.g. find docs related to the existing doc with the given _id.
mlt_result = es.search(
    index="webhose",
    body={
        "query": {
            "more_like_this": {
                "fields": ["title", "author"],
                "like": [
                    {
                        "_index": "webhose",
                        "_id": "aab7ad9ce6391f0e3da6c3ea69ba834f6b2ed4c90",
                    },
                ],
                "min_term_freq": 1,
                "max_query_terms": 12,
                "include": True,
                "boost_terms": 0.5,
                "minimum_should_match": "30%",
                "stop_words": ["a", "for", "your"],
            }
        }
    },
)

# Return documents similar to several pieces of free text ("like") while steering
# away from another ("unlike"), with custom stop words and term boosting.
# NOTE(review): this reassigns mlt_result1, replacing the earlier MLT response.
mlt_result1 = es.search(
    index="webhose",
    body={
        "query": {
            "more_like_this": {
                "fields": ["title"],
                "like": ["Amazon cloud failure", "s3"],
                "unlike": "free shipping",
                "stop_words": ["a", "the", "is", "then", "could"],
                "min_term_freq": 1,
                "min_doc_freq": 1,
                "max_query_terms": 24,
                "minimum_should_match": "20%",
                "fail_on_unsupported_field": True,
                "boost_terms": 0.5,
            }
        }
    },
)


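#### TERM VECTORS API 

##### Three types of values can be requested (term information and field statistics are returned by default)
 - term information: term frequency is always returned; positions, offsets and payloads are toggled with `positions`, `offsets` and `payloads`
 - term statistics: requested with `term_statistics=True` (off by default)
 - field statistics: controlled by `field_statistics` (on by default)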
    

In [195]:
# https://www.elastic.co/guide/en/elasticsearch/reference/7.4/docs-termvectors.html
# Request term vectors for the title field of one indexed document (identified by
# "_id" in the body), with term statistics switched on.
tv_result1 = es.termvectors(
    index="webhose",
    body={
        "fields": ["title"],
        "_id": "aab7ad9ce6391f0e3da6c3ea69ba834f6b2ed4c90",
        "term_statistics": True,
    },
)

### CROSS FIELD MULTI MATCH QUERIES

In [242]:
# WARNING: only a few fields are indexed, so the same two fields are searched
# redundantly in both clauses, purely for illustration.
# https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-minimum-should-match.html
#
# Two cross_fields multi_match queries are combined under a bool `should`;
# minimum_should_match is applied to only one of them (when using multiple
# queries, only apply it to one).
# tie_breaker = 1: instead of judging results on the single best score, the
# scores across fields are added together (likely not a smart choice).
# NOTE(review): minimum_should_match "50%" appears to count query terms (1 of the
# 2 words here) rather than fields; confirm against the linked docs.
# NOTE(review): this reuses the name tv_result1, replacing the term-vectors response.
tv_result1 = es.search(
    index="webhose",
    body={
        "query": {
            "bool": {
                "should": [
                    {
                        "multi_match": {
                            "query": "amazon outage",
                            "type": "cross_fields",
                            "fields": ["author", "title"],
                            "tie_breaker": 1,
                            "minimum_should_match": "50%",
                        }
                    },
                    {
                        "multi_match": {
                            "query": "amazon outage",
                            "type": "cross_fields",
                            "fields": ["title", "author"],
                            "tie_breaker": 1,
                        }
                    },
                ]
            }
        }
    },
)

In [243]:
# Display the response currently bound to tv_result1 as the cell output.
tv_result1

{'took': 4,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 10000, 'relation': 'gte'},
  'max_score': 16.210983,
  'hits': [{'_index': 'webhose',
    '_type': '_doc',
    '_id': '14972b1121dbbb3a9b825679466cf905c69b1ef60',
    '_score': 16.210983,
    '_source': {'key': '14972b1121dbbb3a9b825679466cf905c69b1ef6',
     'date': '2017-03-03T17:05:00.000+02:00',
     'title': 'Amazon Outage Caused By Typo',
     'author': ''}},
   {'_index': 'webhose',
    '_type': '_doc',
    '_id': 'dc88fc0b1732c21674e17603cdfb2401d3a98dee0',
    '_score': 16.210983,
    '_source': {'key': 'dc88fc0b1732c21674e17603cdfb2401d3a98dee',
     'date': '2017-03-02T13:29:00.000+02:00',
     'title': 'ISTEP spared by Amazon outage',
     'author': ''}},
   {'_index': 'webhose',
    '_type': '_doc',
    '_id': 'ae347fff757f5297d0711cbead4a663c8944e25d0',
    '_score': 16.210983,
    '_source': {'key': 'ae347fff757f5297d0711cbead4a663c8944e25d'