# Elasticsearch search query
In this session, we are going to run a set of search queries against the Elasticsearch engine:
1. First, we read some data from a CSV file so that it can be ingested into Elasticsearch.
2. We define the mapping (schema) for the Elasticsearch index.
3. We use the parallel bulk helper to ingest the data in bulk with multiple threads.
4. Then we are ready to run search queries.

In [1]:
# Use panda module to read csv file and clean data 
import pandas as pd

In [15]:
# Read the mart CSV file and forward-fill missing values.
# NOTE(review): this is an absolute, machine-specific path — point it at your own copy of the data.
file_path = "/home/nyalazone/Desktop/ElasticSearch/ElasticSearchv/data/mart_data.csv"
# DataFrame.ffill() replaces the deprecated fillna(method='ffill', inplace=True).
# It returns a new frame instead of mutating in place, so re-running the cell is idempotent.
dataset = pd.read_csv(file_path, sep=',').ffill()
dataset.head(7)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,Medium,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052
5,FDP36,10.395,Regular,0.0,Baking Goods,51.4008,OUT018,2009,Medium,Tier 3,Supermarket Type2,556.6088
6,FDO10,13.65,Regular,0.012741,Snack Foods,57.6588,OUT013,1987,High,Tier 3,Supermarket Type1,343.5528


In [3]:
# Schema information: column names, non-null counts and dtypes of the loaded frame
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 279 entries, 0 to 278
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            279 non-null    object 
 1   Item_Weight                279 non-null    float64
 2   Item_Fat_Content           279 non-null    object 
 3   Item_Visibility            279 non-null    float64
 4   Item_Type                  279 non-null    object 
 5   Item_MRP                   279 non-null    float64
 6   Outlet_Identifier          279 non-null    object 
 7   Outlet_Establishment_Year  279 non-null    int64  
 8   Outlet_Size                279 non-null    object 
 9   Outlet_Location_Type       279 non-null    object 
 10  Outlet_Type                279 non-null    object 
 11  Item_Outlet_Sales          279 non-null    float64
dtypes: float64(4), int64(1), object(7)
memory usage: 26.3+ KB


In [16]:
# Elastic search Mapping for dataset
def get_mapping():
    """Return the index-creation body (mappings) for the mart dataset.

    Numeric dataset columns are mapped to numeric Elasticsearch types and the
    string columns to ``text``. A fresh dict is built on every call.

    Returns:
        dict: ``{"mappings": {"properties": {<column>: {"type": <type>}}}}``
    """
    data_mapping = {
        "mappings": {
            "properties": {
                "Item_Identifier": {"type": "text"},
                "Item_Weight": {"type": "float"},
                "Item_Fat_Content": {"type": "text"},
                # Fixed: this was "text", but the column holds float values
                # (float64 in the DataFrame), so it is mapped as a number.
                "Item_Visibility": {"type": "float"},
                "Item_Type": {"type": "text"},
                "Item_MRP": {"type": "float"},
                "Outlet_Identifier": {"type": "text"},
                "Outlet_Establishment_Year": {"type": "integer"},
                "Outlet_Size": {"type": "text"},
                "Outlet_Location_Type": {"type": "text"},
                "Outlet_Type": {"type": "text"},
                "Item_Outlet_Sales": {"type": "float"},
            }
        }
    }
    return data_mapping


In [6]:
# Convert the DataFrame into a list of dicts (one per row) so that each row can be inserted as a document
dataset_dict = dataset.to_dict('records')

In [None]:
# Now that we have a cleaned dataset and a mapping, we can go ahead

In [23]:
# Initiate ElasticSearch
from elasticsearch import Elasticsearch,ElasticsearchException
# Connection settings for the local node and the index used throughout the notebook
host = 'http://localhost:9200/'
index_name = 'market_data'
elastic_obj = Elasticsearch([host])  # shared client object

# Print a status line depending on whether the ping succeeded
if elastic_obj.ping():
    print("Elastic search engine is running........")
else:
    print("Elasticsearch server is not running")


Elastic search engine is running........


In [24]:
# Define Create Index method with specified mapping
def creat_index():
    """Create the index ``index_name`` using the body from ``get_mapping()``.

    ``ignore=400`` is passed to the client so that an HTTP 400 reply
    (presumably "index already exists" when the cell is re-run — confirm
    against the client documentation) is not raised as an exception.

    Returns:
        The response of ``elastic_obj.indices.create``. It used to be stored
        in an unused local and dropped; callers may still ignore it.
    """
    return elastic_obj.indices.create(index=index_name, ignore=400, body=get_mapping())


In [25]:
# Defining elastic format same as we did earlier
def elastic_format(row):
    """Wrap one record in an action dict for the bulk helpers.

    Parameters:
        row (dict): one dataset record; it becomes the document body.

    Returns:
        dict: ``{"_index": index_name, "_source": row}``.
    """
    # The previous version also emitted a "_doc": "_doc" entry. That is not a
    # bulk metadata key (it looks like a typo for "_type"), so it is dropped;
    # the target index and the document body are all the action needs.
    return {"_index": index_name, "_source": row}

In [41]:
# Now we are going to use the parallel bulk function to insert data into Elasticsearch

In [26]:
from elasticsearch.helpers import parallel_bulk, BulkIndexError

# Define parallel processing method
def parallel_to_elasic(elastic_obj, data, thread=3, chunk=100):
    """Index ``data`` through ``parallel_bulk`` and report the per-document outcome.

    The parallel_bulk() api is a wrapper around the bulk() api to provide threading.
    parallel_bulk() returns a generator which must be consumed to produce results.

    Parameters:
    elastic_obj : client object handed to parallel_bulk
    data (list of dict) : documents/records to index; must support len()
    thread (int) : number of threads (passed as thread_count)
    chunk (int) : chunk size used to get data from the iterator (passed as chunk_size)

    Returns:
    tuple(int, int) : (number of successful documents, number of failed documents)
    """
    doc_success, doc_fail = 0, 0

    print("Document Length:", len(data))
    # NOTE(review): raise_on_error=True is passed, so failures presumably surface
    # as an exception from the helper rather than as ``success == False`` items;
    # the fail counter is kept in case the helper still yields them.
    results = parallel_bulk(client=elastic_obj, actions=data, thread_count=thread,
                            chunk_size=chunk, raise_on_error=True)
    for success, _action in results:
        if success:
            doc_success += 1
        else:
            doc_fail += 1

    print("Success:", doc_success)
    print("Fail:", doc_fail)
    # Return the counters too, so callers can act on them (previously only printed).
    return doc_success, doc_fail
    

In [43]:
# Now everything is set, let's ingest our data

In [27]:
def insert_data_into_elastic(dataset_dict):
    """Create the index and bulk-load every row of ``dataset_dict`` into it.

    Any exception raised on the way is caught. Its text (or 'None' when nothing
    failed) is printed from the ``finally`` block. Nothing is returned.
    """
    error_msg = 'None'
    try:
        # Wrap each dataset row in the elastic document format
        elastic_data = [elastic_format(record) for record in dataset_dict]

        creat_index()  # Create index
        parallel_to_elasic(elastic_obj, elastic_data)  # Save to Elastic
    except (ElasticsearchException, BulkIndexError, Exception) as err:
        # Every failure kind is handled the same way: keep its message for the report
        error_msg = str(err)
    finally:
        print("===========Finally block==========")
        print("Error:", error_msg)
        

In [28]:
# Call insert_data_into_elastic: it creates the index and ingests the data into the Elasticsearch engine
insert_data_into_elastic(dataset_dict)

Document Length: 279
Success: 279
Fail: 0
Error: None


# Elastic Search Query Method
So far we have covered reading the data, cleaning it, building the mapping and the Elasticsearch document format, and the parallel bulk insert method; as a result, the data is now stored in the Elasticsearch engine. Now let's query it.

# Match query
Returns documents that match a provided text, number, date or boolean value. The provided text is analyzed before matching.
The match query is the standard query for performing a full-text search, including options for fuzzy matching

In [88]:
# Look at the first rows of our dataset before querying
dataset.head(5)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,Medium,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [116]:
# Make a method that fetchs elastic data
import json
def fetch_elastic_data(query):
    """Run ``query`` against ``index_name`` and return the matching hits.

    Parameters:
        query (dict): search request body passed to ``elastic_obj.search``.

    Returns:
        list: the ``['hits']['hits']`` entries of the response, or an empty
        list when the search raised ``ElasticsearchException`` (the error
        text is printed).
    """
    try:
        data = elastic_obj.search(index=index_name, body=query)
    except ElasticsearchException as e:
        print(str(e))
        # Previously execution fell through to ``data['hits']`` with ``data``
        # unbound, turning the reported error into an UnboundLocalError.
        return []
    return data['hits']['hits']

In [117]:
# Match query: fetch the records whose Item_Type matches "Soft Drinks"
query = {"query": {"match": {"Item_Type": {"query": "Soft Drinks"}}}}
records = fetch_elastic_data(query)
records

[{'_index': 'market_data',
  '_type': '_doc',
  '_id': 'fdSsLXcB-15iC-jtLHKz',
  '_score': 4.709462,
  '_source': {'Item_Identifier': 'DRE60',
   'Item_Weight': 6.635,
   'Item_Fat_Content': 'low fat',
   'Item_Visibility': 0.278974075,
   'Item_Type': 'Soft Drinks',
   'Item_MRP': 225.372,
   'Outlet_Identifier': 'OUT019',
   'Outlet_Establishment_Year': 1985,
   'Outlet_Size': 'Small',
   'Outlet_Location_Type': 'Tier 1',
   'Outlet_Type': 'Grocery Store',
   'Item_Outlet_Sales': 679.1160000000001}},
 {'_index': 'market_data',
  '_type': '_doc',
  '_id': '8dSsLXcB-15iC-jtLHK1',
  '_score': 4.709462,
  '_source': {'Item_Identifier': 'DRJ13',
   'Item_Weight': 12.65,
   'Item_Fat_Content': 'Low Fat',
   'Item_Visibility': 0.062837968,
   'Item_Type': 'Soft Drinks',
   'Item_MRP': 161.5578,
   'Outlet_Identifier': 'OUT013',
   'Outlet_Establishment_Year': 1987,
   'Outlet_Size': 'High',
   'Outlet_Location_Type': 'Tier 3',
   'Outlet_Type': 'Supermarket Type1',
   'Item_Outlet_Sales': 2

**Note: the records are available under the '_source' key.**
You can also inspect the result in JSON format if you prefer.

# Display result in Tabular form
My choice is to convert result into pandas dataframe so that we can view it in tabular format

In [115]:
def show_result(elastic_result):
    """Build a DataFrame from the ``_source`` payload of every hit in ``elastic_result``."""
    sources = [hit['_source'] for hit in elastic_result]
    return pd.DataFrame(sources)
    

In [119]:
# Run the current query again and show the hits as a table
records = fetch_elastic_data(query)
df_frame = show_result(records)
df_frame.head(10) # Show at most the first 10 rows

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,DRE60,6.635,low fat,0.278974,Soft Drinks,225.372,OUT019,1985,Small,Tier 1,Grocery Store,679.116
1,DRJ13,12.65,Low Fat,0.062838,Soft Drinks,161.5578,OUT013,1987,High,Tier 3,Supermarket Type1,2406.867
2,DRH01,17.5,Low Fat,0.097904,Soft Drinks,174.8738,OUT046,1997,Small,Tier 1,Supermarket Type1,2085.2856
3,DRZ11,8.85,Regular,0.113124,Soft Drinks,122.5388,OUT018,2009,Medium,Tier 3,Supermarket Type2,1609.9044
4,DRF49,7.27,Low Fat,0.071078,Soft Drinks,114.2518,OUT046,1997,Small,Tier 1,Supermarket Type1,2618.5914
5,DRK01,7.63,Low Fat,0.061053,Soft Drinks,95.4436,OUT035,2004,Small,Tier 2,Supermarket Type1,1418.154
6,DRH37,17.6,Low Fat,0.041701,Soft Drinks,164.8526,OUT045,2002,Small,Tier 2,Supermarket Type1,2302.3364
7,DRI25,19.6,Low Fat,0.03397,Soft Drinks,55.1614,OUT045,2002,Medium,Tier 2,Supermarket Type1,1381.535
8,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
9,DRF36,16.1,LF,0.023625,Soft Drinks,189.3846,OUT045,2002,Medium,Tier 2,Supermarket Type1,3630.6074


# Same query can be concise
You can simplify the match query syntax by combining the `<field>` and `query` parameters.

In [120]:
# Shorthand match syntax: the field name maps straight to the search text
query = {"query": {"match": {"Item_Type": "Soft Drinks"}}}
records = fetch_elastic_data(query)
df_frame = show_result(records)
df_frame.head(10)  # Limit result

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,DRE60,6.635,low fat,0.278974,Soft Drinks,225.372,OUT019,1985,Small,Tier 1,Grocery Store,679.116
1,DRJ13,12.65,Low Fat,0.062838,Soft Drinks,161.5578,OUT013,1987,High,Tier 3,Supermarket Type1,2406.867
2,DRH01,17.5,Low Fat,0.097904,Soft Drinks,174.8738,OUT046,1997,Small,Tier 1,Supermarket Type1,2085.2856
3,DRZ11,8.85,Regular,0.113124,Soft Drinks,122.5388,OUT018,2009,Medium,Tier 3,Supermarket Type2,1609.9044
4,DRF49,7.27,Low Fat,0.071078,Soft Drinks,114.2518,OUT046,1997,Small,Tier 1,Supermarket Type1,2618.5914
5,DRK01,7.63,Low Fat,0.061053,Soft Drinks,95.4436,OUT035,2004,Small,Tier 2,Supermarket Type1,1418.154
6,DRH37,17.6,Low Fat,0.041701,Soft Drinks,164.8526,OUT045,2002,Small,Tier 2,Supermarket Type1,2302.3364
7,DRI25,19.6,Low Fat,0.03397,Soft Drinks,55.1614,OUT045,2002,Medium,Tier 2,Supermarket Type1,1381.535
8,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
9,DRF36,16.1,LF,0.023625,Soft Drinks,189.3846,OUT045,2002,Medium,Tier 2,Supermarket Type1,3630.6074


# Multi-match query:
The multi_match query builds on the match query to allow multi-field queries:

In [125]:
# Search for "Frozen Foods" in both listed fields with a single multi_match query
query = {
    "query": {
        "multi_match": {
            "query": "Frozen Foods",
            "fields": ["Item_Fat_Content", "Item_Type"],
        }
    }
}
records = fetch_elastic_data(query)
df_frame = show_result(records)
df_frame.head(10)  # Limit result

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDG52,13.65,Low Fat,0.065733,Frozen Foods,45.7402,OUT049,1999,Medium,Tier 1,Supermarket Type1,780.9834
1,FDU28,19.2,Regular,0.09445,Frozen Foods,187.8214,OUT017,2007,Medium,Tier 2,Supermarket Type1,4710.535
2,FDS52,8.89,low fat,0.005505,Frozen Foods,102.4016,OUT017,2007,Small,Tier 2,Supermarket Type1,2732.4432
3,FDX40,12.85,Low Fat,0.165694,Frozen Foods,39.7164,OUT010,1998,Small,Tier 3,Grocery Store,231.6984
4,FDD17,7.5,Low Fat,0.032678,Frozen Foods,239.0906,OUT049,1999,Medium,Tier 1,Supermarket Type1,5942.265
5,FDZ16,16.85,Regular,0.16076,Frozen Foods,192.4478,OUT017,2007,Medium,Tier 2,Supermarket Type1,4843.695
6,FDP28,13.65,Regular,0.134976,Frozen Foods,260.0936,OUT010,1998,Small,Tier 3,Grocery Store,260.9936
7,FDH28,15.85,Regular,0.110031,Frozen Foods,37.2506,OUT046,1997,Small,Tier 1,Supermarket Type1,265.6542
8,FDM40,10.195,Low Fat,0.159804,Frozen Foods,141.5154,OUT013,1987,High,Tier 3,Supermarket Type1,850.8924
9,FDQ28,14.0,Regular,0.060377,Frozen Foods,154.5656,OUT013,1987,High,Tier 3,Supermarket Type1,2471.4496
