In [1]:
import pandas as pd

In [3]:
# Load the product export; each row later becomes one search document.
# NOTE(review): absolute, machine-specific path — consider moving it to a
# configurable data directory.
PRODUCT_CSV_PATH = "/home/music/Downloads/view_b_product.csv"
df_search_product = pd.read_csv(PRODUCT_CSV_PATH)

In [108]:
# Missing BID values are replaced by the sentinel -1 so that the whole
# column can be cast to plain integers.
bid_without_gaps = df_search_product['BID'].fillna(-1)
df_search_product['BID'] = bid_without_gaps.astype(int)

In [4]:
# Product columns whose text is concatenated into the searchable blob.
# (The variable name keeps its original spelling because other cells use it.)
ls_searh_field = [
    'PRODUCT_NAME',
    'PREVIEW_TEXT',
    'TAGS',
    'BRAND_NAME',
    'TAGS_NAME',
]

In [2]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

In [139]:
# ---------------------------------------------------------------------------
# Custom token filters. Each dict maps the filter's registration name to its
# definition, so they can be merged straight into the index settings.
# ---------------------------------------------------------------------------

# Stop-word removal using the built-in Thai and English word lists.
stop_lang_filter = {
    "stop_lang": {"type": "stop", "stopwords": ["_thai_", "_english_"]},
}

# English stemmer.
english_stemmer_filter = {
    "english_stemmer": {"type": "stemmer", "language": "english"},
}

# English possessive stemmer.
english_possessive_stemmer_filter = {
    "english_possessive_stemmer": {"type": "stemmer", "language": "possessive_english"},
}

# ---------------------------------------------------------------------------
# Filter chains. Two flavours exist: a light chain that only normalises
# tokens, and a full chain (`ls_filter`) that also drops stop words and stems.
# ---------------------------------------------------------------------------
_light_chain = ["lowercase", "decimal_digit", "asciifolding", "classic"]

ls_filter = [
    "lowercase",
    "decimal_digit",
    "stop_lang",
    "asciifolding",
    "classic",
    "english_stemmer",
    "english_possessive_stemmer",
]

# ---------------------------------------------------------------------------
# Analyzers. All of them use the "classic" tokenizer; the `put_*` variants
# are their counterparts with a trailing "unique" filter appended.
# Every analyzer receives its own list object so that later mutation of one
# chain cannot leak into another.
# ---------------------------------------------------------------------------
search = {
    "search": {"tokenizer": "classic", "filter": list(_light_chain)},
}

full_search = {
    "full_search": {"tokenizer": "classic", "filter": list(ls_filter)},
}

full_put_search = {
    "full_put_search": {"tokenizer": "classic", "filter": ls_filter + ["unique"]},
}

put_search = {
    "put_search": {"tokenizer": "classic", "filter": _light_chain + ["unique"]},
}

# ---------------------------------------------------------------------------
# Index settings: merge every filter / analyzer definition into one
# "analysis" section.
# ---------------------------------------------------------------------------
setting = {
    "settings": {
        "analysis": {
            "filter": {
                filter_name: filter_body
                for group in (
                    stop_lang_filter,
                    english_stemmer_filter,
                    english_possessive_stemmer_filter,
                )
                for filter_name, filter_body in group.items()
            },
            "analyzer": {
                analyzer_name: analyzer_body
                for group in (search, put_search, full_put_search, full_search)
                for analyzer_name, analyzer_body in group.items()
            },
        }
    }
}

In [143]:
def _text_with_keyword():
    """Return a fresh ``text`` field definition carrying a ``keyword`` sub-field.

    The sub-field uses ``ignore_above: 256``. A new dict is built on every
    call so the fields of the mapping never share state.
    """
    return {
        "type": "text",
        "fields": {"keyword": {"type": "keyword", "ignore_above": 256}},
    }


# The main "search" field: analysed with `put_search`, and additionally
# exposed through one sub-field per custom analyzer (named after it).
_search_field = _text_with_keyword()
_search_field["analyzer"] = "put_search"
_search_field["fields"].update(
    (analyzer_name, {"type": "text", "analyzer": analyzer_name})
    for analyzer_name in ("full_put_search", "search", "full_search")
)

# Field mapping of the product search index.
mapping = {
    "mappings": {
        "properties": {
            "search": _search_field,
            "product_id": {"type": "keyword"},
            "original_search": _text_with_keyword(),
            "product_name": _text_with_keyword(),
            "brand_name": _text_with_keyword(),
            "brand_id": {"type": "keyword"},
        }
    }
}

In [59]:
# Elasticsearch client used by the cells that create and fill the indices.
# The URL carries an explicit scheme: elasticsearch-py 8.x rejects a bare
# "host:port" string, while 7.x accepts both forms.
# NOTE(review): hard-coded LAN address — consider reading it from configuration.
es = Elasticsearch("http://192.168.1.97:9200")

In [144]:
# Create the product search index on first run only; when the index already
# exists it is left untouched (its settings and mapping are not updated).
name_index = "search_product"
index_exists = es.indices.exists(index=name_index)
if not index_exists:
    # Analyzer settings and field mapping are sent together as one body.
    es.indices.create(index=name_index, body=dict(setting, **mapping))

  es.indices.create(index=name_index, body={**setting, **mapping})


In [15]:
from tqdm import tqdm
import numpy as np

In [145]:
# Build one search document per product row and send the documents to
# Elasticsearch through the bulk helper, 1000 at a time.
ls_bulks = []

for row in tqdm(range(len(df_search_product))):
    data = df_search_product.iloc[row]

    # Concatenate every non-missing searchable column into one text blob;
    # non-string values are stringified first.
    str_search = ' '.join(
        str(data[field]) for field in ls_searh_field if not pd.isna(data[field])
    )

    body = {
        # The positional row number doubles as the document id, so re-running
        # the cell overwrites documents instead of duplicating them.
        '_id': row,
        "search": str_search,
        "product_id": data['EID'],
        'original_search': str_search,
        'product_name': data['PRODUCT_NAME'],
        # pd.isna() instead of `is np.nan`: the identity test only matches the
        # np.nan singleton and silently misses any other NaN object.
        'brand_name': '' if pd.isna(data['BRAND_NAME']) else data['BRAND_NAME'],
        'brand_id': -1 if pd.isna(data['BID']) else data['BID'],
    }
    ls_bulks.append(body)

    # Flush a full batch.
    if len(ls_bulks) >= 1000:
        bulk(es, ls_bulks, index=name_index)
        ls_bulks = []

# Flush the final, partial batch. Without this the trailing
# len(df_search_product) % 1000 rows would never reach the index.
if ls_bulks:
    bulk(es, ls_bulks, index=name_index)
    ls_bulks = []


100%|██████████| 30030/30030 [00:19<00:00, 1557.12it/s]


In [27]:
# Load the raw review export and list its columns for orientation.
# NOTE(review): absolute, machine-specific path — consider moving it to a
# configurable data directory.
REVIEW_CSV_PATH = "/home/music/Desktop/measure_model/db_cosmenet/review.csv"
# low_memory=False: parse the file in one pass instead of in chunks, which
# avoids mixed-type column inference.
df_rev = pd.read_csv(REVIEW_CSV_PATH, low_memory=False)
df_rev.columns

Index(['ID', 'EID', 'BID', 'ACTION', 'COMMENT', 'REGULAR', 'ACTUAL',
       'DATE_CREATE', 'modify_date', 'TIMESTAMP_TOP', 'UID', 'AUTHOR_IP',
       'AUTHOR_REAL_IP', 'COMMENT_IMG', 'COMMENT_IMG_2', 'COMMENT_IMG_3',
       'COMMENT_IMG_4', 'COMMENT_IMG_5', 'REVIEW_SHOW', 'REVIEW_WHEN', 'POINT',
       'ENAME', 'EIMG', 'BRAND', 'TYPE', 'TYPE_ID', 'UNAME', 'UIMG',
       'LOCATION', 'EFFECT', 'D_USER', 'FID', 'campaing_name', 'SEARCH_TXT'],
      dtype='object')

In [28]:
# Keep only the three columns that get indexed, and only the reviews that
# actually contain comment text. The .copy() detaches the result from df_rev.
has_comment = df_rev['COMMENT'].notna()
meta_review_put = df_rev.loc[has_comment, ['EID', 'COMMENT', 'POINT']].copy()
meta_review_put.head(1)

Unnamed: 0,EID,COMMENT,POINT
0,40039,ดินสอเขียนคิ้ว เมลินดา ตัวนี้ใช้มาหลายแท่งมากๆ...,5


In [21]:
# Field name -> Elasticsearch field type for the review index.
_review_field_types = (
    ("product_id", "keyword"),
    ("comment", "text"),
    ("point", "integer"),
)

# Mapping of the review index: every field is declared with its type only,
# no custom analyzers or sub-fields.
mapping_review = {
    "mappings": {
        "properties": {
            field_name: {"type": field_type}
            for field_name, field_type in _review_field_types
        }
    }
}

In [29]:
# Elasticsearch client used to create and fill the review index.
# The URL carries an explicit scheme: elasticsearch-py 8.x rejects a bare
# "host:port" string, while 7.x accepts both forms.
# NOTE(review): hard-coded LAN address — consider reading it from configuration.
es = Elasticsearch("http://192.168.1.97:9200")

In [30]:
# Create the review index on first run only; an existing index is left as is.
# NOTE: this rebinds `name_index`, so cells executed afterwards target the
# review index.
name_index = "review_product"
index_exists = es.indices.exists(index=name_index)
if not index_exists:
    es.indices.create(index=name_index, body=dict(mapping_review))

  es.indices.create(index=name_index, body={**mapping_review})


In [31]:
# Index every review as its own document, handing the accumulated documents
# to the bulk helper each time 1000 of them have piled up.
ls_bulks_review = []

for row in tqdm(range(len(meta_review_put))):
    data = meta_review_put.iloc[row]

    # The positional row number serves as the document id.
    ls_bulks_review.append(
        {
            '_id': row,
            "product_id": data['EID'],
            "comment": data['COMMENT'],
            'point': data['POINT'],
        }
    )

    # Exactly one document is added per iteration, so the buffer reaches
    # 1000 entries on rows 999, 1999, ...
    if len(ls_bulks_review) == 1000:
        bulk(es, ls_bulks_review, index=name_index)
        ls_bulks_review = []

# Whatever is left over forms the last, shorter batch.
if ls_bulks_review:
    bulk(es, ls_bulks_review, index=name_index)

100%|██████████| 156915/156915 [00:39<00:00, 4008.73it/s]
