In [26]:
import pandas as pd
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk, BulkIndexError

In [27]:
# 데이터 읽기
first_purchase_path = r"/home/user1/project/shop_Modeling/data/e-commerce_data/client_first_purchase_date.csv"
first_purchase = pd.read_csv(first_purchase_path)

In [28]:
# 데이터프레임의 크기 확인
print("데이터프레임의 크기:", first_purchase.shape)

데이터프레임의 크기: (1854736, 2)


In [29]:
# 중복된 행 제거
first_purchase.drop_duplicates(inplace=True)

In [30]:
# 데이터프레임의 크기 확인
print("데이터프레임의 크기:", first_purchase.shape)

데이터프레임의 크기: (1854736, 2)


In [31]:
first_purchase.head()

Unnamed: 0,client_id,first_purchase_date
0,1515915625761548908,2022-03-04
1,1515915625638660959,2022-04-06
2,1515915625490249330,2022-03-03
3,1515915625728802311,2021-12-20
4,1515915625915446913,2023-01-16


In [32]:
column_types = first_purchase.dtypes
print(column_types)

client_id               int64
first_purchase_date    object
dtype: object


In [33]:
# # Elasticsearch 서버 연결
es = Elasticsearch([{'host': 'localhost', 'port': 9200, 'scheme': 'http'}])

In [34]:
# Elasticsearch 서버에 ping 요청 보내기
if es.ping():
    print("Elasticsearch 서버에 성공적으로 연결되었습니다.")
else:
    print("Elasticsearch 서버에 연결하지 못했습니다.")

Elasticsearch 서버에 성공적으로 연결되었습니다.


In [35]:
# # 인덱스가 이미 존재하는 경우 삭제
# if es.indices.exists(index="e_campaigns"):
#     es.indices.delete(index="e_campaigns")

In [36]:
# Elasticsearch에 색인
e_first_purchase = 'e_first_purchase'  # 원하는 색인명으로 변경

# Elasticsearch 인덱스 매핑 설정
mapping = {
    "mappings": {
        "properties": {
            "client_id" : {"type": "integer"},
            "first_purchase_date" : {"type": "date", "format": "yyyy-MM-dd"},  # 날짜 필드
        }
    }
}

In [37]:
es.indices.create(index="e_first_purchase", body=mapping, ignore=400)

  es.indices.create(index="e_first_purchase", body=mapping, ignore=400)


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'e_first_purchase'})

In [38]:
# Elasticsearch에 이미 색인된 데이터 삭제
es.indices.delete(index="e_first_purchase", ignore=[400, 404])

  es.indices.delete(index="e_first_purchase", ignore=[400, 404])


ObjectApiResponse({'acknowledged': True})

In [39]:
# 큰 CSV 파일을 청크로 나누어 읽고 Elasticsearch에 색인
chunk_size = 1000

for chunk in pd.read_csv(first_purchase_path, encoding='utf-8', chunksize=chunk_size):
    documents = []

    # 각 청크에 대한 처리
    for _, row in chunk.iterrows():
        document = row.to_dict()

        # NaN이 아닌 경우에만 날짜를 ISO 형식으로 변환
        for key, value in document.items():
            if pd.notna(value) and key.endswith('_at'):
                document[key] = pd.to_datetime(value).isoformat()

        # NaN을 None으로 대체
        document = {key: (None if pd.isna(value) else value) for key, value in document.items()}

        documents.append({
            "_op_type": "index",
            "_index": e_first_purchase,
            "_source": document
        })

    # Elasticsearch에 색인
    try:
        success, failed = bulk(es, documents)
        print(f"Indexed {success} documents successfully.")
        if failed:
            for i, failure in enumerate(failed):
                print(f"Failed to index document {i + 1}: {failure}")
    except BulkIndexError as e:
        print(f"Error indexing documents: {e}")

print("색인이 완료되었습니다.")

Indexed 1000 documents successfully.
Indexed 1000 documents successfully.
Indexed 1000 documents successfully.
Indexed 1000 documents successfully.
Indexed 1000 documents successfully.
Indexed 1000 documents successfully.
Indexed 1000 documents successfully.
Indexed 1000 documents successfully.
Indexed 1000 documents successfully.
Indexed 1000 documents successfully.
Indexed 1000 documents successfully.
Indexed 1000 documents successfully.
Indexed 1000 documents successfully.
Indexed 1000 documents successfully.
Indexed 1000 documents successfully.
Indexed 1000 documents successfully.
Indexed 1000 documents successfully.
Indexed 1000 documents successfully.
Indexed 1000 documents successfully.
Indexed 1000 documents successfully.
Indexed 1000 documents successfully.
Indexed 1000 documents successfully.
Indexed 1000 documents successfully.
Indexed 1000 documents successfully.
Indexed 1000 documents successfully.
Indexed 1000 documents successfully.
Indexed 1000 documents successfully.
I