In [44]:
import pandas as pd
import numpy as np
import re,json

import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.csv as pc
import pandas as pd

from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

### 테스트 데이터 load

In [45]:

tmp_data = [[153709120,'갤럭시 중고폰'],[153709376,'연예인 지갑 지갑'],[153710656,'갤럭시s10 갤럭시북 NT950XDZ-G58AW 중고폰']]
tmp = pd.DataFrame(tmp_data, columns=['id','name'])

tmp.head()

Unnamed: 0,id,name
0,153709120,갤럭시 중고폰
1,153709376,연예인 지갑 지갑
2,153710656,갤럭시s10 갤럭시북 NT950XDZ-G58AW 중고폰


## 1. 토큰화

In [46]:
def tokenizer(data):
    token=[]

    data = data.lower() #소문자로 변환
    words = data.split() #공백으로 분리
    #print("split ", words)

    #규칙에 해당 -> findall
    p = re.compile("[가-힣]+|[ㄱ-ㅎ|ㅏ-ㅣ]+|[a-z0-9-]+|[^ a-z0-9가-힣+]") #규칙

    for word in words:
        find = re.findall(p,word)
        for w in find:
            #token=token+w+' '
            token.append(w)
    return token

In [47]:
tmp['token']=tmp['name'].apply(tokenizer)
tmp.head()

Unnamed: 0,id,name,token
0,153709120,갤럭시 중고폰,"[갤럭시, 중고폰]"
1,153709376,연예인 지갑 지갑,"[연예인, 지갑, 지갑]"
2,153710656,갤럭시s10 갤럭시북 NT950XDZ-G58AW 중고폰,"[갤럭시, s10, 갤럭시북, nt950xdz-g58aw, 중고폰]"


## 2.inverted index -> 파일(.parquet)생성

In [48]:
js = tmp.to_json(orient = 'records')
json_data =json.loads(js)
json_data

[{'id': 153709120, 'name': '갤럭시 중고폰', 'token': ['갤럭시', '중고폰']},
 {'id': 153709376, 'name': '연예인 지갑 지갑', 'token': ['연예인', '지갑', '지갑']},
 {'id': 153710656,
  'name': '갤럭시s10 갤럭시북 NT950XDZ-G58AW 중고폰',
  'token': ['갤럭시', 's10', '갤럭시북', 'nt950xdz-g58aw', '중고폰']}]

In [78]:

index_dict=defaultdict(list)

for data in json_data:
    for token in data['token']:
            index_dict[token].append(data['id'])

print(index_dict)

p=pd.DataFrame(list(index_dict.items()),columns=['token', 'docu_list'])

defaultdict(<class 'list'>, {'갤럭시': [153709120, 153710656], '중고폰': [153709120, 153710656], '연예인': [153709376], '지갑': [153709376, 153709376], 's10': [153710656], '갤럭시북': [153710656], 'nt950xdz-g58aw': [153710656]})


In [79]:
table = pa.Table.from_pandas(p)
pq.write_table(table, './test_data/test_index.parquet')

In [80]:
index_df = pq.read_table('./test_data/test_index.parquet').to_pandas()
index_df.head()

Unnamed: 0,token,docu_list
0,갤럭시,"[153709120, 153710656]"
1,중고폰,"[153709120, 153710656]"
2,연예인,[153709376]
3,지갑,"[153709376, 153709376]"
4,s10,[153710656]


## 3.tf-idf 적용 (TfidfVectorizer ) -> 파일(.parquet)생성

In [65]:
## token 많아질수록 벡터의 차원이 커지는 문제
## 가장 많이 나온 단어 n개만 사용하는 max_features 파라미터 : TfidfVectorizer(max_features=4)
## all_teokn = tmp['token ] : list

vect2 = TfidfVectorizer(tokenizer=tokenizer)
tfvect_matrix = vect2.fit_transform(tmp['name'])
tfvect_matrix.toarray()

array([[0.        , 0.        , 0.70710678, 0.        , 0.        ,
        0.70710678, 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.4472136 ,
        0.        , 0.89442719],
       [0.49047908, 0.49047908, 0.37302199, 0.49047908, 0.        ,
        0.37302199, 0.        ]])

In [66]:
tfvect_matrix.shape #문서id X token

(3, 7)

In [67]:
vect2.idf_

array([1.69314718, 1.69314718, 1.28768207, 1.69314718, 1.69314718,
       1.28768207, 1.69314718])

### TfidfVectorizer로 적용하면, feature name이 일부변경되는 문제
- 예를들어, 토큰값 nt950xdz-g58aw 값이 g58aw,nt950x로 분리...
- 원인 : 사이킷런 내부에서 지정한 단어사전으로 만들어서
- 해결 : TfidfVectorizer의 tokenizer옵션 적용

In [68]:
tfidf_col = vect2.get_feature_names() 
tfidf_col

['nt950xdz-g58aw', 's10', '갤럭시', '갤럭시북', '연예인', '중고폰', '지갑']

In [69]:
## tf-idf적용한 결과 df로 생성 

tfidv_df = pd.DataFrame(tfvect_matrix.toarray(), index = list(tmp['id']), columns = sorted(tfidf_col))
tfidv_df

Unnamed: 0,nt950xdz-g58aw,s10,갤럭시,갤럭시북,연예인,중고폰,지갑
153709120,0.0,0.0,0.707107,0.0,0.0,0.707107,0.0
153709376,0.0,0.0,0.0,0.0,0.447214,0.0,0.894427
153710656,0.490479,0.490479,0.373022,0.490479,0.0,0.373022,0.0


In [70]:
table = pa.Table.from_pandas(tfidv_df)
pq.write_table(table, './test_data/test_tfidv.parquet')

In [75]:
index_df = pq.read_table('./test_data/test_tfidv.parquet').to_pandas()
index_df.head()

Unnamed: 0,nt950xdz-g58aw,s10,갤럭시,갤럭시북,연예인,중고폰,지갑
153709120,0.0,0.0,0.707107,0.0,0.0,0.707107,0.0
153709376,0.0,0.0,0.0,0.0,0.447214,0.0,0.894427
153710656,0.490479,0.490479,0.373022,0.490479,0.0,0.373022,0.0


## 4. 검색어(query) 테스트

- 입력한 키워드 토큰화 리스트 : q_token
- reverted_index로 입력한 quey가 있는 문서id 리스트 리턴  
- 문서들의 교집합을 찾음 : q_documents 
<br><br>
- 교집합 문서들의 name : search_dc
- 교집합 문서들의 td-idf(score) : search_tf
- search_dc와 search_tf merge : search
- score기준 내림차순 정렬 
<br><br>
- 최종결과 response message

In [72]:
q = "아이폰 중고폰"
#입력한 키워드 토큰화 
token_list = tokenizer(q)
print(token_list)

['아이폰', '중고폰']


In [73]:
# index_dict 그대로 사용한 ver 

# query가 들어있는 문서id
q_documents=[]
for tk in token_list:
    q_documents.append(set(index_dict[tk])) 
print(q_documents)

# 문서들의 교집합 
query_documents = list(q_documents[0].intersection(*q_documents))
print(query_documents)

[set(), {153709120, 153710656}]
[]


### token에 없는 키워드로 검색했을때 문제 
- 키워드로 검색했는데 token에 없는 경우
- 키워드로 검색한 토큰리스트 중에 일부만 token에 있는 경우도 있을 수 있어서 처리 ->new_token_list생성
- 문서들의 교집합 없을 수도 있어서 처리 

In [77]:
index_df

Unnamed: 0,nt950xdz-g58aw,s10,갤럭시,갤럭시북,연예인,중고폰,지갑
153709120,0.0,0.0,0.707107,0.0,0.0,0.707107,0.0
153709376,0.0,0.0,0.0,0.0,0.447214,0.0,0.894427
153710656,0.490479,0.490479,0.373022,0.490479,0.0,0.373022,0.0


In [76]:
# 0831 : ndex_dict 파일로 바꾼 ver - 없는 키워드 입력 고려 

search_token =index_df[index_df['token'].isin(token_list)]
#print(search_token)

new_token_list = list(search_token['token'])
print(new_token_list) 

if len(new_token_list) == 0:
    print("없는 키워드")

q_documents=[]
for docu_list in search_token['docu_list']:
    q_documents.append(set(docu_list))

# 문서들의 교집합 
query_documents = list(q_documents[0].intersection(*q_documents))
print(len(query_documents))

if len(query_documents) == 0:
    print("교집합없음")

KeyError: 'token'

In [42]:
#교집합 문서들의 id,name
search_dc = tmp[tmp['id'].isin(query_documents)]
search_dc=search_dc.set_index('id')
search_dc

#교집합 문서들에 대해서 tf-dif(score값)
#tfidv_df.loc[query_documents] #교집합문서
search_tf = tfidv_df.loc[query_documents][new_token_list] ## 여기에서 token_list에서 tokne에 없는게 있음 

search_tf['score'] = search_tf.sum(axis=1)
search_tf

# search_dc와 search_tf join (by id)
search=search_tf.join(search_dc,how='inner')
search['pid']=search.index
search.sort_values(by=['score'],ascending=[False],inplace=True) #score기준 정렬
search

Unnamed: 0_level_0,name,token
id,Unnamed: 1_level_1,Unnamed: 2_level_1
153709120,갤럭시 중고폰,"[갤럭시, 중고폰]"
153710656,갤럭시s10 갤럭시북 NT950XDZ-G58AW 중고폰,"[갤럭시, s10, 갤럭시북, nt950xdz-g58aw, 중고폰]"


Unnamed: 0,중고폰,score
153709120,0.707107,0.707107
153710656,0.373022,0.373022


Unnamed: 0,중고폰,score,name,token,pid
153709120,0.707107,0.707107,갤럭시 중고폰,"[갤럭시, 중고폰]",153709120
153710656,0.373022,0.373022,갤럭시s10 갤럭시북 NT950XDZ-G58AW 중고폰,"[갤럭시, s10, 갤럭시북, nt950xdz-g58aw, 중고폰]",153710656


In [43]:
# 최종결과 
response = search[['pid','name','score']]
response

# response msg
js = response.to_json(orient='records')
res_data =json.loads(js)
res_data

Unnamed: 0,pid,name,score
153709120,153709120,갤럭시 중고폰,0.707107
153710656,153710656,갤럭시s10 갤럭시북 NT950XDZ-G58AW 중고폰,0.373022


[{'pid': 153709120, 'name': '갤럭시 중고폰', 'score': 0.7071067812},
 {'pid': 153710656,
  'name': '갤럭시s10 갤럭시북 NT950XDZ-G58AW 중고폰',
  'score': 0.3730219859}]