In [1]:
import pandas as pd
from sqlalchemy import create_engine
import yaml
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [2]:
# Load the DB settings. The context manager closes the file handle, which the
# previous bare open() call left for the garbage collector to clean up.
with open('dart_config.yml') as config_file:
    config = yaml.safe_load(config_file)

# NOTE(review): the credentials are concatenated into the URL unescaped, so a
# user name or password containing '@', ':' or '/' would yield a malformed URL.
db_url = 'mysql://' + config['DB_USER'] + ':' + config['DB_USER_PASSWORD'] + '@' + config['DB_HOST'] \
                            + '/' + config['DB_NAME'] + '?charset=utf8'
# NOTE(review): create_engine's `encoding` argument is deprecated in
# SQLAlchemy 1.4 and removed in 2.0 -- confirm the pinned version before upgrading.
engine = create_engine(db_url, encoding='utf-8')
conn = engine.connect()

# Read only the `name` column of the company table, then keep each distinct
# name once (first occurrence wins) with a fresh 0..n-1 index.
company_list = pd.read_sql_table(config['DB_TABLE_NAME_1'], con=conn,
                          columns=['name'])['name']
company_list = company_list.drop_duplicates().reset_index(drop=True)
company_list.head()

0    롯데케미칼
1     대한유화
2     이화산업
3    디케이앤디
4     송원산업
Name: name, dtype: object

In [3]:
# Load the trade records from CSV. HSCD, BYRADDR2 and SPLYADDR2 are forced to
# str, QTY to float, and ISSUEDT is parsed as a datetime column.
trade_dtypes = {'HSCD': str, "QTY": float, 'BYRADDR2': str, 'SPLYADDR2': str}
trade_df = pd.read_csv(
    "./data/trade.csv",
    encoding="utf-8",
    parse_dates=['ISSUEDT'],
    dtype=trade_dtypes,
)
trade_df.head()

Unnamed: 0,HSCD,QTY,QTYUNIT,AMT,CUR,USDUNTPRICE,UNTPRICE,ITEM,BYRORGNM1,BYRPRTNUM,...,BYRADDR2,SPLYORGNM1,SPLYPRTNUM,SPLYADDR1,SPLYADDR2,ISSUEDT,TOTQTY,TOTQTYUNIT,TOTAMT,TOTAMTCUR
0,7210491090,84000.0,KG,45360.0,USD,540.0,540.0,GI COIL,주식회사 포스코대우,1318555320,...,주식회사 포스코대우,POSCO,5068100017,"1 GOEDONG-DONG, NAM-GU",,2018-02-27,565475000.0,KG,257732900.0,USD
1,7210491090,1463000.0,KG,821355.0,USD,585.0,585.0,GI COIL,주식회사 포스코대우,1318555320,...,주식회사 포스코대우,POSCO,5068100017,"1 GOEDONG-DONG, NAM-GU",,2018-02-27,565475000.0,KG,257732900.0,USD
2,7210491090,600000.0,KG,398580.0,USD,664.3,664.3,HG COIL - POSCO,주식회사 포스코대우,1318555320,...,주식회사 포스코대우,POSCO,5068100017,"1 GOEDONG-DONG, NAM-GU",,2018-02-27,565475000.0,KG,257732900.0,USD
3,7225509000,2398000.0,KG,416301.78,USD,501.15,501.15,CR COIL (0.5MM<=T<1.0MM),주식회사 포스코대우,1318555320,...,주식회사 포스코대우,POSCO,5068100017,"1 GOEDONG-DONG, NAM-GU",,2018-02-27,565475000.0,KG,257732900.0,USD
4,7209169000,23100000.0,KG,8134262.0,USD,490.0,490.0,CR COIL (3MM<=T),주식회사 포스코대우,1318555320,...,주식회사 포스코대우,POSCO,5068100017,"1 GOEDONG-DONG, NAM-GU",,2018-02-27,565475000.0,KG,257732900.0,USD


In [4]:
# Normalise buyer names: remove the corporate-form markers "주식회사", "(주)",
# "(유)", "㈜" and every space character.
# All patterns are plain literals, so regex=False is passed explicitly. The
# earlier version wrote the parentheses as "\(" in non-raw strings and relied
# on str.replace treating the pattern as a regex; where the default is
# regex=False (pandas >= 2.0) those patterns no longer match "(주)" / "(유)".
buyer_list = trade_df['BYRORGNM1']
for marker in ("주식회사", "(주)", "(유)", "㈜", " "):
    buyer_list = buyer_list.str.replace(marker, "", regex=False)
buyer_list.head()

0    포스코대우
1    포스코대우
2    포스코대우
3    포스코대우
4    포스코대우
Name: BYRORGNM1, dtype: object

In [5]:
# Normalise supplier names: remove the corporate-form markers "주식회사", "(주)",
# "(유)", "㈜" and every space character.
# All patterns are plain literals, so regex=False is passed explicitly. The
# earlier version wrote the parentheses as "\(" in non-raw strings and relied
# on str.replace treating the pattern as a regex; where the default is
# regex=False (pandas >= 2.0) those patterns no longer match "(주)" / "(유)".
supplier_list = trade_df['SPLYORGNM1']
for marker in ("주식회사", "(주)", "(유)", "㈜", " "):
    supplier_list = supplier_list.str.replace(marker, "", regex=False)
supplier_list.head()

0    POSCO
1    POSCO
2    POSCO
3    POSCO
4    POSCO
Name: SPLYORGNM1, dtype: object

In [6]:
# Report how many entries each of the three name sources holds.
size_report = (
    'company list:', len(company_list),
    'buyer list:', len(buyer_list),
    'supplier list:', len(supplier_list),
)
print(*size_report)

company list: 796 buyer list: 2492786 supplier list: 2492786


In [7]:
# Stack company, buyer and supplier names (in that order) into one series with
# a fresh 0..n-1 index. pd.concat replaces the chained Series.append calls:
# Series.append was deprecated in pandas 1.4 and removed in 2.0, and
# concatenating once produces the same result.
merge_list = pd.concat([company_list, buyer_list, supplier_list], ignore_index=True)

In [8]:
# Total number of entries in the merged name series.
len(merge_list)

4986368

In [9]:
# Fit a TF-IDF model on character 2-grams and 3-grams of every merged name and
# vectorise the same names in one pass.
char_ngram_options = {'analyzer': 'char', 'ngram_range': (2, 3)}
tfidf = TfidfVectorizer(**char_ngram_options)
tfidf_matrix = tfidf.fit_transform(merge_list)

In [10]:
# Dimensions of the TF-IDF matrix: (rows = vectorised names, columns = n-gram features).
tfidf_matrix.shape

(4986368, 92828)

In [11]:
# Choose the query name by its position in merge_list, then take the dot
# product of its TF-IDF row with every row of the matrix.
company_i = 0
query_vector = tfidf_matrix[company_i]
cosine_sim = linear_kernel(query_vector, tfidf_matrix)
print(merge_list[company_i])

롯데케미칼


In [13]:
# Result comparing: list every name whose similarity to the query is at least
# SIM_THRESHOLD, best match first, as "rank index name score".
SIM_THRESHOLD = 0.4
scores = cosine_sim[0]

# Stable ascending argsort, then reversed. This yields the same ordering (ties
# included) as the former sorted(range(n), key=...) followed by reverse(), but
# the sort runs inside numpy instead of calling a Python lambda per element.
sorted_index = scores.argsort(kind='mergesort')[::-1].tolist()

for rank, idx in enumerate(sorted_index):
    # Test the cut-off before printing; the earlier loop printed first and so
    # always emitted one row that was already below the threshold.
    if scores[idx] < SIM_THRESHOLD:
        break
    print(str(rank), str(idx), merge_list[idx], str(scores[idx]))

0 4850826 롯데케미칼 0.9999999999999998
1 4850757 롯데케미칼 0.9999999999999998
2 4850756 롯데케미칼 0.9999999999999998
3 4850755 롯데케미칼 0.9999999999999998
4 4850754 롯데케미칼 0.9999999999999998
5 4850753 롯데케미칼 0.9999999999999998
6 4850749 롯데케미칼 0.9999999999999998
7 4850748 롯데케미칼 0.9999999999999998
8 4850747 롯데케미칼 0.9999999999999998
9 4850746 롯데케미칼 0.9999999999999998
10 4850745 롯데케미칼 0.9999999999999998
11 4850736 롯데케미칼 0.9999999999999998
12 4850735 롯데케미칼 0.9999999999999998
13 4850734 롯데케미칼 0.9999999999999998
14 4850733 롯데케미칼 0.9999999999999998
15 4850732 롯데케미칼 0.9999999999999998
16 4850731 롯데케미칼 0.9999999999999998
17 4850730 롯데케미칼 0.9999999999999998
18 4850715 롯데케미칼 0.9999999999999998
19 4850714 롯데케미칼 0.9999999999999998
20 4850710 롯데케미칼 0.9999999999999998
21 4850709 롯데케미칼 0.9999999999999998
22 4850708 롯데케미칼 0.9999999999999998
23 4850706 롯데케미칼 0.9999999999999998
24 4850705 롯데케미칼 0.9999999999999998
25 4850704 롯데케미칼 0.9999999999999998
26 4850703 롯데케미칼 0.9999999999999998
27 4850702 롯데케미칼 0.9999999999999998
28

644 1990328 롯데케미칼 0.9999999999999998
645 1990327 롯데케미칼 0.9999999999999998
646 1990326 롯데케미칼 0.9999999999999998
647 1990275 롯데케미칼 0.9999999999999998
648 1990274 롯데케미칼 0.9999999999999998
649 1990261 롯데케미칼 0.9999999999999998
650 1990260 롯데케미칼 0.9999999999999998
651 1990259 롯데케미칼 0.9999999999999998
652 1990258 롯데케미칼 0.9999999999999998
653 1990257 롯데케미칼 0.9999999999999998
654 1990256 롯데케미칼 0.9999999999999998
655 1990255 롯데케미칼 0.9999999999999998
656 1990254 롯데케미칼 0.9999999999999998
657 1990253 롯데케미칼 0.9999999999999998
658 1990252 롯데케미칼 0.9999999999999998
659 1990248 롯데케미칼 0.9999999999999998
660 1990247 롯데케미칼 0.9999999999999998
661 1990246 롯데케미칼 0.9999999999999998
662 1990245 롯데케미칼 0.9999999999999998
663 1990244 롯데케미칼 0.9999999999999998
664 1990243 롯데케미칼 0.9999999999999998
665 1990242 롯데케미칼 0.9999999999999998
666 1990241 롯데케미칼 0.9999999999999998
667 1990240 롯데케미칼 0.9999999999999998
668 1990239 롯데케미칼 0.9999999999999998
669 1990236 롯데케미칼 0.9999999999999998
670 1990235 롯데케미칼 0.9999999999999998
6

In [39]:
# NOTE(review): `test_matrix` and `test_name` are not defined in any cell
# visible here, and the execution counter jumps from 13 to 39 -- this cell
# appears to rely on cells that were removed or run out of order. TODO: restore
# their definitions so the notebook survives Restart & Run All.
company_i = 2
# Score the selected test row against every row of tfidf_matrix.
# NOTE(review): presumably test_matrix was produced by the same fitted
# vectoriser so its columns line up with tfidf_matrix -- confirm.
cosine_sim = linear_kernel(test_matrix[company_i], tfidf_matrix)
print(test_name[company_i])

dk&d


In [40]:
# Print the 100 highest-scoring entries, best first, as "rank index name score".
# NOTE(review): `csv_company_series` is not defined in any cell visible here -- confirm its source.
scores = cosine_sim[0]
ascending = sorted(range(len(scores)), key=lambda pos: scores[pos])
sorted_index = ascending[-100:][::-1]
for rank, idx in enumerate(sorted_index):
    print(str(rank), str(idx), csv_company_series[idx], str(scores[idx]))

0 69325 주식회사 디케이앤디 (DK&D CO.LTD) 0.4857910369292537
1 68821 주식회사 디케이앤디(DK&D Co.Ltd) 0.4857910369292537
2 59802 주식회사 디케이앤디(DK&D CO.Ltd) 0.4857910369292537
3 46092 K&C 0.2529445494267581
4 4030 K&J 0.23562164160311044
5 48604 주식회사 DKS 0.1993078875100243
6 97659 DKR 0.19073495492699127
7 26947 DKCS 0.16034756490524588
8 83710 K&B상사 0.15988331991986116
9 69913 K & B상사 0.15988331991986116
10 53193 DK상사 0.15365185711802878
11 13329 DK섬유 0.14710257071660907
12 18538 동신E&D 0.14653898290149303
13 54742 DK동신주식회사 0.14363428710612208
14 9307 DK글로벌 0.14361452036587688
15 53789 신흥F&D 0.14264290404957086
16 37092 (유)성지F&D 0.1421697633855379
17 69694 (주)대도F&D 0.14195148542072344
18 14655 DK 강업  0.14087353496409583
19 71145 청해S&D 0.13969621625075196
20 69335 평화 M&D 0.13939114531895824
21 18201 가람D&D 0.13865147326419763
22 93943 한림 W&D 0.13792107972818643
23 81165 한림W&D 0.13792107972818643
24 54923 dk메디칼 0.13529022340477692
25 788 DK솔루션 0.13488729211184328
26 68653 엠제이R&D 0.13416354598019967
27 79581 K&

In [43]:
# Load the company table with `id` as the index and `name` as the only column.
company_table = config['DB_TABLE_NAME_1']
db_company_df = pd.read_sql_table(company_table, con=conn, index_col='id', columns=['name'])
db_company_df.head()

Unnamed: 0_level_0,name
id,Unnamed: 1_level_1
1,롯데케미칼
2,대한유화
3,이화산업
4,디케이앤디
5,송원산업


In [45]:
# Distinct company names in first-seen order, re-indexed from 0.
unique_names = db_company_df['name'].drop_duplicates()
company_series = unique_names.reset_index(drop=True)
company_series.head()

0    롯데케미칼
1     대한유화
2     이화산업
3    디케이앤디
4     송원산업
Name: name, dtype: object