In [445]:
import pymysql.cursors
import pandas as pd
import numpy as np
import connect_to_db as cn
import json
from itertools import combinations
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import csv
import parmap

In [446]:
# Run a SELECT query and return the fetched rows as a DataFrame.
def select_query_result_to_df(sql, params=None):
    """Execute ``sql`` on a fresh connection and return every row as a DataFrame.

    Args:
        sql: SQL text to execute.
        params: Optional arguments forwarded to ``cursor.execute`` for
            placeholder substitution. The default ``None`` matches the
            original single-argument call.

    Returns:
        pandas.DataFrame built from the rows returned by a ``DictCursor``.
        An empty result set yields a DataFrame with no columns.
    """
    connection = cn.make_connection('connection.csv')
    try:
        cursor = connection.cursor(pymysql.cursors.DictCursor)
        cursor.execute(sql, params)
        result = cursor.fetchall()
    finally:
        # Release the connection even when the query fails; this helper is
        # called once per author, so a leak here adds up quickly.
        connection.close()

    return pd.DataFrame(result)

In [447]:
def check_body(document):
    """Drop texts that carry no real words, in place, and return the same list.

    A text is removed when it contains no ASCII letter at all (only symbols,
    digits, emoji, ...) or when, after replacing every non-letter with a space
    and stripping, a single letter is all that remains (e.g. ``:p``).
    """
    has_letter = re.compile('[a-zA-Z]')

    def keep(text):
        # No letter at all: symbol-only / emoji-only text.
        if has_letter.search(text) is None:
            return False
        # Exactly one letter surrounded by symbols, such as ":p".
        letters_only = re.sub('[^a-zA-Z]', ' ', text).strip()
        return len(letters_only) != 1

    # Slice assignment keeps the caller's list object and mutates it in place.
    document[:] = [text for text in document if keep(text)]
    return document

In [448]:
def tf_idf_similarity(document):
    """Return the mean pairwise TF-IDF similarity of the texts in ``document``.

    The texts are first filtered by ``check_body``. Each remaining text is
    turned into a TF-IDF row vector, and the score is the average dot product
    over all unordered pairs of distinct texts (with TfidfVectorizer's default
    L2 normalisation this is the cosine similarity).

    Args:
        document: list of text strings (comment bodies or post titles).

    Returns:
        -1 if no text is left after filtering,
        -2 if exactly one text is left (no pair to compare),
        otherwise the mean pairwise similarity as a float.
    """
    # Local import: scipy is only needed here and the file does not import it.
    from scipy.sparse import triu

    document = check_body(document)

    # Nothing left to compare: signal with -1.
    if not document:
        return -1

    # Filtering left a single text, so there is no pair; signal with -2.
    if len(document) == 1:
        return -2

    # NOTE(review): per the scikit-learn docs, fit_transform raises ValueError
    # on an empty vocabulary (e.g. only one-character tokens) - confirm whether
    # that case needs its own sentinel.
    tfidf_vectorizer = TfidfVectorizer(min_df=1)
    tfidf_matrix = tfidf_vectorizer.fit_transform(document)

    # '@' is always a matrix product, for sparse matrices and sparse arrays.
    document_distances = tfidf_matrix @ tfidf_matrix.T

    # Sum only the entries above the diagonal (i < j) while staying sparse,
    # instead of densifying the n x n matrix and looping over it in Python.
    doc_count = document_distances.shape[0]
    similarity = triu(document_distances, k=1).sum()
    count = doc_count * (doc_count - 1) // 2

    return similarity / count

In [449]:
def write_similarity_csv(column, result_list, end_index):
    """Write ``result_list`` to ``<column>_<end_index>.csv`` in the working directory.

    The first row is the header ``author,<column>``; every item of
    ``result_list`` becomes one row after it. An existing file is overwritten.
    """
    header = ['author', column]
    out_path = f"{column}_{end_index}.csv"
    with open(out_path, 'w', newline='') as handle:
        csv.writer(handle).writerows([header] + list(result_list))

In [450]:
def similarity_main(index, target_table, chunk_size=100000):
    """Score one slice of authors and write the result to a CSV file.

    The slice ends at ``index`` (exclusive) and starts at the previous multiple
    of ``chunk_size``. For every author in the slice the relevant texts are
    fetched, scored with ``tf_idf_similarity`` and collected; the rows are then
    written by ``write_similarity_csv`` as ``<target_table>_similarity_<index>.csv``.

    Args:
        index: exclusive end position of the slice in the author list.
        target_table: ``'comments'`` to score comment bodies; any other value
            is used as the post table name and post titles are scored.
        chunk_size: size of a full slice (default 100000).

    Returns:
        None. The output is the CSV file.
    """
    # Authors that still have no score for this table.
    # NOTE(review): every worker re-runs this query and slices it by position,
    # but the query has no ORDER BY - confirm the row order is stable across
    # workers, otherwise slices may overlap or skip authors.
    sql = f'select author from mentor where comment_cnt > 1 and is_valid = 1 and {target_table}_similarity is null;'
    result_df = select_query_result_to_df(sql)
    # An empty result set gives a DataFrame without columns, so guard the lookup.
    if 'author' in result_df.columns:
        authors = result_df['author'].astype(str).tolist()
    else:
        authors = []

    result_for_csv = []

    # Start of the slice: the previous multiple of chunk_size below index.
    remainder = index % chunk_size
    start_index = index - (remainder or chunk_size)
    # Clamp to the real list bounds so a stale index cannot raise IndexError
    # (or wrap around via a negative start) and discard the whole slice.
    start_index = max(start_index, 0)
    end_index = min(index, len(authors))

    if target_table == 'comments':
        target_column = 'body'
    else:
        target_column = 'title'

    for author in authors[start_index:end_index]:
        # NOTE(review): the author name is interpolated directly into the SQL
        # text; a name containing a quote would break the query.
        if target_table == 'comments':
            sql = f"select {target_column} from {target_table} where author = '{author}' and link_key = parent_key;"
        else:
            sql = f"select distinct p.post_key, p.{target_column} from {target_table} p, comments c where p.post_key = c.link_key and c.author = '{author}' and c.link_key = c.parent_key;"
        result_df = select_query_result_to_df(sql)

        if target_column in result_df.columns:
            # NULL texts would make the regex filter raise, so drop them.
            texts = result_df[target_column].dropna().tolist()
        else:
            # No matching rows: tf_idf_similarity reports this as -1.
            texts = []

        similarity = tf_idf_similarity(texts)
        result_for_csv.append([author, similarity])

    write_similarity_csv(f'{target_table}_similarity', result_for_csv, index)

In [438]:
# Exclusive end positions of the author slices, one per worker task.
# Previously: [100000, 200000, 300000, 349103]
index_list = [100000, 200000, 227032]

if __name__ == '__main__':
    # Score the slices in parallel worker processes with a progress bar.
    parmap.map(
        similarity_main,
        index_list,
        'posts',
        pm_pbar=True,
        pm_processes=4,
    )

100%|██████████| 4/4 [05:31<00:00, 82.80s/it] 
