In [1]:
import pymysql.cursors
import pandas as pd
import numpy as np
import connect_to_db as cn
import json
from itertools import combinations
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import csv
import parmap

In [2]:
def check_body(document):
    """Drop texts that carry no usable alphabetic content.

    A text is removed when it

    * is not a string (for example a ``None`` entry),
    * contains no ASCII letter at all (only symbols, digits, emoji, ...), or
    * contains exactly one ASCII letter once every other character is
      ignored (for example the emoticon ``:p``).

    The list is filtered in place and the very same list object is returned,
    so callers relying on either the mutation or the return value keep
    working.

    Args:
        document: list of texts to filter.

    Returns:
        ``document`` itself, holding only the texts that were kept, in their
        original order.
    """
    has_letter = re.compile('[a-zA-Z]')
    non_letter = re.compile('[^a-zA-Z]')

    def is_meaningful(text):
        # Non-string entries cannot be searched by the regexes; treat them
        # as content-free instead of raising TypeError.
        if not isinstance(text, str):
            return False
        # Filters out text made only of special characters, emoji, etc.
        if has_letter.search(text) is None:
            return False
        # Filters out text made only of special characters plus a single
        # letter (e.g. ':p'): after blanking every non-letter and stripping,
        # exactly one character is left.
        return len(non_letter.sub(' ', text).strip()) != 1

    # One pass instead of list.remove() inside a loop (which is quadratic).
    document[:] = [text for text in document if is_meaningful(text)]
    return document

In [3]:
def tf_idf_similarity(sql, column_name):
    """Mean pairwise TF-IDF similarity of the texts returned by a query.

    Runs ``sql`` through ``cn.select_query_result_to_df``, takes the
    ``column_name`` column, filters it with ``check_body`` and averages the
    dot products of the TF-IDF vectors over every pair of distinct
    documents. With ``TfidfVectorizer``'s default L2 normalisation that dot
    product is the cosine similarity.

    Args:
        sql: query string handed to ``cn.select_query_result_to_df``.
        column_name: name of the result column that holds the texts.

    Returns:
        ``-1`` if no text is left after filtering,
        ``-2`` if exactly one text is left (there is no pair to compare),
        otherwise the mean similarity over all distinct document pairs.

    Note:
        ``TfidfVectorizer.fit_transform`` raises ``ValueError`` when the
        remaining texts produce an empty vocabulary; that error is not
        handled here and propagates to the caller.
    """
    result_df = cn.select_query_result_to_df(sql)
    # Plain list of the column values; a numpy round-trip is not needed and
    # would allocate a fixed-width unicode array sized by the longest text.
    document = check_body(result_df[column_name].tolist())

    # Nothing to compare: the query returned no usable text at all.
    if not document:
        return -1

    # check_body() dropped emoji-only comments/titles and a single text is
    # left; return -2 so this case can be told apart from the empty one.
    if len(document) == 1:
        return -2

    tfidf_matrix = TfidfVectorizer(min_df=1).fit_transform(document)

    # `@` is a matrix product for both scipy sparse matrices and sparse
    # arrays, whereas `*` is element-wise for sparse arrays.
    pairwise = (tfidf_matrix @ tfidf_matrix.T).toarray()

    # The product is symmetric, so the mean over the strict upper triangle
    # equals the mean over all off-diagonal entries. Zero the diagonal
    # (self-similarity) and average the remaining n * (n - 1) entries.
    np.fill_diagonal(pairwise, 0.0)
    n_docs = pairwise.shape[0]
    return pairwise.sum() / (n_docs * (n_docs - 1))

In [None]:
def similarity_main(index, chunk_size=100000):
    """Compute comment/post-title similarity for one chunk of authors.

    Fetches the authors of ``mentor`` rows with ``comment_cnt > 1`` and
    processes the chunk that ends (exclusively) at ``index``. The chunk
    starts at the closest multiple of ``chunk_size`` below ``index``. For
    every author in the chunk two values are computed with
    ``tf_idf_similarity`` (one over comment bodies, one over post titles)
    and all rows are written to ``similarity_final_{index}.csv`` through
    ``cn.write_csv_for_db_update``. The values, including the ``-1``/``-2``
    markers of ``tf_idf_similarity``, are stored unchanged.

    Args:
        index: exclusive end offset of the chunk in the fetched author list.
            Values beyond the end of the list are clamped to its length.
        chunk_size: maximum number of authors per chunk (default 100000).
    """
    # Runs on the assumption that every similarity column is still NULL.
    # sql = 'select author from mentor where comment_cnt > 1 and is_valid = 1;'
    sql = 'select author from mentor where comment_cnt > 1;'
    result_df = cn.select_query_result_to_df(sql)
    # NOTE(review): the query has no ORDER BY and every worker runs it on its
    # own; the chunks only partition the authors if the row order is the
    # same for each worker - confirm.
    authors = result_df['author'].astype(str).tolist()

    # Same start as the original arithmetic: the largest multiple of
    # chunk_size strictly below `index`.
    start_index = max(((index - 1) // chunk_size) * chunk_size, 0)
    # Clamp so an `index` larger than the current row count cannot raise
    # IndexError and throw away the results gathered so far.
    end_index = min(index, len(authors))

    result_for_csv = []
    for author in authors[start_index:end_index]:
        # NOTE(review): the author name is interpolated straight into the
        # SQL text; switch to a parameterized query if the cn helper
        # supports one.
        sql1 = f"select body from comments c, posts p where p.post_key = c.link_key and c.author = '{author}' and link_key = parent_key and p.is_valid_author=1;"
        comments_similarity = tf_idf_similarity(sql1, 'body')

        sql2 = f"select distinct p.post_key, p.title from posts p, comments c where p.post_key = c.link_key and c.author = '{author}' and c.link_key = c.parent_key and p.is_valid_author=1;"
        posts_similarity = tf_idf_similarity(sql2, 'title')

        result_for_csv.append([author, comments_similarity, posts_similarity])

    fields = ['author', 'comments_similarity', 'posts_similarity']
    cn.write_csv_for_db_update(f'similarity_final_{index}.csv', fields, result_for_csv)

In [16]:
def posts_similarity_main(index, chunk_size=100000):
    """Compute comment similarity for one chunk of posts.

    Fetches the ``post_key`` of ``about_posts`` rows with
    ``comment_cnt > 1`` and processes the chunk that ends (exclusively) at
    ``index``. The chunk starts at the closest multiple of ``chunk_size``
    below ``index``. For every post in the chunk the similarity of its
    comment bodies is computed with ``tf_idf_similarity`` and all rows are
    written to ``about_posts_comments_similarity_{index}.csv`` through
    ``cn.write_csv_for_db_update``. The values, including the ``-1``/``-2``
    markers of ``tf_idf_similarity``, are stored unchanged.

    Args:
        index: exclusive end offset of the chunk in the fetched post-key
            list. Values beyond the end of the list are clamped to its
            length.
        chunk_size: maximum number of posts per chunk (default 100000).
    """
    keys_sql = 'select post_key from about_posts where comment_cnt > 1;'
    result_df = cn.select_query_result_to_df(keys_sql)
    # NOTE(review): the query has no ORDER BY and every worker runs it on its
    # own; the chunks only partition the posts if the row order is the same
    # for each worker - confirm.
    post_keys = result_df['post_key'].astype(str).tolist()

    # Same start as the original arithmetic: the largest multiple of
    # chunk_size strictly below `index`.
    start_index = max(((index - 1) // chunk_size) * chunk_size, 0)
    # Clamp so an `index` larger than the current row count cannot raise
    # IndexError and throw away the results gathered so far.
    end_index = min(index, len(post_keys))

    result_for_csv = []
    for post_key in post_keys[start_index:end_index]:
        # NOTE(review): the key is interpolated straight into the SQL text;
        # switch to a parameterized query if the cn helper supports one.
        comments_sql = f"select body from comments where link_key = '{post_key}' and link_key = parent_key and is_valid=1 and is_valid_author=1;"
        comments_similarity = tf_idf_similarity(comments_sql, 'body')

        result_for_csv.append([post_key, comments_similarity])

    fields = ['post_key', 'comments_similarity']
    cn.write_csv_for_db_update(f"about_posts_comments_similarity_{index}.csv", fields, result_for_csv)

In [26]:
# Exclusive end offsets of the chunks handed to the workers, one entry per
# chunk. The commented-out lists are end offsets kept from other runs.
# index_list = [100000, 200000, 300000, 349103]
# index_list = [100000, 200000, 227032]
# index_list = [100000, 200000, 300000, 359737]
index_list = [100000, 200000, 224406]


if __name__ == '__main__':
    # Run one call per entry of index_list in a pool of worker processes,
    # with a progress bar.
    parmap.map(similarity_main, index_list, pm_processes=3, pm_pbar=True)
    # parmap.map(posts_similarity_main, index_list, pm_pbar=True, pm_processes=4)

 33%|███▎      | 1/3 [05:04<10:09, 304.97s/it]Process ForkPoolWorker-54:
Process ForkPoolWorker-56:
Process ForkPoolWorker-55:
Traceback (most recent call last):
  File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.8/multiprocessing/pool.py", line 114, in worker
    task = get()
  File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/usr/lib/python3.8/multiprocessing/queues.py", line 356, in get
    res = self._reader.recv_bytes()
  File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.8/multiprocessing/connection.py", line 216, in recv_bytes
    buf = self._recv_bytes(maxlength)
  File "/usr/lib/python3.8/multiprocessing/pool.py", lin

KeyboardInterrupt: 

  File "/usr/lib/python3.8/multiprocessing/connection.py", line 414, in _recv_bytes
    buf = self._recv(4)
  File "/usr/lib/python3.8/multiprocessing/connection.py", line 379, in _recv
    chunk = read(handle, remaining)
KeyboardInterrupt
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.8/multiprocessing/pool.py", line 114, in worker
    task = get()
  File "/usr/lib/python3.8/multiprocessing/pool.py", line 114, in worker
    task = get()
  File "/usr/lib/python3.8/multiprocessing/queues.py", line 355, in get
    with s

In [22]:
def random_similarity_main():
    """Compute similarity baselines from randomly sampled comments and titles.

    Samples up to 1000 comment bodies and up to 1000 post titles
    (``order by rand() limit 1000``), skipping comments written by the
    excluded accounts or by authors whose name ends in ``bot``. Both samples
    are scored with ``tf_idf_similarity`` and the two values are written as
    a single row to ``random_similarity.csv`` through
    ``cn.write_csv_for_db_update``.

    The sample is drawn with an unseeded ``rand()``, so the result is
    expected to differ between runs.
    """
    # Accounts excluded from the sample (presumably moderators and bots -
    # confirm), plus the empty author name.
    excluded_authors = [
        '', 'AutoModerator', 'greatyellowshark', 'aristapop', 'Pi25',
        'SeriousSamStone', 'Umbresp', 'CUTIEJUDY', 'Imissmemom', 'aagee',
        'noonches', 'Vientodecara', 'Jolojil', 'sciential84',
        'gurneyhallack', 'remyschnitzel', 'analemmaro', 'spud_simon_salem',
        'one_moody_girl', 'VivaSisyphus', 'apparentlycompetent', 'zooline',
        'yellowmix', 'TheYellowRose', 'Svataben', 'SQLwitch', 'circinia',
        'sofar1776', 'Takei_Me',
    ]
    excluded_sql = ', '.join(f"'{name}'" for name in excluded_authors)

    # The two exclusions must be combined with AND. With OR, an author on
    # the list still passes via "not like '%bot'" and a bot still passes via
    # "not in (...)", so the filter would exclude nothing.
    sql1 = (
        "select body from comments "
        f"where author not in ({excluded_sql}) and author not like '%bot' "
        "order by rand() limit 1000;"
    )
    random_comments_similarity = tf_idf_similarity(sql1, 'body')

    sql2 = (
        "select p.title from posts p, comments c "
        "where p.post_key = c.link_key "
        f"and (c.author not in ({excluded_sql}) and c.author not like '%bot') "
        "and p.is_valid=1 order by rand() limit 1000;"
    )
    random_posts_similarity = tf_idf_similarity(sql2, 'title')

    result_for_csv = [[random_comments_similarity, random_posts_similarity]]
    fields = ['random_comments_similarity', 'random_posts_similarity']
    cn.write_csv_for_db_update('random_similarity.csv', fields, result_for_csv)

In [12]:
# Compute the random-sample baseline; the result is written to
# 'random_similarity.csv' by random_similarity_main().
random_similarity_main()