In [1]:
import threading
import socket
import json

In [2]:
import pandas as pd
import re

# Load the article-to-article cosine-similarity table and the synonym list.
cos_sim = pd.read_csv('cos_sim.csv')
# 'Unnamed: 0' is presumably the index column written when the CSV was saved
# (TODO confirm); dropping it leaves only the similarity columns.
cos_sim = cos_sim.drop(columns=['Unnamed: 0'])
# Read the synonym file inside a `with` block so the handle is always closed.
with open('synonym.txt', mode='r') as word_sim_file:
    # [1:-1] drops the first and last character of the file (presumably the
    # braces of a dict-style literal -- TODO confirm); the remainder is split
    # on ": ", ", " and single quotes. The pattern is a raw string so the
    # backslash-space escapes reach the regex engine without triggering
    # Python's invalid-escape-sequence warning.
    synonym = re.split(r":\ |,\ |'", word_sim_file.read()[1:-1])
# Splitting on adjacent delimiters produces empty strings; discard them.
synonym = [item for item in synonym if item != '']

In [3]:
import numpy as np

# HITS算法，获得authority和hub值较高的文章，并排序
def hits(newsid, sim=None, threshold=0.8, iterations=20):
    """Rank the articles related to ``newsid`` with the HITS algorithm.

    An article is a candidate when its similarity to at least one seed article
    reaches ``threshold``. Among the candidates, a link i -> j exists when
    ``sim[i][j] >= threshold``. Hub and authority vectors are refined for
    ``iterations`` rounds and the candidates are returned ordered by
    ``hub**2 + authority**2``, highest first.

    Args:
        newsid: Iterable of integer row positions (seed articles) in the
            similarity matrix.
        sim: Square similarity matrix (DataFrame, ndarray or nested lists).
            Defaults to the module-level ``cos_sim`` table.
        threshold: Minimum similarity for two articles to count as linked.
        iterations: Number of HITS update rounds.

    Returns:
        List of int positions of the candidate articles, best first. Ties keep
        ascending position order. Empty when there are no candidates.
    """
    sim_matrix = np.asarray(cos_sim if sim is None else sim, dtype=float)
    seeds = np.asarray(list(newsid), dtype=int)

    # Candidate columns: similar enough to at least one seed row. The result
    # of flatnonzero is already unique and sorted ascending.
    related = np.flatnonzero((sim_matrix[seeds] >= threshold).any(axis=0))
    l = related.size
    print(l)
    if l == 0:
        return []

    # 0/1 adjacency matrix restricted to the candidate articles.
    adjacency = (sim_matrix[np.ix_(related, related)] >= threshold).astype(float)

    hub, auth = np.ones(l), np.ones(l)
    for _ in range(iterations):
        # The hub update uses the not-yet-normalised authority vector; both
        # vectors are normalised afterwards.
        new_auth = adjacency.T @ hub
        new_hub = adjacency @ new_auth
        auth_norm = np.linalg.norm(new_auth)
        hub_norm = np.linalg.norm(new_hub)
        if auth_norm == 0 or hub_norm == 0:
            # No links at all: keep the last finite scores instead of
            # dividing by zero and filling the vectors with NaN.
            break
        auth, hub = new_auth / auth_norm, new_hub / hub_norm

    score = hub ** 2 + auth ** 2
    # sorted() is stable, so equal scores stay in ascending position order.
    order = sorted(range(l), key=lambda i: score[i], reverse=True)
    return [int(related[i]) for i in order]

In [4]:
class LocalServer(object):
    """Threaded TCP server that answers keyword searches over the articles.

    Protocol as implemented here: the client sends one UTF-8 message of
    space-separated query terms; the server replies with the titles and bodies
    of up to ``RESULT_COUNT`` articles joined by '~' (title, body, title,
    body, ...) and then closes the connection.
    """

    # Maximum number of bytes read from the client for one query.
    RECV_BYTES = 100
    # Number of top keyword matches used as seeds for the HITS ranking.
    SEED_COUNT = 8
    # Maximum number of articles returned to the client.
    RESULT_COUNT = 10

    def __init__(self, host, port):
        """Remember the (host, port) pair the listening socket binds to."""
        self.address = (host, port)

    def run(self):
        """Accept clients forever, serving each connection on its own thread.

        This call blocks. The listening socket is closed when the loop is left
        through an exception (for example an interrupt).
        """
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as server:
            server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
            server.bind(self.address)
            server.listen(5)
            while True:
                conn, addr = server.accept()
                # One thread per connection so a slow search does not block
                # other clients.
                t = threading.Thread(target=self.handle, args=(conn, addr))
                t.start()

    def handle(self, conn, addr):
        """Serve one client: read the query, search, send the result, close.

        Args:
            conn: Connected client socket; always closed before returning.
            addr: Client address, used only for logging.
        """
        print("Connected by ", addr)
        try:
            query = conn.recv(self.RECV_BYTES).decode('utf-8', errors='ignore')
            # split() ignores repeated or trailing whitespace, so no empty
            # terms are produced.
            terms = query.split()

            # Append the synonyms of each original term after the originals.
            # NOTE(review): `synonym` looks like a flat list in which a word
            # is followed by its comma-separated synonyms -- confirm.
            expanded = list(terms)
            for term in terms:
                try:
                    idx = synonym.index(term)
                except ValueError:
                    continue
                # Guard against the match being the last list element.
                if idx + 1 < len(synonym):
                    expanded.extend(synonym[idx + 1].split(','))
            print(expanded)

            table = pd.read_csv('data.csv')
            # Weighted term frequency of the query in every article body.
            table['freq'] = table["new_body"].apply(self.getcnt, args=(expanded,))
            table = table.sort_values('freq', ascending=False)
            topid = table.iloc[:self.SEED_COUNT]['id']
            # The row labels of the best matches seed the HITS ranking.
            # NOTE(review): the iloc lookup below, done after sorting by 'id',
            # assumes row labels, 'id' order and similarity-matrix positions
            # all coincide -- confirm against data.csv.
            sortid = hits(topid.index)
            table = table.sort_values('id', ascending=True)

            sendback = []
            for k in sortid[:self.RESULT_COUNT]:
                row = table.iloc[k]
                sendback.append(row['title'])
                sendback.append(row['body'])
            # sendall() keeps writing until the whole reply is transmitted;
            # send() may stop after a partial write on large payloads.
            conn.sendall('~'.join(sendback).encode('utf-8'))
        finally:
            conn.close()

    def getcnt(self, x, terms):
        """Return the weighted number of occurrences of ``terms`` in ``x``.

        The term at position k contributes ``count * 2**(8 - k)``, so earlier
        terms weigh more than later ones.

        Args:
            x: Article text with words separated by single spaces. Any
                non-string value (e.g. NaN for an empty CSV cell) scores 0.
            terms: Sequence of query words, most important first.

        Returns:
            The accumulated weight (0 when nothing matches).
        """
        if not isinstance(x, str):
            return 0
        wordlist = x.split(' ')
        weight = 0
        for k, term in enumerate(terms):
            weight += wordlist.count(term) * (2 ** (8 - k))
        return weight

    

#### 运行服务器端
启动服务器之后，在run.ipynb中运行客户端图形界面

In [5]:
# Bind to every network interface on TCP port 1234, then serve clients.
# run() loops forever, so this cell keeps executing until it is interrupted.
server = LocalServer(host="0.0.0.0", port=1234)
server.run()

Connected by  ('127.0.0.1', 57888)
['child', 'school', 'parent', 'university']
Connected by  ('127.0.0.1', 57891)
['tax', 'income', 'budget', 'duty']
24
61
Connected by  ('127.0.0.1', 57923)
['child', 'school', 'parent', 'university']
24
Connected by  ('127.0.0.1', 57926)
['tax', 'income', 'budget', 'duty']
61
Connected by  ('127.0.0.1', 57929)
['child', 'school', 'parent', 'university']
24
Connected by  ('127.0.0.1', 57932)
['tax', 'income', 'budget', 'duty']
Connected by  ('127.0.0.1', 57935)
['child', 'school', 'parent', 'university']
61
24
