In [2]:
import ast
import json
import math
import re
import socket
import sys
from collections import Counter
from threading import Thread

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from numba import jit
from sklearn import datasets, preprocessing
from sklearn.cluster import KMeans, MeanShift
from sklearn.decomposition import PCA
from tqdm import tqdm


In [3]:
def similarity(x):
    """Return the pairwise cosine-similarity matrix of the row vectors in ``x``.

    The previous implementation filled the matrix with a Python double loop
    under ``@jit(forceobj=True)``.  Object mode does not compile the NumPy
    calls, so the loop ran at interpreter speed; a single matrix product
    computes the same values.

    Args:
        x: Sequence of equal-length numeric vectors (list of lists, list of
            1-D arrays, or a 2-D array).  One vector per row.

    Returns:
        An ``(N, N)`` float ndarray where entry ``[i][j]`` is
        ``dot(x[i], x[j]) / (|x[i]| * |x[j]|)``.  Entries involving a
        zero-length vector are 0.  An empty ``x`` yields a ``(0, 0)`` array.
    """
    n = len(x)
    vectors = np.asarray(x, dtype=float)
    if vectors.size == 0:
        # No vectors, or vectors with no components: nothing to compare.
        return np.zeros((n, n))

    dots = vectors @ vectors.T                       # dots[i][j] = x[i] . x[j]
    norms = np.sqrt(np.einsum('ij,ij->i', vectors, vectors))
    denom = np.outer(norms, norms)

    res = np.zeros((n, n))
    # Positions whose denominator is 0 are skipped and keep the initial 0.
    np.divide(dots, denom, out=res, where=denom != 0)

    # Mirror the upper triangle so the result is exactly symmetric.
    return np.triu(res) + np.triu(res, 1).T

class LocalServer(object):
    """TF-IDF news search server that answers keyword queries over TCP.

    Construction loads the news corpus, tokenises every article body, builds
    the vocabulary, the per-word IDF table and one TF-IDF vector per article,
    and reads a precomputed word -> similar-words table used for fuzzy
    matching.  ``run`` then accepts clients and serves each connection on its
    own thread.
    """

    # File locations, relative to the working directory.
    DATA_PATH = './data/all_news.csv'
    STOP_WORDS_PATH = './data/english'
    VOCAB_PATH = './data/vocab.txt'
    SYNONYM_PATH = './synonym.txt'
    # A token is kept only if it occurs more than this many times in its article.
    MIN_TOKEN_COUNT = 5
    # Maximum number of bytes read from a client as the query string.
    RECV_BYTES = 1024

    def __init__(self, host, port):
        """Load the corpus and build every index needed for searching.

        Args:
            host: Address to bind the listening socket to.
            port: TCP port to bind the listening socket to.
        """
        self.address = (host, port)
        self.data = pd.read_csv(self.DATA_PATH)
        print(len(self.data))

        with open(self.STOP_WORDS_PATH, 'r', encoding='utf-8') as f:
            # One stop word per line.
            self.stop_words = set(line.strip('\n') for line in f)

        # Strip punctuation, stop words and low-frequency words from each body.
        self.data['processed'] = self.data["body"].apply(self.process)
        self.vocab = self.build_words_Dictionary()
        # word -> column index; avoids repeated linear scans of self.vocab.
        self._vocab_pos = {word: i for i, word in enumerate(self.vocab)}

        # {word: IDF} for every vocabulary word.
        self.IDF_dict = self.IDF()
        # One TF-IDF vector per article, aligned with self.vocab.
        self.data['vec'] = self.data['processed'].apply(self.TF_IDF_vec)

        # Load the saved word-similarity table used for fuzzy matching.
        self.word_similarity()

        # The synonym file read above was generated offline, once, because the
        # computation was slow (about 20 minutes when it was written):
        # transpose the article-by-word TF-IDF matrix so that each row is a
        # word, compute pairwise cosine similarity between words, and for each
        # word store its 3 most similar words (the word itself comes first).
        # Reference code for regenerating it:
        #
        #     tmp = np.array([list(x) for x in self.data['vec']]).T
        #     word_sim = similarity([list(x) for x in tmp])
        #     with open('./synonym.txt', 'w') as wfile:
        #         for i, sim in enumerate(word_sim):
        #             loc = np.argsort(-sim)[:3]
        #             sim_word = np.array(self.vocab)[loc]
        #             print(f'{self.vocab[i]}:{sim_word}', file=wfile)

    def _vocab_positions(self):
        """Return the word -> index map for ``self.vocab``, building it on first use."""
        positions = getattr(self, '_vocab_pos', None)
        if positions is None:
            positions = {word: i for i, word in enumerate(self.vocab)}
            self._vocab_pos = positions
        return positions

    def word_similarity(self):
        """Load the word -> similar-words table into ``self.words_similarity``.

        Each non-empty line of the synonym file is expected to look like
        ``word:['w1' 'w2' 'w3']``.  The quoted entries are parsed with
        ``ast.literal_eval`` so that file contents are never executed as code.
        Blank lines and lines without a ``:`` are skipped.
        """
        self.words_similarity = {}  # original word -> list of similar words
        with open(self.SYNONYM_PATH, 'r') as rfile:
            for line in rfile:
                line = line.strip()
                if not line:
                    continue
                word, sep, rest = line.partition(':')
                if not sep:
                    continue
                # Drop the surrounding brackets, then split the quoted words.
                quoted = rest.strip()[1:-1].split()
                self.words_similarity[word] = [ast.literal_eval(q) for q in quoted]
        return

    def process(self, x):
        """Tokenise one article body.

        Non-letters become separators, text is lower-cased, stop words are
        removed, and only tokens occurring more than ``MIN_TOKEN_COUNT`` times
        in this article are kept (order and repetitions preserved).

        Args:
            x: Article body.  Non-string values (for example a missing CSV
                cell) produce an empty token list.

        Returns:
            List of surviving tokens.
        """
        if not isinstance(x, str):
            return []
        tokens = re.sub('[^A-Za-z]+', ' ', x).lower().split()
        kept = [tok for tok in tokens if tok not in self.stop_words]
        counts = Counter(kept)
        return [tok for tok in kept if counts[tok] > self.MIN_TOKEN_COUNT]

    def build_words_Dictionary(self):
        """Collect the sorted vocabulary of all processed articles.

        Also writes the vocabulary, one word per line, to ``VOCAB_PATH``.

        Returns:
            Sorted list of distinct tokens found in ``self.data['processed']``.
        """
        distinct = set()
        for tokens in self.data['processed']:
            distinct.update(tokens)
        vocab = sorted(distinct)

        with open(self.VOCAB_PATH, 'w') as wfile:
            for word in vocab:
                print(word, file=wfile)
        return vocab

    def IDF(self):
        """Compute the inverse document frequency of every vocabulary word.

        Returns:
            Dict mapping each word to ``log(Y / (df + 1))`` where ``Y`` is the
            number of articles and ``df`` the number of articles containing
            the word.
        """
        n_docs = len(self.data)
        # Count each word once per article it appears in (single pass).
        doc_freq = Counter()
        for tokens in self.data['processed']:
            doc_freq.update(set(tokens))
        return {word: math.log(n_docs / (doc_freq[word] + 1)) for word in self.vocab}

    def TF_IDF_vec(self, data):
        """Build the TF-IDF vector of one tokenised article.

        Args:
            data: List of tokens; every token must be in the vocabulary.

        Returns:
            1-D float array of length ``len(self.vocab)``; the entry for a word
            is ``IDF * count / len(data)`` and 0 for words not in ``data``.

        Raises:
            ValueError: If a token is not in the vocabulary.
        """
        res = np.zeros(len(self.vocab))
        positions = self._vocab_positions()
        n_tokens = len(data)
        for word, count in Counter(data).items():
            try:
                loc = positions[word]
            except KeyError:
                raise ValueError(f'{word!r} is not in the vocabulary') from None
            res[loc] = self.IDF_dict[word] * count / n_tokens
        return res

    def run(self):
        """Listen on ``self.address`` and serve each client on its own thread.

        Blocks until interrupted.  The listening socket is closed on the way
        out so the port can be bound again.  If the socket cannot be set up,
        the error is printed and ``sys.exit(1)`` is called.
        """
        server = None
        try:
            server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
            server.bind(self.address)
            server.listen(5)
        except socket.error as msg:
            if server is not None:
                server.close()
            print(msg)
            sys.exit(1)
        print('Waiting connection...')

        try:
            while True:
                conn, addr = server.accept()
                print(f'{conn}connected!')
                Thread(target=self.TextSearch, args=(conn, addr)).start()
        finally:
            server.close()

    def TextSearch(self, conn, addr):
        """Serve one client: greet, read a query, reply with ranked results.

        Sends ``Connected!``, reads up to ``RECV_BYTES`` bytes as the query,
        and replies with ``repr`` of a list of ``(title, body)`` tuples
        (``[]`` when nothing matches).  The connection is always closed.

        NOTE(review): the reply is a Python literal; a client should parse it
        with ``ast.literal_eval`` rather than ``eval``.

        Args:
            conn: Connected client socket.
            addr: Client address (unused).
        """
        try:
            conn.sendall("Connected!".encode())
            text = conn.recv(self.RECV_BYTES).decode(errors='ignore')
            results = self._search(text)
            # sendall: a plain send() may transmit only part of a large reply.
            conn.sendall(repr(results).encode())
        finally:
            conn.close()

    def _search(self, text):
        """Return ``(title, body)`` tuples matching ``text``, best first.

        Query words are lower-cased to match the vocabulary, expanded with
        their similar words, and every article with a non-zero TF-IDF value
        for any expanded word is a hit (one hit per distinct title).  Hits are
        ranked by the sum of their cosine similarity to all hits.
        """
        positions = self._vocab_positions()

        expanded = []
        for word in text.lower().split():
            if word in positions:
                # Fall back to the word itself if the synonym file lacks it.
                expanded.extend(self.words_similarity.get(word, [word]))

        docs = list(zip(self.data['title'], self.data['body'], self.data['vec']))
        hits = []          # (title, body) per matched article
        hit_vecs = []      # TF-IDF vector per matched article
        seen_titles = set()
        for word in dict.fromkeys(expanded):   # de-duplicate, keep order
            col = positions.get(word)
            if col is None:
                continue
            for title, body, vec in docs:
                if title not in seen_titles and vec[col] != 0:
                    seen_titles.add(title)
                    hits.append((title, body))
                    hit_vecs.append(vec)

        if not hits:
            return []

        # Score each hit by its total similarity to every retrieved article.
        scores = np.asarray(similarity(hit_vecs)).sum(axis=1)
        order = sorted(range(len(hits)), key=lambda i: scores[i], reverse=True)
        return [hits[i] for i in order]
        
        
        



#### 运行服务器端
启动服务器之后，在run.ipynb中运行客户端图形界面

In [4]:
# Build the search server on the loopback interface, then block serving clients.
listen_host, listen_port = '127.0.0.1', 1234
server = LocalServer(listen_host, listen_port)
server.run()

2225
type(vec)== <class 'pandas.core.series.Series'>
(1688, 2225)
2225


  1%|          | 20/1688 [00:15<21:02,  1.32it/s]

In [None]:
import pandas as pd
import numpy as np
# Scratch check: a DataFrame converted to an ndarray is indexed by (row, column).
random_values = np.random.randn(8, 4)
df = pd.DataFrame(random_values, columns=list('ABCD'))
# Slicing forms tried earlier: df.iloc[[1, 3, 5], [1, 3]], df.iloc[1:3, :], df.iloc[:, 1:3]
test = np.array(df)
print(test)
print(test[2, 2])

[[ 0.11004468  1.07810417 -0.01778058 -0.36007987]
 [-0.101603    0.67025721 -0.12684299 -1.45139029]
 [-0.08519805 -1.02044191 -0.21979702 -0.22826723]
 [ 0.8640683   0.97690409  0.52404301  0.54047315]
 [ 0.34574981 -0.16380384 -0.34532723  1.18995295]
 [ 0.56118092  0.78218456  2.19215272 -0.69094032]
 [-0.31619402  1.11948573 -1.85927889 -0.91258706]
 [ 0.25969365  0.48056369 -0.31125274  1.36486657]]
-0.21979701569034235


In [None]:
# Scratch check: a list of tuples survives a repr() -> eval() round trip.
ls = [(1, 2), (3, 4)]
serialized = repr(ls)
print(ls)
print(serialized)
s = eval(serialized)
print(s, type(s))

[(1, 2), (3, 4)]
[(1, 2), (3, 4)]
[(1, 2), (3, 4)] <class 'list'>


In [None]:
# Scratch check: sort rows by their third field, then keep only the first two columns.
ls = [(1, 2, 3), (4, 5, 6), (7, 8, 9)]
print(ls)
by_third_desc = sorted(ls, key=lambda row: row[2], reverse=True)
print(by_third_desc)
ls = np.array(ls)
print(ls)
ls = ls[:, :2]
print(ls)
ls = list(map(tuple, ls))
print(ls)

[(1, 2, 3), (4, 5, 6), (7, 8, 9)]
[(7, 8, 9), (4, 5, 6), (1, 2, 3)]
[[1 2 3]
 [4 5 6]
 [7 8 9]]
[[1 2]
 [4 5]
 [7 8]]
[(1, 2), (4, 5), (7, 8)]


In [None]:
import numpy as np
# Scratch checks of ndarray fancy indexing, in-place row edits and transposition.
ls = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
print(ls)
loc = [0, 2]
picked_rows = ls[loc]
print(picked_rows[:, loc])
print(sum(ls[1, loc]))
# Assigning through the loop variable edits `ls` itself.
for data in ls:
    data[1] = 100
print(ls, ls.shape)
last_column = ls[:, 2]
print(last_column.shape)
ls2 = np.array([[1, 2], [3, 4], [5, 6]])
transposed = ls2.T
print(ls2.shape, transposed.shape)
print(transposed)
for x in transposed:
    print(x)
print(ls2, [list(row) for row in ls2])

[[1 2 3]
 [4 5 6]
 [7 8 9]]
[[1 3]
 [7 9]]
10
[[  1 100   3]
 [  4 100   6]
 [  7 100   9]] (3, 3)
(3,)
(3, 2) (2, 3)
[[1 3 5]
 [2 4 6]]
[1 3 5]
[2 4 6]
[[1 2]
 [3 4]
 [5 6]] [[1, 2], [3, 4], [5, 6]]


In [None]:
# Scratch check of np.vectorize applied to a list of arrays and to a plain ndarray.
ls = [np.array(row) for row in ([1, 2, 3], [4, 5, 6], [7, 8, 9])]
print(ls)
print(np.array(ls))


def add(x, y):
    """Return ``x + y``; wrapped with np.vectorize below."""
    return x + y


new_add = np.vectorize(add)
arr = np.array([1, 2, 3])
print(new_add(ls, ls))
print(new_add(arr, arr.T))
print(ls)

NameError: name 'np' is not defined

In [None]:
# Scratch check: turning a Series of arrays into a transposed 2-D matrix.
ls = pd.Series([np.array([2, 3, 4]), np.array([5, 6, 7])])
direct_transpose = np.array(ls).T
stacked_transpose = np.array([list(x) for x in ls]).T
print(ls, direct_transpose, stacked_transpose, sep='\n')

0    [2, 3, 4]
1    [5, 6, 7]
dtype: object
[array([2, 3, 4]) array([5, 6, 7])]
[[2 5]
 [3 6]
 [4 7]]


In [None]:
# Scratch check: parse a string shaped like a printed array of words back into a list.
s = "['a' 'b' 'c']"[1:-1]
print(s)
ls = s.split(' ')
print(ls)
parsed_words = [eval(x) for x in ls]
print(parsed_words)
ls3 = ['a', 'b', 'c']
print(ls3)

'a' 'b' 'c'
["'a'", "'b'", "'c'"]
['a', 'b', 'c']
['a', 'b', 'c']


In [None]:
from time import sleep
class myClass:
    """Scratch check that a tqdm progress bar works when driven from ``__init__``."""

    def __init__(self):
        # 50 ticks of 0.1 s each.
        ticks = range(50)
        for _ in tqdm(ticks):
            sleep(0.1)


m = myClass()

100%|██████████| 50/50 [00:05<00:00,  9.16it/s]
