In [1]:
import json
import nltk
import math
import numpy as np

### 将文本加载为json格式

In [2]:
# Corpus file: each non-blank line is parsed as one JSON object with a 'text' field.
input_file = 'tweets.txt'
# Name the encoding explicitly so the load does not depend on the platform's
# locale default (JSON text is normally UTF-8).
with open(input_file, 'r', encoding='utf-8') as f:
    # Drop blank lines (e.g. a trailing empty line), which json.loads would reject.
    lines = [line for line in f if line.strip()]
items = [json.loads(x) for x in lines]
tweets = [x['text'] for x in items]  # tweet texts; list position is the document id
N = len(tweets)  # number of documents in the collection

### 定义预处理类
* 大写转小写
* 分词
* 去除标点符号和停用词

In [3]:
class Preprocess:
    """Callable text normalizer: lowercase, tokenize, drop punctuation and stop words."""

    def __init__(self):
        # Punctuation tokens to discard. Kept as a set so each entry appears
        # once and the per-token membership test is a hash lookup.
        self.punctuations = {
            ',', ':', '_', '!', '"', '*', '>', '<', '@', '~', '-', '(', ')',
            '%', '=', '\\', '^', '&', '|', '#', '$', '[', ']', '+',
        }
        # English stop-word list taken from the NLTK stopwords corpus.
        self.stop_words = set(nltk.corpus.stopwords.words('english'))

    def __call__(self, text, query=False):
        """Return the tokens of `text` that are neither punctuation nor stop words.

        Args:
            text: raw string to normalize.
            query: accepted for interface compatibility; currently unused.

        Returns:
            List of lowercase tokens in their original order.
        """
        candidates = nltk.word_tokenize(text.lower())
        return [
            tok for tok in candidates
            if tok not in self.punctuations and tok not in self.stop_words
        ]

In [4]:
preprocess = Preprocess()
# Token list of every document, in corpus order.
tokens = [preprocess(tweet) for tweet in tweets]
# L2 norm of each document's raw term-frequency vector
# (0.0 for a document that has no tokens left after preprocessing).
length = [
    math.sqrt(sum(count * count for count in nltk.FreqDist(doc_tokens).values()))
    for doc_tokens in tokens
]

### 构建倒排索引（词项 → 文档编号与词频）

In [5]:
# Inverted index: term -> postings, where each posting is a (doc_id, tf) pair.
dictionary = {}
for doc_id, doc_tokens in enumerate(tokens):
    for term, freq in nltk.FreqDist(doc_tokens).items():
        dictionary.setdefault(term, set()).add((doc_id, freq))
# Turn each postings set into a list ordered by tf, highest first;
# equal tf values are ordered by descending document id.
for term in dictionary:
    dictionary[term] = sorted(
        dictionary[term],
        key=lambda posting: (posting[1], posting[0]),
        reverse=True,
    )

def get_postings(term):
    """Return the postings list of `term`, or an empty list if it is not indexed."""
    return dictionary.get(term, [])

### 定义查询运算

In [6]:
def query_parse(query):
    """Return the union of the postings of every token in `query`.

    The query is lowercased and tokenized; the (doc_id, tf) postings of all
    tokens are merged without duplicates and returned sorted by tf, then
    doc_id, both descending.
    """
    merged = set()
    for term in nltk.word_tokenize(query.lower()):
        merged.update(get_postings(term))
    return sorted(merged, key=lambda posting: (posting[1], posting[0]), reverse=True)

# Output helpers that emphasize text in red.
def toRed(s):
    """Wrap `s` in ANSI escape codes: ESC[31;2m before it and ESC[0m (reset) after."""
    return "\033" + "[31;2m" + str(s) + "\033" + "[0m"
def print_with_emphasize(line, Q):
    """Print `line` token by token, colouring tokens that occur in query `Q`.

    Both strings are tokenized with nltk; a token of `line` is highlighted
    when its lowercase form is among the tokens of the lowercased query.
    Every token is followed by one space and the output ends with an extra
    blank line.
    """
    query_words = nltk.word_tokenize(Q.lower())
    pieces = []
    for word in nltk.word_tokenize(line):
        if word.lower() in query_words:
            pieces.append(toRed(word))
        else:
            pieces.append(word)
    print(''.join(piece + ' ' for piece in pieces) + '\n')

### 定义top K运算

In [7]:
def _tf_augmented(tf):
    """'a' scheme: 0.5 + 0.5 * tf / max(tf); all zeros when max(tf) is not positive."""
    peak = max(tf, default=0)  # computed once rather than once per element
    if peak <= 0:
        return [0 for _ in tf]
    return [0.5 + 0.5 * x / peak for x in tf]


def _tf_log_average(tf):
    """'L' scheme: (1 + ln tf) / (1 + ln mean(tf)); 0 for any tf below 1."""
    if not any(x >= 1 for x in tf):
        return [0 for _ in tf]
    denominator = 1 + math.log(sum(tf) / len(tf))
    # Guard tf < 1 the same way the 'l' scheme does, so math.log never sees 0.
    return [(1 + math.log(x)) / denominator if x >= 1 else 0 for x in tf]


# Term-frequency weighting schemes, keyed by the first letter of the notation.
# Each entry maps a list of raw term frequencies to a list of weights.
TF = {
    'n': lambda tf: tf,                                              # natural
    'l': lambda tf: [1 + math.log(x) if x >= 1 else 0 for x in tf],  # logarithmic
    'a': _tf_augmented,                                              # augmented
    'b': lambda tf: [1 if x > 0 else 0 for x in tf],                 # boolean
    # TODO: marked as not implemented by the author. compute_wtd passes a
    # one-element list, so the mean used here is just that single tf.
    'L': _tf_log_average,                                            # log average
}

# Document-frequency weighting schemes, keyed by the second letter of the
# notation. Each entry maps a list of document frequencies to a list of
# weights; N is the module-level collection size.
DF = {
    # no document-frequency weighting
    'n': lambda df: [1] * len(df),
    # idf: ln(N / df)
    't': lambda df: [math.log(N / x) if x >= 1 else 0 for x in df],
    # probabilistic idf: max(0, ln((N - df) / df)). A term found in every
    # document (df == N) gets weight 0 instead of evaluating math.log(0).
    'p': lambda df: [max(0, math.log((N - x) / x)) if 1 <= x < N else 0 for x in df],
}

def _cosine_norm(tfdf):
    """'c' scheme: the factor 1 / sqrt(sum of w^2), repeated once per weight.

    The factor is 0 when every weight is 0, so no division by zero occurs.
    """
    squared_sum = sum(w * w for w in tfdf)
    factor = 1 / math.sqrt(squared_sum) if squared_sum > 0 else 0
    return [factor] * len(tfdf)


# Normalization schemes, keyed by the third letter of the notation. Each
# implemented entry maps a list of tf*df weights to a list of multipliers.
NORM = {
    'n': lambda tfdf: [1] * len(tfdf),  # no normalization
    'c': _cosine_norm,                  # cosine normalization

    # TODO: not implemented. These placeholders return the scalar 1, not a
    # list, so compute_wtq (which indexes the result) cannot use them.
    'u': lambda tfdf: 1,
    'b': lambda tfdf: 1
}

def compute_wtq(terms, query_term_freq, notation='ltn'):
    """Compute the query-side weight of every term in `terms`.

    Args:
        terms: query terms to weight.
        query_term_freq: mapping from term to its frequency in the query.
        notation: three-letter scheme selecting the entries of TF, DF and
            NORM, in that order.

    Returns:
        List of weights, one per term, in the order of `terms`.
    """
    raw_tf = [query_term_freq[term] for term in terms]
    tf_weights = TF[notation[0]](raw_tf)
    # Document frequency of a term is the length of its postings list.
    doc_freqs = [len(get_postings(term)) for term in terms]
    df_weights = DF[notation[1]](doc_freqs)
    products = [t * d for t, d in zip(tf_weights, df_weights)]
    factors = NORM[notation[2]](products)
    return [p * f for p, f in zip(products, factors)]

def compute_wtd(tf, df, notation='lnc'):
    """Compute the un-normalized document-side weight of a single posting.

    Args:
        tf: term frequency of the term in the document.
        df: document frequency of the term.
        notation: scheme string; only its first two letters (TF and DF
            schemes) are used here. The third letter is not applied.

    Returns:
        Product of the tf weight and the df weight.
    """
    tf_scheme = TF[notation[0]]
    tf_weight = tf_scheme([tf])[0]
    df_scheme = DF[notation[1]]
    df_weight = df_scheme([df])[0]
    return tf_weight * df_weight

def top_k(query, k, notation='lnc.ltn'):
    """Return the texts of the (at most) k highest-scoring tweets for `query`.

    Args:
        query: free-text query; it is run through `preprocess`.
        k: maximum number of results. Values below 0 are treated as 0, and
            the count is capped at the number of tweets with a positive score.
        notation: 'ddd.qqq' weighting notation; the part before the dot is
            the document scheme and the part after it the query scheme.

    Returns:
        List of tweet texts ordered by descending score.
    """
    schemes = notation.split('.')
    doc_scheme, query_scheme = schemes[0], schemes[1]

    query_tokens = preprocess(query)
    term_freq = nltk.FreqDist(query_tokens)
    query_terms = list(term_freq.keys())

    score = [0] * N
    wtq = compute_wtq(query_terms, term_freq, query_scheme)
    for query_weight, term in zip(wtq, query_terms):
        postings = get_postings(term)
        for doc_id, tf in postings:
            # Document weights are left un-normalized here; normalization
            # is applied once per document after the loop.
            wtd = compute_wtd(tf, len(postings), doc_scheme)
            score[doc_id] += query_weight * wtd

    # Document normalization: only 'n' (none) and 'c' (cosine) are implemented.
    # NOTE(review): `length` holds the L2 norm of raw term frequencies, not of
    # the scheme-transformed weights — confirm this is the intended norm.
    if doc_scheme[2] == 'c':
        # A document with no tokens has length 0 and score 0; skip the
        # division for it instead of raising ZeroDivisionError.
        score = [s / length[i] if length[i] > 0 else 0 for i, s in enumerate(score)]

    # Return relevant results only, and never let a negative k slice from the end.
    relevant = sum(1 for s in score if s > 0)
    k = max(0, min(k, relevant))

    order = np.argsort(np.array(score))[::-1]
    return [tweets[i] for i in order[:k]]

In [8]:
def display_query_result(query, k):
    """Run a top-k search for `query` and print the results.

    Prints a header with the query and the number of results returned (both
    wrapped by toRed), then each result with the query words emphasized.
    """
    hits = top_k(query, k)
    query_label = toRed(query)
    count_label = toRed(str(len(hits)))
    print("查询%s, 返回前%s条结果：\n" % (query_label, count_label))
    for hit in hits:
        print_with_emphasize(hit, query)

### 显示查询交互

In [9]:
import tkinter as tk
from IPython.display import clear_output

In [10]:
# Build the query window: a wide entry for the query text and a narrow entry for k.
win = tk.Tk()
win.title('查询（请输入关键词和k）')
win.geometry('700x100')

# Tk variable holding the query text; bound to the first entry below.
query = tk.StringVar()
entry1 = tk.Entry(win, relief='sunken',font='Calibri 23',width=33,textvariable=query)
entry1.place(relx=0.40,rely=0.5,anchor='center')
# Tk variable holding the requested number of results (initial value 10);
# bound to the second entry below.
k = tk.IntVar(value=10)
entry2 = tk.Entry(win, relief='sunken',font='Calibri 23',width=3,textvariable=k)
entry2.place(relx=0.83,rely=0.5,anchor='center')

def button_command():
    """Run the query currently typed into the window and print the ranked results.

    The previous cell output is cleared first. If the k field does not hold
    an integer, a message is printed and no query is run.
    """
    clear_output()
    query_sentence = query.get()
    try:
        k_value = k.get()
    except tk.TclError:
        # IntVar.get() raises TclError when the entry text is empty or not a number.
        print('k 必须是整数')
        return
    display_query_result(query_sentence, k_value)

# Search button: clicking it invokes button_command.
button = tk.Button(win, text='查询', height=2,width=8,command=button_command)
button.place(relx=0.93,rely=0.5,anchor='center')

In [11]:
# Start the Tk event loop for the query window.
win.mainloop()

查询[31;2mmuscle pain from statins[0m, 返回前[31;2m15[0m条结果：

Is your [31;2mmuscle[0m [31;2mpain[0m related to cholesterol medication ? If so then a common over the counter supplement may help ... http : //t.co/AwCR0ryu 

Bloomberg : `` Merck ’ s Tredaptive Raises Risk of [31;2mMuscle[0m [31;2mPain[0m in Study '' - report of HPS2-THRIVE study in # EHJ today : http : //t.co/R5MdwpAMF6 

PHOTOS : [31;2mMuscle[0m cars raise $ 10000 for Queensland floods : The American [31;2mMuscle[0m Car Club of Australia recen ... http : //bit.ly/dOdSDo # muscles 

Diabetes Patients with Chest [31;2mPain[0m http : //t.co/oUEyCzEQP8 

@ DuncanBannatyne bbc cut backs are a [31;2mpain[0m ... 

[31;2mStatins[0m often prescribed without good evidence : Many doctors prescribe [31;2mstatins[0m to people who have little chanc ... http : //t.co/8Wx5Y9vYOU 

TV Ads for [31;2mStatins[0m Drive Overdiagnosis and Overtreatment http : //t.co/TbpgIZJv4E 

Mediterranean diet 'as good as [31;2mstatin