In [1]:
import json
import nltk

### 将文本加载为json格式

In [2]:
# Input file: one JSON object per line, each carrying a 'text' field.
input_file = 'tweets.txt'
# Pin the encoding so decoding does not depend on the platform's default
# locale (assumes the dump is UTF-8, the standard encoding for JSON).
with open(input_file, 'r', encoding='utf-8') as f:
    lines = f.readlines()
# Skip blank lines (e.g. a trailing empty line), which json.loads would reject.
items = [json.loads(x) for x in lines if x.strip()]
tweets = [x['text'] for x in items]

### 定义预处理类
* 大写转小写
* 分词
* 去除标点符号和停用词

In [3]:
class Preprocess:
    """Callable text normaliser: lower-case, tokenise, drop punctuation and stop words."""

    def __init__(self):
        # Single-character tokens treated as punctuation (repeated characters are harmless).
        self.punctuations = list(',:_!"*><@~-()%=\\^&|#$[]+:#|')
        # English stop-word list taken from the NLTK stopwords corpus.
        self.stop_words = set(nltk.corpus.stopwords.words('english'))

    def __call__(self, text, query=False):
        """Return the tokens of ``text`` that survive filtering.

        The text is lower-cased and tokenised with ``nltk.word_tokenize``;
        tokens found in ``self.punctuations`` or ``self.stop_words`` are dropped.
        ``query`` is accepted but not used by this implementation.
        """
        kept = []
        for tok in nltk.word_tokenize(text.lower()):
            if tok in self.punctuations or tok in self.stop_words:
                continue
            kept.append(tok)
        return kept

In [4]:
# Build the preprocessor once, then tokenise every tweet (one token list per tweet).
preprocess = Preprocess()
tokens = list(map(preprocess, tweets))

### 统计词频

In [9]:
# Inverted index: term -> postings, each posting being (document number, tf).
dictionary = {}
for i, token in enumerate(tokens):
    for term, freq in nltk.FreqDist(token).items():
        dictionary.setdefault(term, set()).add((i, freq))
# Turn every posting set into a list ordered by tf, largest first.
for k in dictionary:
    dictionary[k] = sorted(list(dictionary[k]), key=lambda x: x[1], reverse=True)

In [15]:
# Display the postings of the term 'home' as (document number, tf) pairs.
dictionary['home']

[(17517, 2),
 (17912, 2),
 (6802, 2),
 (11340, 2),
 (17628, 2),
 (9523, 2),
 (17533, 2),
 (16209, 2),
 (11341, 2),
 (5933, 1),
 (16222, 1),
 (12812, 1),
 (5513, 1),
 (7610, 1),
 (14348, 1),
 (15208, 1),
 (20492, 1),
 (23078, 1),
 (27473, 1),
 (18050, 1),
 (2575, 1),
 (17390, 1),
 (2585, 1),
 (16458, 1),
 (16855, 1),
 (24676, 1),
 (5627, 1),
 (27016, 1),
 (28040, 1),
 (22772, 1),
 (4209, 1),
 (6257, 1),
 (27042, 1),
 (16208, 1),
 (15673, 1),
 (25624, 1),
 (17540, 1),
 (18482, 1),
 (4254, 1),
 (27, 1),
 (3099, 1),
 (15518, 1),
 (17891, 1),
 (6932, 1),
 (18065, 1),
 (19650, 1),
 (7339, 1),
 (24747, 1),
 (7562, 1),
 (27448, 1),
 (10355, 1),
 (19059, 1),
 (25889, 1),
 (4030, 1),
 (15806, 1),
 (4972, 1),
 (4329, 1),
 (2586, 1),
 (4841, 1),
 (17129, 1),
 (9557, 1),
 (15875, 1),
 (13390, 1),
 (15868, 1),
 (21674, 1),
 (16869, 1),
 (17919, 1),
 (17581, 1),
 (3468, 1),
 (4807, 1),
 (7879, 1),
 (27345, 1),
 (5519, 1),
 (5578, 1),
 (16842, 1),
 (27594, 1),
 (17918, 1),
 (25086, 1),
 (17557, 1),
 (

### 定义postings merge运算

In [None]:
def union_list(l1, l2):
    """Merge two ascending lists into one ascending list.

    A value found at the current position of both inputs is emitted once.
    Both inputs are expected to be sorted in ascending order; neither is
    modified.
    """
    merged = []
    a, b = 0, 0
    len1, len2 = len(l1), len(l2)
    while a < len1 and b < len2:
        left, right = l1[a], l2[b]
        if left == right:
            merged.append(left)
            a += 1
            b += 1
        elif left < right:
            merged.append(left)
            a += 1
        else:
            merged.append(right)
            b += 1
    # At most one of the two tails is non-empty here.
    merged.extend(l1[a:])
    merged.extend(l2[b:])
    return merged

def intersection_list(l1, l2):
    """Return the values common to two ascending lists, in ascending order.

    Walks both lists with one cursor each and advances whichever cursor
    points at the smaller value. Neither input is modified.
    """
    common = []
    a, b = 0, 0
    len1, len2 = len(l1), len(l2)
    while a < len1 and b < len2:
        left, right = l1[a], l2[b]
        if left == right:
            common.append(left)
            a += 1
            b += 1
        elif left < right:
            a += 1
        else:
            b += 1
    return common

def difference_list(l1,l2):
    """Return the values of ascending list ``l1`` that do not occur in ascending list ``l2``.

    The result keeps the order of ``l1``. Neither input is modified.
    """
    remaining = []
    a, b = 0, 0
    len1, len2 = len(l1), len(l2)
    while a < len1 and b < len2:
        left, right = l1[a], l2[b]
        if left == right:
            # Present in both lists: drop it.
            a += 1
            b += 1
        elif left < right:
            remaining.append(left)
            a += 1
        else:
            b += 1
    # Whatever is left of l1 cannot be matched by l2 any more.
    remaining.extend(l1[a:])
    return remaining

### 定义查询运算

In [None]:
# Operator precedence used when converting a query to postfix form:
# a larger number binds tighter.
ops_rule = {'+': 1, '*': 2, '-': 3}


# Infix expression -> postfix expression, see
# https://blog.csdn.net/antineutrino/article/details/6763722
def middle_to_after(ss):
    """Convert an infix item sequence into postfix order.

    ``ss`` is a sequence whose items are the operator strings '+', '*', '-',
    the parenthesis strings '(' and ')', or operands (anything else).
    Operators are ordered by the precedences in ``ops_rule``; operators of
    equal precedence are emitted left to right. Returns a new list.
    """
    operators = ('+', '*', '-')
    output = []    # postfix result being built
    pending = []   # stack of operators and '(' waiting to be emitted
    for item in ss:
        if item in operators:
            # Emit stacked operators that bind at least as tightly as `item`,
            # stopping at an opening parenthesis or an empty stack.
            while pending and pending[-1] != '(' and ops_rule[item] <= ops_rule[pending[-1]]:
                output.append(pending.pop())
            pending.append(item)
        elif item == '(':
            pending.append(item)
        elif item == ')':
            # Emit operators back to the matching '(' and discard the pair.
            while pending:
                top = pending.pop()
                if top == '(':
                    break
                output.append(top)
        else:
            # Operand: goes straight to the output.
            output.append(item)

    # Flush whatever is still on the operator stack.
    while pending:
        output.append(pending.pop())

    return output

def expression_to_value(expression):
    """Evaluate a postfix expression and return the bottom of the value stack.

    Items equal to '+', '*' or '-' pop two operands (right operand first)
    and push ``cal(left, right, operator)``; every other item is pushed as
    an operand.
    """
    operands = []
    for item in expression:
        if item not in ['+', '*', '-']:
            operands.append(item)
            continue
        rhs = operands.pop()
        lhs = operands.pop()
        operands.append(cal(lhs, rhs, item))
    return operands[0]
 
def cal(n1, n2, op):
    """Apply the set operation named by ``op`` to two lists.

    '+' -> union_list, '*' -> intersection_list, '-' -> difference_list.
    Any other ``op`` yields None.
    """
    outcome = None
    if op == '-':
        outcome = difference_list(n1, n2)
    elif op == '*':
        outcome = intersection_list(n1, n2)
    elif op == '+':
        outcome = union_list(n1, n2)
    return outcome

def query_parse(query, index=None):
    """Translate a whitespace-separated boolean query into infix items.

    The words 'and', 'or' and 'not' become the operator strings '*', '+'
    and '-'; '(' and ')' are passed through; every other token is replaced
    by the ascending list of document numbers in which it occurs (an empty
    list for unknown terms).

    The index stores postings as (document number, tf) pairs. Only the
    document numbers are kept here, and they are sorted ascending: the
    merge routines compare list elements pairwise and need ascending,
    directly comparable keys, and the final result is used to index the
    tweet list.

    Parameters:
        query: query string with tokens separated by whitespace.
        index: mapping term -> iterable of (document number, tf) pairs.
            Defaults to the module-level ``dictionary``.

    Returns:
        A list mixing operator/parenthesis strings and document-number lists.
    """
    if index is None:
        index = dictionary
    operators = {'and': '*', 'or': '+', 'not': '-'}
    items = []
    for token in query.split():
        if token in operators:
            items.append(operators[token])
        elif token in ('(', ')'):
            items.append(token)
        else:
            items.append(sorted(doc_id for doc_id, _tf in index.get(token, ())))
    return items

# Output helpers that emphasise text in red
def toRed( s ):
    """Wrap ``s`` in the ANSI escape sequences ESC[31;2m ... ESC[0m."""
    return "\033[31;2m%s\033[0m" % (s,)


def print_with_emphasize(line, Q):
    """Print ``line`` with every word that is a query keyword shown in red.

    Parameters:
        line: the text to print; it is split on whitespace.
        Q: the query string. Parentheses and the operator words 'and',
           'or' and 'not' are ignored; the remaining tokens are keywords.

    Words are compared case-insensitively. Operators are removed as whole
    tokens rather than by substring replacement, so keywords that merely
    contain 'and' or 'or' (e.g. 'android', 'world') stay intact.
    Each printed word is followed by a space and the line by a blank line.
    """
    operators = {'and', 'or', 'not'}
    keywords = {
        word
        for word in Q.lower().replace('(', ' ').replace(')', ' ').split()
        if word not in operators
    }
    to_be_print = ''
    for word in line.split():
        if word.lower() in keywords:
            to_be_print += toRed(word) + ' '
        else:
            to_be_print += word + ' '
    print(to_be_print+'\n')

def display_query_result(query):
    """Run a boolean query and print the matching tweets.

    The query is lower-cased and every parenthesis is surrounded by spaces
    so that it becomes its own token, including a '(' at the very start or
    a ')' at the very end of the query. The query is then parsed, converted
    to postfix and evaluated; each resulting value is used to index
    ``tweets``. A header with the original query and the number of results
    is printed, followed by each tweet with the query words emphasised.
    A blank query prints a header reporting zero results.
    """
    normalized = query.lower().replace('(', ' ( ').replace(')', ' ) ')
    postfix = middle_to_after(query_parse(normalized))
    # An empty expression has nothing to evaluate; treat it as "no matches".
    matches = expression_to_value(postfix) if postfix else []
    results = [tweets[x] for x in matches]
    print("查询%s, 得到%d条结果：\n"%(toRed(query), len(results)))
    for result in results:
        print_with_emphasize(result, normalized)

### 显示查询交互

In [None]:
import tkinter as tk
from IPython.display import clear_output

In [None]:
# Main window of the query UI.
win = tk.Tk()
win.title('查询')
win.geometry('700x100')

# Text box bound to `query`, so the typed text can be read with query.get().
query = tk.StringVar()
entry = tk.Entry(win, relief='sunken',font='Calibri 23',width=33,textvariable=query)
entry.place(relx=0.43,rely=0.5,anchor='center')

def button_command():
    """Button callback: clear the cell output, then run and display the typed query."""
    clear_output()
    query_sentence = query.get()
    display_query_result(query_sentence)

# Button that triggers the search.
button = tk.Button(win, text='查询', height=2,width=8,command=button_command)
button.place(relx=0.9,rely=0.5,anchor='center')

In [None]:
# Start the Tk event loop so the window reacts to input.
win.mainloop()