In [6]:
import re
import jieba
import pandas as pd
from PyQt5.QtWidgets import QApplication, QWidget, QVBoxLayout, QLabel, QLineEdit, QPushButton, QFileDialog
from flask import Flask, render_template_string, request, jsonify

app = Flask(__name__)
app.config['SECRET_KEY'] = 'secret_key'


def preprocess_text(text):
    # 去除标点符号、空格、数字、英文字母
    text = re.sub(r'[^\u4e00-\u9fa5]', '', text)
    # 限制字符长度为15
    text = text[:20]
    return text


# 定义分词函数
def tokenize(text):
    words = jieba.lcut(text)
    return words


def process_data(raw_data_file, user_dict_file):
    # 加载自定义分词字典
    jieba.load_userdict(user_dict_file)

    # 读取原始数据文件
    data = pd.read_csv(raw_data_file)

    result_set = []
    items = data[['商品名称', '商品链接', '商品销量']][:2000]
    # 数据预处理：去除标点符号、空格、数字、英文字母，并限制字符长度
    items['商品简称'] = items['商品名称'].apply(lambda x: preprocess_text(x))

    # 分词并聚合数据
    word_counts = {}
    word_sales = {}

    for index, row in items.iterrows():
        product_name = row['商品简称']
        sales = row['商品销量']
        words = tokenize(product_name)

        for word in words:
            if word in word_counts:
                word_counts[word] += 1
                word_sales[word] += sales
            else:
                word_counts[word] = 1
                word_sales[word] = sales

    # 构建结果表
    result = pd.DataFrame({'词': list(word_counts.keys()),
                           '出现次数': list(word_counts.values()),
                           '商品销量': list(word_sales.values())})
    result['平均销量'] = result['商品销量'] / result['出现次数']
    result_set.append(result)

    result = pd.concat(result_set)

    result.to_csv("result.csv", encoding="gb18030")
    return result


class MainWindow(QWidget):
    def __init__(self):
        super().__init__()

        self.raw_data_file = ""
        self.user_dict_file = ""

        self.layout = QVBoxLayout()
        self.label_raw_data = QLabel("原始数据文件：")
        self.text_raw_data = QLineEdit()
        self.button_raw_data = QPushButton("选择文件")
        self.button_raw_data.clicked.connect(self.select_raw_data_file)

        self.label_user_dict = QLabel("自定义分词文件：")
        self.text_user_dict = QLineEdit()
        self.button_user_dict = QPushButton("选择文件")
        self.button_user_dict.clicked.connect(self.select_user_dict_file)

        self.button_tokenize = QPushButton("分词")
        self.button_tokenize.clicked.connect(self.tokenize_data)

        self.layout.addWidget(self.label_raw_data)
        self.layout.addWidget(self.text_raw_data)
        self.layout.addWidget(self.button_raw_data)
        self.layout.addWidget(self.label_user_dict)
        self.layout.addWidget(self.text_user_dict)
        self.layout.addWidget(self.button_user_dict)
        self.layout.addWidget(self.button_tokenize)

        self.setLayout(self.layout)

    def select_raw_data_file(self):
        file_dialog = QFileDialog()
        file_dialog.setFileMode(QFileDialog.ExistingFile)
        file_dialog.setNameFilter("CSV Files (*.csv)")
        if file_dialog.exec_():
            filenames = file_dialog.selectedFiles()
            self.raw_data_file = filenames[0]
            self.text_raw_data.setText(self.raw_data_file)

    def select_user_dict_file(self):
        file_dialog = QFileDialog()
        file_dialog.setFileMode(QFileDialog.ExistingFile)
        file_dialog.setNameFilter("Text Files (*.txt)")
        if file_dialog.exec_():
            filenames = file_dialog.selectedFiles()
            self.user_dict_file = filenames[0]
            self.text_user_dict.setText(self.user_dict_file)

    def tokenize_data(self):
        if self.raw_data_file != "" and self.user_dict_file != "":
            result = process_data(self.raw_data_file, self.user_dict_file)
            self.show_result(result)

    def show_result(self, result):
        html = result.to_html()
        render_template = '''
        <html>
        <head>
            <style>
                table {
                    border-collapse: collapse;
                    width: 100%;
                }
                th, td {
                    text-align: left;
                    padding: 8px;
                    border-bottom: 1px solid #ddd;
                }
                th {
                    background-color: #f2f2f2;
                }
            </style>
        </head>
        <body>
            %s
        </body>
        </html>
        '''
        rendered = render_template_string(render_template % html)
        with app.test_request_context('/'):
            response = app.full_dispatch_request()
        response.set_data(rendered)
        response.headers['Content-Type'] = 'text/html'
        response.headers['Content-Length'] = len(response.get_data())
        response.headers['Access-Control-Allow-Origin'] = '*'
        response.headers['Access-Control-Allow-Methods'] = 'GET'
        response.headers['Access-Control-Allow-Headers'] = 'Content-Type'
        response.headers['Access-Control-Max-Age'] = '86400'
        response.headers['Access-Control-Allow-Credentials'] = 'true'
        response.headers['Access-Control-Expose-Headers'] = 'Content-Length'
        response.headers['Access-Control-Allow-Headers'] = 'Range'
        response.headers['Accept-Ranges'] = 'bytes'
        response.headers['Content-Range'] = 'bytes 0-'
        response.headers['Content-Disposition'] = 'inline'
        return response


@app.route('/')
def index():
    return '''
    <h1>分词工具</h1>
    <p>请选择原始数据文件和自定义分词文件，然后点击“分词”按钮进行分词。</p>
    <p>分词结果将显示在下方的表格中。</p>
    '''


@app.route('/process', methods=['POST'])
def process():
    raw_data_file = request.form['raw_data_file']
    user_dict_file = request.form['user_dict_file']
    result = process_data(raw_data_file, user_dict_file)
    return jsonify(result.to_dict())


if __name__ == '__main__':
    window = QApplication([])
    main_window = MainWindow()
    main_window.show()

    app.run()


ModuleNotFoundError: No module named 'PyQt5'

In [10]:
!conda install PyQt5

Collecting package metadata (current_repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.
Collecting package metadata (repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.

PackagesNotFoundError: The following packages are not available from current channels:

  - pyqt5

Current channels:

  - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/osx-arm64
  - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/noarch
  - https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/osx-arm64
  - https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/noarch
  - https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/msys2/osx-arm64
  - https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/msys2/noarch
  - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main/osx-arm64
  - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main/noarch
  - https://repo.anaconda.com/pkgs/

In [None]:
import re
import jieba
import pandas as pd
import tkinter as tk
from tkinter import filedialog
from tkinter import messagebox

def preprocess_text(text):
    # 去除标点符号、空格、数字、英文字母
    text = re.sub(r'[^\u4e00-\u9fa5]', '', text)
    # 限制字符长度为15
    text = text[:20]
    return text

# 定义分词函数
def tokenize(text):
    words = jieba.lcut(text)
    return words

def process_data(raw_data_file, user_dict_file):
    # 加载自定义分词字典
    jieba.load_userdict(user_dict_file)

    # 读取原始数据文件
    data = pd.read_csv(raw_data_file)

    result_set = []
    items = data[['商品名称', '商品链接', '商品销量']][:2000]
    # 数据预处理：去除标点符号、空格、数字、英文字母，并限制字符长度
    items['商品简称'] = items['商品名称'].apply(lambda x: preprocess_text(x))

    # 分词并聚合数据
    word_counts = {}
    word_sales = {}

    for index, row in items.iterrows():
        product_name = row['商品简称']
        sales = row['商品销量']
        words = tokenize(product_name)

        for word in words:
            if word in word_counts:
                word_counts[word] += 1
                word_sales[word] += sales
            else:
                word_counts[word] = 1
                word_sales[word] = sales

    # 构建结果表
    result = pd.DataFrame({'词': list(word_counts.keys()),
                           '出现次数': list(word_counts.values()),
                           '商品销量': list(word_sales.values())})
    result['平均销量'] = result['商品销量'] / result['出现次数']
    result_set.append(result)

    result = pd.concat(result_set)

    result.to_csv("result.csv", encoding="gb18030")
    return result

def select_raw_data_file():
    raw_data_file = filedialog.askopenfilename(filetypes=[('CSV Files', '*.csv')])
    entry_raw_data.delete(0, tk.END)
    entry_raw_data.insert(0, raw_data_file)

def select_user_dict_file():
    user_dict_file = filedialog.askopenfilename(filetypes=[('Text Files', '*.txt')])
    entry_user_dict.delete(0, tk.END)
    entry_user_dict.insert(0, user_dict_file)

def tokenize_data():
    raw_data_file = entry_raw_data.get()
    user_dict_file = entry_user_dict.get()
    
    if raw_data_file != "" and user_dict_file != "":
        try:
            result = process_data(raw_data_file, user_dict_file)
            messagebox.showinfo("分词结果", "分词完成！结果已保存到result.csv文件。")
        except Exception as e:
            messagebox.showerror("错误", str(e))
    else:
        messagebox.showwarning("警告", "请先选择原始数据文件和自定义分词文件。")

# 创建主窗口
window = tk.Tk()
window.title("分词工具")

# 创建文件选择框和按钮
label_raw_data = tk.Label(window, text="原始数据文件：")
label_raw_data.grid(row=0, column=0, padx=10, pady=5, sticky=tk.W)
entry_raw_data = tk.Entry(window, width=50)
entry_raw_data.grid(row=0, column=1, padx=10, pady=5)
button_select_raw_data = tk.Button(window, text="选择文件", command=select_raw_data_file)
button_select_raw_data.grid(row=0, column=2, padx=10, pady=5)

label_user_dict = tk.Label(window, text="自定义分词文件：")
label_user_dict.grid(row=1, column=0, padx=10, pady=5, sticky=tk.W)
entry_user_dict = tk.Entry(window, width=50)
entry_user_dict.grid(row=1, column=1, padx=10, pady=5)
button_select_user_dict = tk.Button(window, text="选择文件", command=select_user_dict_file)
button_select_user_dict.grid(row=1, column=2, padx=10, pady=5)

button_tokenize = tk.Button(window, text="分词", command=tokenize_data)
button_tokenize.grid(row=2, column=1, padx=10, pady=10)

window.mainloop()


In [2]:
!pyinstaller main.py --onefile --noconsole

433 INFO: PyInstaller: 5.13.0
433 INFO: Python: 3.9.15 (conda)
440 INFO: Platform: macOS-13.2.1-arm64-arm-64bit
441 INFO: wrote /Users/suntian/GitHub/huitun/wordseg_tool/main.spec
444 INFO: Extending PYTHONPATH with paths
['/Users/suntian/GitHub/huitun/wordseg_tool']
773 INFO: checking Analysis
829 INFO: Building because hiddenimports changed
829 INFO: Initializing module dependency graph...
830 INFO: Caching module graph hooks...
836 INFO: Analyzing base_library.zip ...
1919 INFO: Loading module hook 'hook-encodings.py' from '/Users/suntian/opt/anaconda3/envs/paddle/lib/python3.9/site-packages/PyInstaller/hooks'...
2703 INFO: Loading module hook 'hook-pickle.py' from '/Users/suntian/opt/anaconda3/envs/paddle/lib/python3.9/site-packages/PyInstaller/hooks'...
3090 INFO: Loading module hook 'hook-heapq.py' from '/Users/suntian/opt/anaconda3/envs/paddle/lib/python3.9/site-packages/PyInstaller/hooks'...
3357 INFO: Caching module dependency graph...
3400 INFO: running Analysis Analysis-00.t