In [1]:
import os
import re
import pandas as pd
import json
import jieba

In [2]:
def is_Chinese(uchar):
    """Return True when ``uchar`` falls in the range U+4E00..U+9FA5.

    The check is a plain string comparison, so it is meant to be called
    with a single character; longer strings are compared lexicographically.
    """
    return u'\u4e00' <= uchar <= u'\u9fa5'

In [3]:
# Helpers for preprocessing Chinese text.
# Covers: dropping non-Chinese characters, word segmentation, stop-word removal.
class preprocess_chinese:
    """Clean a raw Chinese sentence and turn it into a list of tokens.

    Attributes:
        record: list initialised empty; not used by any method of this class.
        stopwords: list of stop words, filled by ``load_stopwords``.
    """

    def __init__(self):
        self.record = []
        self.stopwords = []

    def load_stopwords(self, filename = '../停用词表_w2v/停用词库.txt'):
        """Append every line of ``filename`` (stripped) to ``self.stopwords``.

        The file is read as UTF-8. Calling this more than once appends again,
        it does not replace the words already loaded.

        Raises:
            OSError: if the file cannot be opened.
        """
        # The context manager closes the file even if reading/decoding fails.
        with open(filename, 'r', encoding = 'utf-8') as f_in:
            self.stopwords.extend(line.strip() for line in f_in)

    def remove_nonchinese(self, sentence):
        """Return ``sentence`` with every character rejected by ``is_Chinese`` removed.

        Input: a sentence (string). Output: a string made only of the kept
        characters, in their original order.
        """
        return ''.join(char for char in sentence if is_Chinese(char))

    def remove_stopwords(self, sentence):
        """Segment ``sentence`` with jieba and drop the stop words.

        The stop-word list is loaded from the default file on first use
        (whenever ``self.stopwords`` is still empty).

        Input: a sentence (string).
        Output: list of the remaining tokens, in order.
        """
        if len(self.stopwords) == 0:
            self.load_stopwords()
        # A set gives O(1) membership tests instead of scanning the whole
        # stop-word list once per token.
        stopword_set = set(self.stopwords)
        return [word for word in jieba.lcut(sentence) if word not in stopword_set]

    def process_one_sentence(self, ori_sen):
        """Preprocess one scraped sentence.

        Steps: keep only Chinese characters, then segment and remove stop
        words.

        Input: the original sentence (string).
        Output: list of tokens left after preprocessing.
        """
        sentence = self.remove_nonchinese(ori_sen)
        pre_sentence = self.remove_stopwords(sentence)
        return pre_sentence
    

In [6]:
class create_nontokenize_txt():
    """Gather scraped per-product review spreadsheets and export them.

    Attributes:
        product_ids: list of product id strings read by ``_get_ids`` (None until loaded).
        df_ids: ids of the products whose spreadsheet was read successfully.
        df_list: the DataFrames read, in the same order as ``df_ids``.
    """

    def __init__(self):
        self.product_ids = None
        self.df_ids = None
        self.df_list = None

    def _get_ids(self, product_ids_path = '../爬虫及其结果/笔记本id.txt'):
        """Read product ids from ``product_ids_path``.

        Each line is split on ``', '`` and only its first five entries are
        kept. The result is stored in ``self.product_ids`` and returned.
        """
        product_ids = []
        with open(product_ids_path, 'r') as f:
            for line in f:
                # Strip each id so the last one on a short line does not keep
                # its trailing newline (which would never match a file name),
                # and drop the empty entries produced by blank lines.
                ids = (p_id.strip() for p_id in line.split(', ')[:5])
                product_ids += [p_id for p_id in ids if p_id]
        self.product_ids = product_ids
        return product_ids

    def _read_data(self, file_path = '../爬虫及其结果'):
        """Read ``<file_path>/<id>.xlsx`` for every id in ``self.product_ids``.

        Ids whose spreadsheet cannot be read are skipped (best effort). The
        DataFrames and their ids are stored in ``self.df_list`` /
        ``self.df_ids``; the return value is their concatenation keyed by id.

        Raises:
            ValueError: from ``pd.concat`` when no spreadsheet could be read.
        """
        if self.product_ids is None:
            self._get_ids()
        df_list = []
        df_ids = []
        for p_id in self.product_ids:
            try:
                result_path = os.path.join(file_path, p_id + '.xlsx')
                p_df = pd.read_excel(result_path)
            except Exception:
                # Still best effort, but unlike a bare ``except`` this lets
                # KeyboardInterrupt / SystemExit stop the run.
                continue
            df_list.append(p_df)
            df_ids.append(p_id)
        df = pd.concat(df_list, keys = df_ids)
        self.df_list = df_list
        self.df_ids = df_ids
        return df

    def _ensure_data(self):
        """Load the spreadsheets if they have not been loaded yet."""
        if self.df_ids is None or self.df_list is None:
            self._read_data()

    def create_csv(self, path = 'ok.csv'):
        """Write all loaded spreadsheets, concatenated and keyed by id, to ``path``."""
        # Load lazily so this works without a prior explicit _read_data() call.
        self._ensure_data()
        df = pd.concat(self.df_list, keys = self.df_ids)
        df.to_csv(path)

    def create_txt(self, path = 'train_total.txt'):
        """Append one JSON object per review to ``path`` (one per line).

        Each object holds the product id, the original text taken from the
        '内容' column, and the token list returned by
        ``preprocess_chinese.process_one_sentence``. Progress is printed after
        each product.
        """
        if self.product_ids is None:
            self._get_ids()
        self._ensure_data()
        # One preprocessor for the whole run: a fresh instance per review
        # would re-read the stop-word file for every single sentence.
        preprocessor = preprocess_chinese()
        total = len(self.df_list)
        # Open the output once instead of re-opening it for every record.
        with open(path, 'a', encoding = 'utf-8') as f:
            for i, (p_id, p_df) in enumerate(zip(self.df_ids, self.df_list)):
                for ori_sen in p_df['内容']:
                    report_dict = {
                        '商品id': p_id,
                        '原文本': ori_sen,
                        '处理后的文本': preprocessor.process_one_sentence(ori_sen),
                    }
                    json.dump(report_dict, f, ensure_ascii=False)
                    f.write('\n')
                print(f'{i}/{total}\r')
        

In [7]:
# Run the export with the default paths: one JSON object per review is
# appended to 'train_total.txt', and progress is printed per product.
txt_creator = create_nontokenize_txt()
txt_creator.create_txt()

0/145
1/145
2/145
3/145
4/145
5/145
6/145
7/145
8/145
9/145
10/145
11/145
12/145
13/145
14/145
15/145
16/145
17/145
18/145
19/145
20/145
21/145
22/145
23/145
24/145
25/145
26/145
27/145
28/145
29/145
30/145
31/145
32/145
33/145
34/145
35/145
36/145
37/145
38/145
39/145
40/145
41/145
42/145
43/145
44/145
45/145
46/145
47/145
48/145
49/145
50/145
51/145
52/145
53/145
54/145
55/145
56/145
57/145
58/145
59/145
60/145
61/145
62/145
63/145
64/145
65/145
66/145
67/145
68/145
69/145
70/145
71/145
72/145
73/145
74/145
75/145
76/145
77/145
78/145
79/145
80/145
81/145
82/145
83/145
84/145
85/145
86/145
87/145
88/145
89/145
90/145
91/145
92/145
93/145
94/145
95/145
96/145
97/145
98/145
99/145
100/145
101/145
102/145
103/145
104/145
105/145
106/145
107/145
108/145
109/145
110/145
111/145
112/145
113/145
114/145
115/145
116/145
117/145
118/145
119/145
120/145
121/145
122/145
123/145
124/145
125/145
126/145
127/145
128/145
129/145
130/145
131/145
132/145
133/145
134/145
135/145
136/145
137/145
138/14