<a href="https://colab.research.google.com/github/ShinerayLu/NN/blob/main/fasttext.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 文本分类项目


1. fasttest

https://github.com/facebookresearch/fastText

2. BERT

https://github.com/Jiakui/awesome-bert

3. GPT-2

https://github.com/Morizeyao/GPT2-Chinese

# 第一步：安装fasttext

> git clone https://github.com/facebookresearch/fastText.git

> cd fastText

> pip install .

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
pip install fasttext

Collecting fasttext
[?25l  Downloading https://files.pythonhosted.org/packages/f8/85/e2b368ab6d3528827b147fdb814f8189acc981a4bc2f99ab894650e05c40/fasttext-0.9.2.tar.gz (68kB)
[K     |████▊                           | 10kB 14.4MB/s eta 0:00:01[K     |█████████▌                      | 20kB 1.7MB/s eta 0:00:01[K     |██████████████▎                 | 30kB 2.4MB/s eta 0:00:01[K     |███████████████████             | 40kB 3.0MB/s eta 0:00:01[K     |███████████████████████▉        | 51kB 2.0MB/s eta 0:00:01[K     |████████████████████████████▋   | 61kB 2.4MB/s eta 0:00:01[K     |████████████████████████████████| 71kB 2.0MB/s 
Building wheels for collected packages: fasttext
y
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp36-cp36m-linux_x86_64.whl size=3025794 sha256=483a721e66c97031b737c462c02aa2329ff40f9522edc31b37217ea0f4784448
  Stored in directory: /root/.cache/pip/wheels/98/ba/7f/b154944a1cf5a8cee91c1

# 第二步：准备数据集

In [2]:
#  标签

mapper_tag = {
    '财经': 'Finance',
    '彩票': 'Lottery',
    '房产': 'Property',
    '股票': 'Shares',
    '家居': 'Furnishing',
    '教育': 'Education',
    '科技': 'Technology',
    '社会': 'Sociology',
    '时尚': 'Fashion',
    '时政': 'Affairs',
    '体育': 'Sports',
    '星座': 'Constellation',
    '游戏': 'Game',
    '娱乐': 'Entertainment'
}

# 第三步：数据预处理

In [9]:
# 把所有需要调用的库都写在前面

import re
from types import MethodType, FunctionType
import jieba
import numpy as np
import pandas as pd
from random import shuffle
import torch
import os

import fasttext.FastText as fasttext


# 数据集路径写一个全局变量

PATH = '/content/drive/My Drive/NLP第一次课/01_fasttext/datasets/'

In [10]:
# 数据清洗

def clean_txt(raw):
    fil = re.compile(r"[^0-9a-zA-Z\u4e00-\u9fa5]+")
    return fil.sub(' ', raw)

def seg(sentence, sw, apply=None):
    if isinstance(apply, FunctionType) or isinstance(apply, MethodType):
        sentence = apply(sentence)
    return ' '.join([i for i in jieba.cut(sentence) if i.strip() and i not in sw])

def stop_words():
    with open(PATH+'stopwords.txt', 'r', encoding='utf-8') as swf:
        return [line.strip() for line in swf]

    
# 对某个sentence进行处理：
content = '上海天然橡胶期价周三再创年内新高，主力合约突破21000元/吨重要关口。'
res = seg(content.lower().replace('\n', ''), stop_words(), apply=clean_txt)
res

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.987 seconds.
Prefix dict has been built successfully.


'上海 天然橡胶 期价 周三 再创 年内 新高 主力 合约 突破 21000 元 吨 关口'

In [11]:
# 切割数据
# 先将txt文件转换成csv文件，方便后面的计算

class _MD(object):
    mapper = {
        str: '',
        int: 0,
        list: list,
        dict: dict,
        set: set,
        bool: False,
        float: .0
    }

    def __init__(self, obj, default=None):
        self.dict = {}
        assert obj in self.mapper, \
            'got a error type'
        self.t = obj
        if default is None:
            return
        assert isinstance(default, obj), \
            f'default ({default}) must be {obj}'
        self.v = default

    def __setitem__(self, key, value):
        self.dict[key] = value


    def __getitem__(self, item):
        if item not in self.dict and hasattr(self, 'v'):
            self.dict[item] = self.v
            return self.v
        elif item not in self.dict:
            if callable(self.mapper[self.t]):
                self.dict[item] = self.mapper[self.t]()
            else:
                self.dict[item] = self.mapper[self.t]
            return self.dict[item]
        return self.dict[item]


def defaultdict(obj, default=None):
    return _MD(obj, default)


class TransformData(object):
    def to_csv(self, handler, output, index=False):
        dd = defaultdict(list)
        for line in handler:
            label, content = line.split(',', 1)
            dd[label.strip('__label__').strip()].append(content.strip())

        df = pd.DataFrame()
        for key in dd.dict:
            col = pd.Series(dd[key], name=key)
            df = pd.concat([df, col], axis=1)
        return df.to_csv(output, index=index, encoding='utf-8')


def split_train_test(source, auth_data=False):
    if not auth_data:
        train_proportion = 0.8
    else:
        train_proportion = 0.98

    basename = source.rsplit('.', 1)[0]
    train_file = basename + '_train.txt'
    test_file = basename + '_test.txt'

    handel = pd.read_csv(source, index_col=False, low_memory=False)
    train_data_set = []
    test_data_set = []
    for head in list(handel.head()):
        train_num = int(handel[head].dropna().__len__() * train_proportion)
        sub_list = [f'__label__{head} , {item.strip()}\n' for item in handel[head].dropna().tolist()]
        train_data_set.extend(sub_list[:train_num])
        test_data_set.extend(sub_list[train_num:])
    shuffle(train_data_set)
    shuffle(test_data_set)

    with open(train_file, 'w', encoding='utf-8') as trainf,\
        open(test_file, 'w', encoding='utf-8') as testf:
        for tds in train_data_set:
            trainf.write(tds)
        for i in test_data_set:
            testf.write(i)

    return train_file, test_file

In [12]:
# 转化成csv
td = TransformData()
handler = open(PATH+'data.txt')
td.to_csv(handler, PATH+'data.csv')
handler.close()

# 将csv文件切割，会生成两个文件（data_train.txt和data_test.txt）
train_file, test_file = split_train_test(PATH+'data.csv', auth_data=True)

# 第四步：训练模型

In [13]:
def train_model(ipt=None, opt=None, model='', dim=100, epoch=5, lr=0.1, loss='softmax'):
    np.set_printoptions(suppress=True)
    if os.path.isfile(model):
        classifier = fasttext.load_model(model)
    else:
        classifier = fasttext.train_supervised(ipt, label='__label__', dim=dim, epoch=epoch,
                                         lr=lr, wordNgrams=2, loss=loss)
        """
          训练一个监督模型, 返回一个模型对象

          @param input:           训练数据文件路径
          @param lr:              学习率
          @param dim:             向量维度
          @param ws:              cbow模型时使用
          @param epoch:           次数
          @param minCount:        词频阈值, 小于该值在初始化时会过滤掉
          @param minCountLabel:   类别阈值，类别小于该值初始化时会过滤掉
          @param minn:            构造subword时最小char个数
          @param maxn:            构造subword时最大char个数
          @param neg:             负采样
          @param wordNgrams:      n-gram个数
          @param loss:            损失函数类型, softmax, ns: 负采样, hs: 分层softmax
          @param bucket:          词扩充大小, [A, B]: A语料中包含的词向量, B不在语料中的词向量
          @param thread:          线程个数, 每个线程处理输入数据的一段, 0号线程负责loss输出
          @param lrUpdateRate:    学习率更新
          @param t:               负采样阈值
          @param label:           类别前缀
          @param verbose:         ??
          @param pretrainedVectors: 预训练的词向量文件路径, 如果word出现在文件夹中初始化不再随机
          @return model object
        """
        classifier.save_model(opt)
    return classifier


In [15]:
# 调整以下参数观察分类结果

dim = 200
lr = 5
epoch = 5

model = f'/content/drive/My Drive/NLP第一次课/01_fasttext/data/data_test_dim{str(dim)}_lr0{str(lr)}_iter{str(epoch)}.model'

classifier = train_model(ipt=PATH+'data_train.txt',
                         opt=model,
                         model=model,
                         dim=dim, epoch=epoch, lr=0.5
                         )

result = classifier.test(PATH+'data_test.txt')
print(result)

# 整体的结果为(测试数据量，precision，recall)：

(989, 0.9787664307381193, 0.9787664307381193)


In [16]:
def cal_precision_and_recall(file=PATH+'data_test.txt'):
    precision = defaultdict(int, 1)
    recall = defaultdict(int, 1)
    total = defaultdict(int, 1)
    with open(file) as f:
        for line in f:
            label, content = line.split(',', 1)
            total[label.strip().strip('__label__')] += 1
            labels2 = classifier.predict([seg(sentence=content.strip(), sw='', apply=clean_txt)])
            pre_label, sim = labels2[0][0][0], labels2[1][0][0]
            recall[pre_label.strip().strip('__label__')] += 1

            if label.strip() == pre_label.strip():
                precision[label.strip().strip('__label__')] += 1

    print('precision', precision.dict)
    print('recall', recall.dict)
    print('total', total.dict)
    for sub in precision.dict:
        pre = precision[sub] / total[sub]
        rec =  precision[sub] / recall[sub]
        F1 = (2 * pre * rec) / (pre + rec)
        print(f"{sub.strip('__label__')}  precision: {str(pre)}  recall: {str(rec)}  F1: {str(F1)}")

In [19]:
def main(source):
    basename = source.rsplit('.', 1)[0]
    csv_file = basename + '.csv'

    td = TransformData()
    handler = open(source)
    td.to_csv(handler, csv_file)
    handler.close()

    train_file, test_file = split_train_test(csv_file)

    dim = 100
    lr = 5
    epoch = 5
    model = f'/content/drive/My Drive/NLP第一次课/01_fasttext/data/data_dim{str(dim)}_lr0{str(lr)}_iter{str(epoch)}.model'

    classifier = train_model(ipt=train_file,
                             opt=model,
                             model=model,
                             dim=dim, epoch=epoch, lr=0.5
                             )

    result = classifier.test(test_file)
    print(result)

    cal_precision_and_recall(test_file)

In [20]:
if __name__ == '__main__':
    main(PATH+'data.txt')

(9885, 0.9742033383915023, 0.9742033383915023)
precision {'Education': 997, 'Technology': 998, 'Entertainment': 1001, 'Furnishing': 997, 'Sports': 996, 'Shares': 999, 'Financ': 998, 'Gam': 867, 'Sociology': 987, 'Affairs': 1000}
recall {'Education': 1014, 'Technology': 999, 'Entertainment': 1006, 'Furnishing': 1003, 'Sports': 997, 'Shares': 1002, 'Financ': 1001, 'Gam': 870, 'Sociology': 1001, 'Affairs': 1002}
total {'Education': 1001, 'Technology': 1001, 'Entertainment': 1001, 'Furnishing': 1001, 'Sports': 1001, 'Shares': 1001, 'Financ': 1001, 'Gam': 876, 'Sociology': 1001, 'Affairs': 1001, 'Property': 11}
Education  precision: 0.996003996003996  recall: 0.9832347140039448  F1: 0.9895781637717121
Technology  precision: 0.997002997002997  recall: 0.998998998998999  F1: 0.998
Entertainment  precision: 1.0  recall: 0.9950298210735586  F1: 0.9975087194818136
Furnishing  precision: 0.996003996003996  recall: 0.9940179461615155  F1: 0.9950099800399202
Sports  precision: 0.995004995004995  re