# TextCNN
1. 预训练词向量
2. TextCNN网络

In [92]:
from google.colab import drive
# Mount Google Drive at /content/drive; the embedding file and train.csv read in later cells live under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Content Preprocessing

In [93]:
import bz2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import jieba

## word vector

从词向量预训练模型中建立词到向量映射

In [94]:
# Read the bz2-compressed embedding file: the first decoded line goes to `info`,
# every remaining line is collected into the list `content`.
with bz2.open('/content/drive/MyDrive/Colab Notebooks Project/NLP/TouTiao_text/sgns.weibo.word.bz2') as f:
    info, *content = (raw_line.decode('utf-8') for raw_line in f)

In [95]:
# The second whitespace-separated field of the header line is the vector
# dimension; read it first so every row can be validated against it.
vector_dim = int(info.split()[1])

word_to_vector = {}
n_skipped = 0

for ls in content:
    fields = ls.split()
    # A usable row is one token followed by exactly `vector_dim` components.
    # Blank lines, or rows whose token was swallowed by whitespace splitting,
    # are skipped here instead of surfacing later as a shape error.
    if len(fields) != vector_dim + 1:
        n_skipped += 1
        continue
    word, vector = fields[0], fields[1:]
    word_to_vector[word] = np.array(vector, dtype=np.float64)

if n_skipped:
    print(f'skipped {n_skipped} malformed embedding rows')

# Special tokens get random vectors. No seed is set in the visible cells, so
# these two vectors differ from run to run.
word_to_vector['UNK'] = np.random.random(size=vector_dim)
word_to_vector['PAD'] = np.random.random(size=vector_dim)

## training data

1. 将训练数据中的词转换成对应的 id；预训练词向量中没有的词统一映射为‘UNK’标识
2. 对文本数据进行初步清洗

In [96]:
# Load the training set, then attach two derived columns in one step:
#   sentence_cut    - the sentence tokenised by jieba
#   label_for_train - integer codes produced by pd.factorize on `label`
df_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks Project/NLP/TouTiao_text/train.csv')
df_data = df_data.assign(
    sentence_cut=df_data['sentence'].apply(jieba.lcut),
    label_for_train=pd.factorize(df_data['label'])[0],
)
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53360 entries, 0 to 53359
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id               53360 non-null  int64 
 1   label            53360 non-null  int64 
 2   label_desc       53360 non-null  object
 3   sentence         53360 non-null  object
 4   sentence_cut     53360 non-null  object
 5   label_for_train  53360 non-null  int64 
dtypes: int64(3), object(3)
memory usage: 2.4+ MB


In [97]:
# Vocabulary restricted to words that occur in the training data AND have a
# pretrained vector. Ids 0 and 1 are reserved for the special tokens.
word_to_id = {'UNK': 0, 'PAD': 1}
id_to_word = {0: 'UNK', 1: 'PAD'}
id_for_sentence = []

for sentence in df_data['sentence_cut']:
    id_list = []
    for word in sentence:
        if word not in word_to_vector:
            # No pretrained vector for this word: use the shared UNK id.
            id_list.append(word_to_id['UNK'])
            continue

        if word not in word_to_id:
            # Ids are dense (0..n-1), so the next free id is the current
            # vocabulary size. This also avoids a counter named `id`, which
            # would shadow the builtin for the rest of the session.
            next_id = len(word_to_id)
            word_to_id[word] = next_id
            id_to_word[next_id] = word

        id_list.append(word_to_id[word])

    id_for_sentence.append(id_list)

df_data['id_for_sentence'] = id_for_sentence

In [98]:
# Inspect the encoded sentences; id 0 marks a word that has no pretrained vector.
df_data['id_for_sentence']

0        [2, 3, 4, 0, 5, 6, 7, 8, 9, 4, 10, 11, 6, 12, ...
1        [0, 23, 24, 25, 26, 27, 28, 0, 29, 30, 31, 32,...
2        [40, 41, 30, 42, 11, 43, 6, 44, 45, 46, 11, 6,...
3                  [53, 32, 54, 55, 56, 57, 0, 58, 59, 22]
4               [60, 34, 61, 62, 6, 0, 63, 64, 65, 66, 67]
                               ...                        
53355    [405, 447, 290, 0, 15918, 966, 81, 966, 13517,...
53356        [1379, 4833, 158, 84, 36245, 40657, 34, 2967]
53357    [44491, 848, 44492, 44493, 1264, 70, 11301, 75...
53358    [116, 107, 590, 34, 485, 3955, 6, 9846, 899, 2...
53359    [9619, 31951, 30867, 391, 6, 5303, 4172, 292, ...
Name: id_for_sentence, Length: 53360, dtype: object

## embedding matrix

构建词向量矩阵：矩阵的第 i 行就是 id 为 i 的词对应的向量

In [99]:
# Row i holds the pretrained vector of the word whose id is i, so a word id
# indexes straight into the matrix.
embedding_mat = np.stack(
    [word_to_vector[id_to_word[word_id]] for word_id in range(len(word_to_id))]
)

## padding & truncated

让每个训练样本的序列长度一样

In [100]:
max_length = 20
# Look the padding id up instead of hard-coding it, so it cannot drift from
# the vocabulary mapping.
pad_id = word_to_id['PAD']

# Right-pad short sentences with pad_id, then cut everything to max_length.
# A list multiplied by a non-positive count is empty, so sentences that are
# already long enough are only truncated.
df_data['sentence_for_train'] = df_data['id_for_sentence'].apply(
    lambda ids: (ids + [pad_id] * (max_length - len(ids)))[:max_length])

# TextCNN Network

## backbone

In [245]:
import torch.nn as nn
import torch

class TextCNN(nn.Module):
    """Single-branch TextCNN classifier.

    Pipeline: embedding -> convolution over token windows -> max-over-time
    pooling -> linear layer -> softmax.

    Args:
        seq_length: Nominal number of token ids per sample. Kept for interface
            compatibility; pooling is now global, so inputs of any length of
            at least ``filter_height`` are accepted.
        emb_size: Number of rows in the embedding table.
        emb_dim: Width of each embedding vector.
        n_filter: Number of convolution filters (output channels).
        filter_height: Number of consecutive tokens covered by each filter.
        n_class: Number of output classes.
    """

    def __init__(self, seq_length, emb_size, emb_dim, n_filter, filter_height, n_class):
        super().__init__()
        self.seq_length = seq_length
        self.embedding = nn.Embedding(emb_size, emb_dim)
        # Each filter spans the full embedding width, so it only slides along
        # the token axis.
        self.conv = nn.Conv2d(1, n_filter, (filter_height, emb_dim))
        # Global max over the token axis. A fixed-size MaxPool1d is a global
        # max only when the input length equals seq_length; for longer inputs
        # it either pools over just the first window or leaves extra positions
        # that break the linear layer.
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.fc = nn.Linear(n_filter, n_class)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return class probabilities for a batch of token-id sequences.

        Args:
            x: Integer tensor of token ids, shape (batch, tokens).

        Returns:
            Tensor of shape (batch, n_class); each row sums to 1.
        """
        # (batch, tokens) -> (batch, 1, tokens, emb_dim): add a channel axis for Conv2d.
        x = self.embedding(x).unsqueeze(1)
        # -> (batch, n_filter, tokens - filter_height + 1)
        x = self.conv(x).squeeze(3)
        # -> (batch, n_filter)
        x = self.pool(x).squeeze(2)
        x = self.fc(x)
        # NOTE(review): the output is probabilities, not logits. If this is
        # trained with nn.CrossEntropyLoss (which applies log-softmax itself),
        # softmax would effectively be applied twice. Confirm against the loss
        # chosen in the trainer.
        out = self.softmax(x)
        return out

# Smoke test: a tiny randomly initialised model fed 12 random sequences of
# 10 token ids drawn from [0, 20).
model = TextCNN(seq_length=10, emb_size=20, emb_dim=7, n_filter=3, filter_height=4, n_class=3)
seq = torch.tensor(np.random.randint(0, 20, size=(12, 10)), dtype=torch.long)
model(seq)

tensor([[0.2797, 0.5482, 0.1720],
        [0.4052, 0.4048, 0.1900],
        [0.4263, 0.4593, 0.1143],
        [0.2444, 0.6139, 0.1418],
        [0.3495, 0.5728, 0.0777],
        [0.3189, 0.5653, 0.1158],
        [0.4794, 0.3682, 0.1525],
        [0.5056, 0.4129, 0.0815],
        [0.4531, 0.4562, 0.0907],
        [0.5102, 0.3759, 0.1139],
        [0.5259, 0.3003, 0.1738],
        [0.2888, 0.5922, 0.1190]], grad_fn=<SoftmaxBackward>)

## trainer

In [None]:
import pytorch_lightning as pl
import torch.nn as nn
import torch

class TrainModel(pl.LightningModule):
    """Lightning wrapper that trains a backbone network with Adam.

    Args:
        backbone: Module that maps a batch of inputs to predictions.
        loss: Callable taking ``(predictions, targets)`` and returning a
            scalar loss tensor.
        lr: Learning rate passed to Adam.

    Raises:
        ValueError: If ``backbone`` or ``loss`` is not provided.
    """

    def __init__(self, backbone=None, loss=None, lr=1e-3):
        super().__init__()
        # All arguments are optional in the signature so a zero-argument call
        # still parses, but a model cannot train without a network and a loss.
        if backbone is None or loss is None:
            raise ValueError('TrainModel requires both a backbone and a loss')
        self.backbone = backbone
        self.loss = loss
        self.lr = lr

    def _shared_step(self, batch, log_name):
        """Run the backbone on one batch, log the loss under ``log_name`` and return it."""
        x, y = batch
        output = self.backbone(x)
        loss = self.loss(output, y)
        self.log(log_name, loss)
        return loss

    def training_step(self, batch, batch_idx):
        """Compute and log the loss for one training batch."""
        return self._shared_step(batch, 'Training Loss')

    def validation_step(self, batch, batch_idx):
        """Compute and log the loss for one validation batch."""
        return self._shared_step(batch, 'Validation Loss')

    def configure_optimizers(self):
        """Optimise all parameters with Adam at the configured learning rate."""
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
        return optimizer

    def forward(self, x):
        """Return the backbone's predictions for ``x``."""
        predict = self.backbone(x)
        return predict

## dataset

In [231]:
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split


# Synthetic stand-in data: 500 sequences of 8 token ids in [0, 100), each with
# a label in {0, 1, 2}.
x_tensor = torch.LongTensor(np.random.randint(0, 100, (500, 8)))
y_tensor = torch.LongTensor(np.random.randint(0, 3, 500))

x_train, x_valid, y_train, y_valid = train_test_split(x_tensor, y_tensor)

train_loader = DataLoader(TensorDataset(x_train, y_train), batch_size=128, shuffle=True, num_workers=2)
# Validation order does not need to be shuffled; a fixed order also keeps
# Lightning from warning about a shuffled validation loader.
valid_loader = DataLoader(TensorDataset(x_valid, y_valid), batch_size=128, shuffle=False, num_workers=2)

# Training

In [None]:
# NOTE(review): TrainModel needs a backbone, a loss and a learning rate before fit() can run; confirm how they are supplied.
model = TrainModel()
# NOTE(review): tb_logger and checkpoint_callback are not defined in any cell shown in this notebook; confirm where they come from.
# Train for at most 50 epochs with gpus=1, then validate on valid_loader.
trainer = pl.Trainer(max_epochs=50, logger=tb_logger, gpus=1, callbacks=[checkpoint_callback])
trainer.fit(model, train_loader, valid_loader)