# TextCNN
1. 预训练词向量
2. TextCNN网络

In [2]:
# Mount Google Drive at /content/drive so the data files referenced below can
# be read. The google.colab module is only available inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Content Preprocessing

In [3]:
import bz2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import jieba

## word vector

从预训练的词向量模型中建立词到向量的映射

In [35]:
# Pre-trained word vectors, stored as a bz2-compressed UTF-8 text file.
WORD_VECTOR_PATH = '/content/drive/MyDrive/Colab Notebooks Project/NLP/TouTiao_text/sgns.weibo.word.bz2'

# Decode each line while iterating over the file instead of calling
# f.readlines() first, so the list of raw byte lines and the list of decoded
# lines are never both fully held in memory.
# `info` is the first line of the file; `content` is the list of all remaining
# lines (each still carrying its trailing newline).
with bz2.open(WORD_VECTOR_PATH) as f:
    info, *content = (raw_line.decode('utf-8') for raw_line in f)

In [36]:
# The second field of the first line of the embedding file is the vector
# dimensionality; parse it first so every row can be validated against it.
vector_dim = int(info.split()[1])

# Map each word to its embedding vector (float64, length vector_dim).
word_to_vector = {}

for ls in content:
    fields = ls.split()
    # Skip blank or malformed rows: a row must be exactly one word followed by
    # vector_dim numbers. Without this check a blank line raises on unpacking
    # and a short row is stored as a wrong-length vector that only fails later
    # when the embedding matrix is filled.
    if len(fields) != vector_dim + 1:
        continue
    word, *vector = fields
    word_to_vector[word] = np.array(vector).astype(np.float64)

# The special tokens get random vectors in [0, 1). A locally seeded generator
# makes them identical on every run without touching NumPy's global RNG state.
# NOTE: if the pre-trained vocabulary itself contains 'UNK' or 'PAD', those
# entries are overwritten here.
SPECIAL_TOKEN_SEED = 42
rng = np.random.default_rng(SPECIAL_TOKEN_SEED)
word_to_vector['UNK'] = rng.random(size=vector_dim)
word_to_vector['PAD'] = rng.random(size=vector_dim)

## training data

1. 将训练数据中的词转化成对应的 id；预训练词向量中没有的词统一映射为‘UNK’标识
2. 对文本数据进行初步清洗

In [39]:
# Read the training set, add a `sentence_cut` column holding the jieba token
# list of each sentence, then print a summary of the resulting frame.
train_csv_path = '/content/drive/MyDrive/Colab Notebooks Project/NLP/TouTiao_text/train.csv'
df_data = pd.read_csv(train_csv_path)
df_data['sentence_cut'] = df_data['sentence'].map(jieba.lcut)
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53360 entries, 0 to 53359
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            53360 non-null  int64 
 1   label         53360 non-null  int64 
 2   label_desc    53360 non-null  object
 3   sentence      53360 non-null  object
 4   sentence_cut  53360 non-null  object
dtypes: int64(2), object(3)
memory usage: 2.0+ MB


In [40]:
# Build the vocabulary from the training sentences and encode every sentence
# as a list of integer ids.
#   word_to_id / id_to_word : the two directions of the same mapping
#   id_for_sentence         : one id list per row of df_data
# Ids 0 and 1 are reserved for the special tokens; every other word receives
# the next free id the first time it is seen, so ids are contiguous.
UNK_ID, PAD_ID = 0, 1
word_to_id = {'UNK': UNK_ID, 'PAD': PAD_ID}
id_to_word = {UNK_ID: 'UNK', PAD_ID: 'PAD'}
id_for_sentence = []

for sentence in df_data['sentence_cut']:
    id_list = []
    for word in sentence:
        # Words without a pre-trained vector all collapse onto the UNK id and
        # are never added to the vocabulary.
        if word not in word_to_vector:
            id_list.append(UNK_ID)
            continue

        if word not in word_to_id:
            # The current vocabulary size is the next unused id. Using it
            # avoids a separate counter, which previously was named `id` and
            # shadowed the builtin id() for the rest of the session.
            next_id = len(word_to_id)
            word_to_id[word] = next_id
            id_to_word[next_id] = word

        id_list.append(word_to_id[word])

    id_for_sentence.append(id_list)

df_data['id_for_sentence'] = id_for_sentence

In [41]:
# Preview the encoded column: each row holds the list of word ids of one sentence.
df_data['id_for_sentence']

0        [2, 3, 4, 0, 5, 6, 7, 8, 9, 4, 10, 11, 6, 12, ...
1        [0, 23, 24, 25, 26, 27, 28, 0, 29, 30, 31, 32,...
2        [40, 41, 30, 42, 11, 43, 6, 44, 45, 46, 11, 6,...
3                  [53, 32, 54, 55, 56, 57, 0, 58, 59, 22]
4               [60, 34, 61, 62, 6, 0, 63, 64, 65, 66, 67]
                               ...                        
53355    [405, 447, 290, 0, 15918, 966, 81, 966, 13517,...
53356        [1379, 4833, 158, 84, 36245, 40657, 34, 2967]
53357    [44491, 848, 44492, 44493, 1264, 70, 11301, 75...
53358    [116, 107, 590, 34, 485, 3955, 6, 9846, 899, 2...
53359    [9619, 31951, 30867, 391, 6, 5303, 4172, 292, ...
Name: id_for_sentence, Length: 53360, dtype: object

## embedding matrix

构建词向量的矩阵

In [42]:
# Embedding matrix with one row per vocabulary id: row k holds the vector of
# the word whose id is k.
vocab_size = len(word_to_id)
embedding_mat = np.zeros((vocab_size, vector_dim))

for row, token in id_to_word.items():
    embedding_mat[row] = word_to_vector[token]

In [50]:
# Shape of the embedding matrix: (number of ids in word_to_id, vector_dim).
embedding_mat.shape

(44495, 300)

In [52]:
# Number of entries in the word -> vector table. Show the count rather than
# the dict itself, which would dump every stored vector into the output.
len(word_to_vector)

195196

In [49]:
# Sanity check: the embedding row of '大家' must equal its pre-trained vector
# element by element. The row is looked up through word_to_id rather than a
# hard-coded index, so the check stays valid if the vocabulary order changes.
word_to_vector['大家'] == embedding_mat[word_to_id['大家']]

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,