# CNN Model for Occupational Coding

In [None]:
import pandas as pd # Read data and output

import jieba # Word segmentation
import jieba.analyse as analyse

import numpy as np # Data processing

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten, MaxPool1D, Conv1D, Convolution1D
from keras.layers import Embedding
from keras import regularizers
from keras.layers import BatchNormalization
from keras.models import load_model

from gensim.models import KeyedVectors # Import pre-trained word vectos

import matplotlib.pyplot as plt # Draw graph

## Set Up
Configure the file paths and settings required to train the model.<br>
Variables that need to be set: `word2vec_path`, `stopwords_path`, `vec_size`

In [1]:
# Input file locations and the dimensionality of the pre-trained embeddings

vec_size = 200  # Dimension of the pre-trained word vectors

word2vec_path = "tencent-ailab-embedding-zh-d200-v0.2.0-s.txt"  # Pre-trained word vectors file
stopwords_path = "stopword.txt"  # Stop-words file

In [None]:
# Load the pre-trained word vectors from the plain-text (non-binary)
# word2vec file at word2vec_path.
wv_from_text = KeyedVectors.load_word2vec_format(
    fname=word2vec_path,
    binary=False,
)

In [None]:
# Declare functions used

# Create list of stop-words
def stopwordslist(filepath):
    """Read a UTF-8 text file and return its lines as a list of stop-words.

    Args:
        filepath: Path of the stop-word file; each line is one entry.

    Returns:
        A list with one string per line of the file, in file order, with
        leading and trailing whitespace removed. Blank lines are kept as
        empty strings.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # The context manager guarantees the file handle is closed, even if
    # decoding fails part-way through the file.
    with open(filepath, 'r', encoding='utf-8') as stopword_file:
        return [line.strip() for line in stopword_file]
 
 
# Stop-word sets that have already been loaded, keyed by file path. Without
# this cache the stop-word file would be re-read for every sentence.
_stopword_cache = {}


def _cached_stopwords(path):
    """Return the stop-words stored at *path* as a set, reading the file only once.

    NOTE: edits made to the file after the first load are not picked up
    until this cell is re-run (which resets the cache).
    """
    if path not in _stopword_cache:
        # A set gives constant-time membership tests in seg_sentence.
        _stopword_cache[path] = frozenset(stopwordslist(path))
    return _stopword_cache[path]


# Word segmentation
def seg_sentence(sentence):
    """Segment a sentence with jieba and drop stop-words and blank tokens.

    Args:
        sentence: The text to segment. It must be a string, because
            ``sentence.strip()`` is called before segmentation.

    Returns:
        A list of the tokens produced by ``jieba.lcut``, in order, excluding
        every token found in the stop-word file at ``stopwords_path`` and
        the single-character tokens ``'\\t'`` and ``' '``.
    """
    stopwords = _cached_stopwords(stopwords_path)
    return [
        word
        for word in jieba.lcut(sentence.strip())
        # Delete the stop-words, then delete tab / space tokens
        if word not in stopwords and word != '\t' and word != ' '
    ]

# Change the segmented words into word vectors
def transform_to_matrix(x, padding_size=10, vec_size=vec_size):
    """Convert tokenised sentences into fixed-length lists of word vectors.

    Each sentence is truncated or zero-padded to exactly ``padding_size``
    rows. Row ``i`` is the vector looked up in ``wv_from_text`` for the
    i-th token, or a zero vector of length ``vec_size`` when the sentence
    has no i-th token or the token has no pre-trained vector.

    Args:
        x: Iterable of sentences, each an indexable sequence of tokens.
        padding_size: Number of rows produced for every sentence.
        vec_size: Length of the zero vector used as filler. This should
            match the dimension of the vectors in ``wv_from_text``,
            otherwise rows will have inconsistent lengths.

    Returns:
        A list with one entry per sentence; each entry is a list of
        ``padding_size`` rows, each row a list of numbers.
    """
    res = []
    for sen in x:
        matrix = []
        for i in range(padding_size):
            try:
                matrix.append(wv_from_text[sen[i]].tolist())
            except (IndexError, KeyError):
                # IndexError: the sentence has fewer than padding_size tokens.
                # KeyError: the token is not in the pre-trained vocabulary.
                # Any other exception is a genuine error and is allowed to
                # propagate instead of being silently turned into zeros.
                matrix.append([0] * vec_size)
        res.append(matrix)
    return res

## Model for Coding

### Create Model

In [None]:
# Build the network as one ordered stack of layers:
# convolution -> max pooling -> convolution -> flatten -> dropout ->
# batch normalisation -> dense -> dropout -> softmax output.
model = Sequential([
    Conv1D(filters=256, kernel_size=3, padding='same', activation='relu'),
    MaxPool1D(pool_size=3, strides=3, padding='same'),
    Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'),
    Flatten(),
    Dropout(0.3),
    BatchNormalization(),
    Dense(256, activation='relu'),
    Dropout(0.2),
    # Output layer with 100000 softmax units, one per class index
    Dense(units=100000, activation='softmax'),
])

# Configure training: Adam optimizer, sparse categorical cross-entropy loss,
# and accuracy reported as the metric.
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

### Data Processing
Variables that need to be set: `training_data_path`, `train_sample`, `train_label`

In [None]:
# Location of the Excel workbook that holds the training data

training_data_path = "hy_training.xlsx"  # Training data

In [None]:
# Column names of the text samples and of their tags in the workbook
train_sample, train_label = 'QG302', 'QG302CODE'

# Read the training workbook, using its first column as the row index
train_pd = pd.read_excel(training_data_path, index_col=0)

# Pull the sample and tag columns out as NumPy arrays
x_train = np.array(train_pd[train_sample])  # training data
y_train = np.array(train_pd[train_label])  # tags

In [None]:
# Segment every training sentence with seg_sentence
temp = [seg_sentence(text) for text in x_train]

# Convert the segmented sentences to word-vector matrices, then store both
# the samples and the tags as NumPy arrays
x_train = np.array(transform_to_matrix(temp))
y_train = np.array(y_train)

### Train Model
Variables that need to be set: `batch_size`, `epochs`, `val_split`

In [None]:
# Training hyper-parameters
batch_size, epochs = 128, 30  # Batch size; number of epochs
val_split = 0.1  # Fraction of the training data held out for validation

In [None]:
# Fit the model on the prepared arrays, holding out val_split of the data
# for validation. The returned history records, per epoch, the training
# loss (loss), training accuracy (accuracy), validation loss (val_loss) and
# validation accuracy (val_accuracy).
history = model.fit(
    x=x_train,
    y=y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=val_split,
)

In [None]:
# Draw graphs of accuracy and loss per epoch for the training and
# validation data recorded in history.

# Accuracy curves
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('hangye model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.grid()
# Derive the x-axis ticks from the number of epochs actually recorded rather
# than a hard-coded 31, so the axis stays correct when `epochs` is changed.
plt.xticks(np.arange(0, len(history.history['accuracy']) + 1, 2))
plt.yticks(np.arange(0,1,0.05))
plt.show()

# Loss curves
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('hangye model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.grid()
plt.show()

### Save Model
Variables that need to be set: `save_model_path`

In [None]:
# Write the trained model to disk
save_model_path = "model_hy.h5"  # Path of the model to be saved
model.save(filepath=save_model_path)