In [1]:
#coding=utf-8
'''
    FileName      ：CNNandLabel.ipynb
    Author        ：@zch0423
    Date          ：Jun 25, 2021
    Description   ：
'''
import pickle
import gensim
import pandas as pd

In [2]:
# Location of the pre-trained word vectors; the file is read with pickle.load below.
# NOTE(review): hard-coded absolute path tied to one machine; adjust before running elsewhere.
w2v_path = "/Users/zch/Desktop/IM319_NLP.nosync/hw/pre_loaded/glove_twitter_200.bin"

In [3]:
# Load the pre-trained word vectors from the pickled file at w2v_path.
# NOTE: pickle.load can execute arbitrary code, so only load files from a trusted source.
with open(w2v_path, "rb") as f:
    # The with-statement closes the file on exit, so no explicit f.close() is needed.
    pre_trained_w2v = pickle.load(f)

In [4]:
def loadData(path):
    '''
    @Description
    Read a CSV file and return its text column
    ------------
    @Params
    path, str, CSV location; the first column is used as the index
    ------------
    @Returns
    X, the `post` column with missing values replaced by empty strings
    '''
    frame = pd.read_csv(path, index_col=0)
    posts = frame.post
    return posts.fillna("")

In [20]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow as tf
import tensorflow.keras as keras
from keras.regularizers import l2
from keras.callbacks import EarlyStopping
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras import initializers
# Seed NumPy and TensorFlow so that results are reproducible
np.random.seed(0)
tf.random.set_seed(0) 
def transform_feature(X_train, X_valid, X_test, MAX_LENGTH, pre_trained_w2v, dim_embedding):
    '''
    @Description
    Convert texts to equal-length index sequences and build the embedding matrix
    ------------
    @Params
    X_train, X_valid, X_test, text data
    MAX_LENGTH, int, upper bound on the padded sequence length
    pre_trained_w2v, pre-trained embeddings, queried with `word in ...` and `...[word]`
    dim_embedding, int, embedding dimension
    ------------
    @Returns
    X_train, X_valid, X_test, index2emb
    '''
    # Keep punctuation (nothing is filtered) and lowercase the text;
    # the vocabulary is fitted on the training texts only.
    tok = Tokenizer(filters='', lower=True)
    tok.fit_on_texts(X_train)
    vocab = tok.index_word  # indices start at 1; index 0 is reserved for <pad>

    splits = (X_train, X_valid, X_test)
    train_seq, valid_seq, test_seq = [tok.texts_to_sequences(texts) for texts in splits]

    def vector_for(word):
        # Pre-trained vector when the word is known, otherwise a small uniform random one.
        if word in pre_trained_w2v:
            return pre_trained_w2v[word]
        return np.random.uniform(-0.05, 0.05, dim_embedding)

    # Row 0 is the all-zero <pad> vector; rows 1..len(vocab) follow the tokenizer indices.
    rows = [np.zeros(dim_embedding)]
    rows.extend(vector_for(vocab[idx]) for idx in range(1, len(vocab) + 1))
    index2emb = np.array(rows)

    # Pad to the longest training sequence, capped at MAX_LENGTH.
    longest_train = max(len(seq) for seq in train_seq)
    seq_len = min(longest_train, MAX_LENGTH)
    padded = [
        pad_sequences(seq, maxlen=seq_len, padding='post')
        for seq in (train_seq, valid_seq, test_seq)
    ]
    return padded[0], padded[1], padded[2], index2emb

In [21]:
# Path template for the SBIC v2 CSV files; %s is replaced by the split name.
# NOTE(review): hard-coded absolute path tied to one machine; adjust before running elsewhere.
path = "/Users/zch/Desktop/IM319_NLP.nosync/project/rawData/preprocessed/SBIC.v2.%s.csv"
# Load the text column of the dev, trn and tst files.
X_valid = loadData(path%"dev")
X_train = loadData(path%"trn")
X_test = loadData(path%"tst")

In [1]:
# Upper bound on the padded sequence length passed to transform_feature.
MAX_LENGTH = 60
# Read the embedding dimension off the vector stored for the word "the".
dim_embedding = len(pre_trained_w2v["the"])
# NOTE(review): this rebinds X_train/X_valid/X_test from raw text to padded index
# arrays, so re-running this cell without re-running the loading cell would feed
# already-transformed data back into the tokenizer.
X_train, X_valid, X_test, index2emb = transform_feature(X_train, X_valid, X_test, MAX_LENGTH, pre_trained_w2v, dim_embedding)

In [23]:
# Directory that receives the saved arrays.
# NOTE(review): hard-coded absolute path tied to one machine; adjust before running elsewhere.
out_dir = "/Users/zch/Desktop/IM319_NLP.nosync/project/data/CNN/"
# Save the padded training sequences.
np.save(out_dir+"X_train.npy", X_train)

In [24]:
# Save the padded validation and test sequences and the embedding matrix
# into out_dir (defined in the previous cell).
np.save(out_dir+"X_valid.npy", X_valid)
np.save(out_dir+"X_test.npy", X_test)
np.save(out_dir+"index2emb.npy", index2emb)

### Process the y labels



In [None]:
# Path template for the saved label arrays; %s is replaced by the split name.
# NOTE(review): hard-coded absolute path tied to one machine; adjust before running elsewhere.
label_path = "/Users/zch/Desktop/IM319_NLP.nosync/project/data/labels/%s.npy"
# Load the label arrays for the dev, trn and tst splits.
dev_label = np.load(label_path%"dev")
trn_label = np.load(label_path%"trn")
tst_label = np.load(label_path%"tst")

In [15]:

def refineLabel(labels):
    '''
    @Description
    Turn raw label values into training targets and per-entry weights
    ------------
    @Params
    labels, array-like of numbers, may contain NaN
    ------------
    @Returns
    targets, weights: two float arrays with the same shape as labels
        targets: NaN -> 0, values > 0 -> 1, every other value unchanged
        weights: NaN -> 0, values == 0 -> 1, every other value unchanged
    '''
    # Always work in float so the output dtype does not depend on the data.
    scores = np.asarray(labels, dtype=float)
    missing = np.isnan(scores)
    # Comparisons against NaN are False; silence the warning older NumPy emits for them.
    with np.errstate(invalid="ignore"):
        positive = scores > 0
        zero = scores == 0
    # NaN is checked first, mirroring the order of the element-wise rules.
    targets = np.where(missing, 0.0, np.where(positive, 1.0, scores))
    weights = np.where(missing, 0.0, np.where(zero, 1.0, scores))
    return targets, weights

In [16]:
# Derive the targets and weights for the dev split.
dev_l, dev_w = refineLabel(dev_label)

In [21]:
def saveLabelandWeight(labels, weights, name="dev",
                       out_dir="/Users/zch/Desktop/IM319_NLP.nosync/project/data/CNN/"):
    '''
    @Description
    Save a label array and its weight array as .npy files
    ------------
    @Params
    labels, array saved as <name>_labels.npy
    weights, array saved as <name>_w.npy
    name, str, file-name prefix, "dev" by default
    out_dir, str, target directory; defaults to the original hard-coded location
    ------------
    @Returns
    None
    '''
    import os  # local import keeps this block self-contained

    np.save(os.path.join(out_dir, name + "_labels.npy"), labels)
    np.save(os.path.join(out_dir, name + "_w.npy"), weights)

In [22]:
# saveLabelandWeight(dev_l, dev_w, name="dev")

In [27]:
# saveLabelandWeight(*refineLabel(trn_label), name="trn")
# saveLabelandWeight(*refineLabel(tst_label), name="tst")

In [28]:
# Load the saved weight array for the trn split (the file name matches what
# saveLabelandWeight writes for name="trn"); presumably for inspection - TODO confirm.
t = np.load("/Users/zch/Desktop/IM319_NLP.nosync/project/data/CNN/trn_w.npy")