测试数据预处理

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import time
import random
from tqdm import tqdm
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import f1_score, roc_auc_score
from collections import Counter

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

Using TensorFlow backend.


In [2]:
# Input locations, relative to the notebook's working directory.
data_dir = "../input/"
train_file = os.path.join(data_dir, "train.csv")
test_file = os.path.join(data_dir, "test.csv")  # only referenced by commented-out code in the cells shown here
# Preprocessing / model hyper-parameters.
embedding_size = 300  # not used in the cells shown here; presumably the embedding dimension — TODO confirm
max_len = 50  # fixed sequence length passed to pad_sequences(maxlen=...)
max_features = 100000  # vocabulary cap passed to Tokenizer(num_words=...)
batch_size = 256  # not used in the cells shown here

In [3]:
# Load the training set only; the matching test-set statements are kept commented out.
train_df = pd.read_csv(train_file)
# test_df = pd.read_csv(test_file)
print("Train shape : ",train_df.shape)
# print("Test shape : ",test_df.shape)

# data cleaning: lower-case the question text
train_df["question_text"] = train_df["question_text"].str.lower()
# test_df["question_text"] = test_df["question_text"].str.lower()

## fill up the missing values
# Missing questions become the placeholder string "_NA_"; .values yields an object array of strings.
train_X = train_df["question_text"].fillna("_NA_").values
# test_X = test_df["question_text"].fillna("_##_").values

# Peek at the first five cleaned questions.
train_X[:5]

Train shape :  (1306122, 3)


array(['how did quebec nationalists see their province as a nation in the 1960s?',
       'do you have an adopted dog, how would you encourage people to adopt and not shop?',
       'why does velocity affect time? does velocity affect space geometry?',
       'how did otto von guericke used the magdeburg hemispheres?',
       'can i convert montra helicon d to a mountain bike by just changing the tyres?'],
      dtype=object)

In [4]:
# # Add 30 stop words to the tokenizer filters (disabled experiment)
# filters = []
# standard_filters = '!"#$%&()*+,-./:;<=>?@[\]^_`{|}~\t\n'
# for s in standard_filters:
#     filters.append(s)
# stop_words = ['does', 'a', 'that', 'to', 'or', 'in', 'if', 'the', 'how', 'can', 'have', 'and', 'of', 'what', 'you', 'be', 'from', 'an',\
#               'why', 'on', 'with', 'which', 'are', 'your', 'do', 'my', 'i', 'is', 'it', 'for']
# filters.extend(stop_words)
# print(filters)

# Build the vocabulary from the training questions, capped via num_words=max_features.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(train_X))
# NOTE: train_X is rebound from raw strings to lists of token ids, so re-running
# this cell on its own would feed token ids back into the tokenizer.
train_X = tokenizer.texts_to_sequences(train_X)

train_Y = train_df['target'].values
# Sum of the target column (the number of rows labelled 1, assuming 0/1 labels).
print(np.sum(train_Y))

# Disabled experiment: drop every token id that is not greater than 40.
# remove_words = []
# for x in train_X:
#     remove_words.append([i for i in x if i>40])
# train_X = remove_words
# Peek at the first five tokenized questions.
train_X[:5]

80810


[[9, 48, 6683, 7219, 158, 55, 6107, 36, 4, 1206, 6, 1, 8333],
 [11, 14, 24, 29, 3864, 498, 9, 35, 14, 3672, 37, 5, 3089, 10, 44, 1846],
 [16, 26, 2002, 374, 70, 26, 2002, 374, 451, 5546],
 [9, 48, 13005, 8284, 52192, 119, 1, 39877, 28269],
 [15, 8, 1130, 42987, 99430, 911, 5, 4, 3133, 1533, 46, 96, 1465, 1, 9340]]

In [5]:
# Fixed seed so the shuffled 5-fold stratified split is reproducible.
DATA_SPLIT_SEED = 2018
# Materialise the fold generator into a list of per-fold index pairs.
# NOTE(review): these indices refer to train_X/train_Y as they are at this point;
# a later cell rebinds both through data_augmentation(), which changes their
# length and order — confirm the splits are never applied to the augmented data.
splits = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=DATA_SPLIT_SEED).split(train_X, train_Y))

In [6]:
# Print every tokenized question that ended up with no tokens at all.
for x in train_X:
    if len(x) != 0:
        continue
    print(x)

[]
[]
[]
[]
[]
[]
[]
[]
[]


In [7]:
# Under-sample the label-0 class and augment the label-1 class.
def data_augmentation(X, Y, under_sample=200000, aug=2):
    """Rebalance a tokenized dataset by dropping label-0 samples and
    adding shuffled copies of label-1 samples.

    Parameters
    ----------
    X : sequence of token-id sequences
        One entry per sample. The entries are never modified.
    Y : sequence of labels, indexable in parallel with ``X``
        Samples whose label equals 1 are augmented; all others are treated
        as label 0 and are under-sampled.
    under_sample : int
        Number of label-0 samples to discard (chosen at random). ``0`` keeps
        all of them; a value larger than the class size discards the class.
    aug : int
        Number of independently shuffled copies generated for every label-1
        sample, in addition to the untouched original.

    Returns
    -------
    (list, numpy.ndarray)
        The kept label-0 samples followed by the label-1 samples (originals
        first, then the shuffled copies), and the matching int32 label
        vector (zeros followed by ones).
    """
    zero_X = []
    one_X = []
    for i, seq in enumerate(X):
        if Y[i] == 1:
            one_X.append(seq)
        else:
            zero_X.append(seq)

    # Shuffle first so that the discarded tail is a random subset.
    random.shuffle(zero_X)
    # Compute the number to keep explicitly: a plain ``[:-under_sample]``
    # slice would return an empty list for under_sample == 0.
    n_keep = max(len(zero_X) - under_sample, 0)
    zero_X = zero_X[:n_keep]

    # random.shuffle works in place, so every augmented sample must be
    # shuffled on its own copy; otherwise the original and all of its
    # "copies" would be one and the same list object.
    augmented = []
    for seq in one_X:
        for _ in range(aug):
            shuffled = list(seq)
            random.shuffle(shuffled)
            augmented.append(shuffled)
    one_X.extend(augmented)

    zero_Y = np.zeros(shape=[len(zero_X)], dtype=np.int32)
    one_Y = np.ones(shape=[len(one_X)], dtype=np.int32)

    return zero_X + one_X, np.append(zero_Y, one_Y)

# Rebind train_X/train_Y to the resampled data (default under_sample and aug);
# the sample count and order no longer match the arrays used to build `splits`.
train_X, train_Y = data_augmentation(train_X, train_Y)

In [21]:
# Walk the sequences with a 1-based running count, printing any empty one.
index = 0
for index, x in enumerate(train_X, start=1):
    if len(x)==0:
        print(x)

[]
[]
[]
[]
[]
[]
[]
[]
[]


TypeError: object of type 'NoneType' has no len()

In [22]:
# 1-based position of the last sequence the preceding loop reached.
index

1106123

In [10]:
# Count the sequences that hold at least 20 entries.
len_num = 0
for x in train_X:
    len_num += len(x) >= 20  # a bool adds as 0 or 1
len_num

38354

In [5]:
# Sanity check with a made-up first word: the recorded output holds only two ids
# for the three input words, so the unknown word appears to be dropped.
tokenizer.texts_to_sequences(np.array(['dsjdhsjhdsdh make love']))

[[62, 143]]

In [11]:
def get_key(dict, value):
    """Return the keys of ``dict`` whose value is less than or equal to ``value``.

    Keys are returned in the mapping's iteration order.
    """
    selected = []
    for key, item in dict.items():
        if item <= value:
            selected.append(key)
    return selected
# Show the words whose tokenizer index is at most 30 (presumably the most frequent ones — confirm against the Keras docs).
print(get_key(tokenizer.word_index, 30))

['does', 'a', 'that', 'to', 'or', 'in', 'if', 'the', 'how', 'can', 'have', 'and', 'of', 'what', 'you', 'be', 'from', 'an', 'why', 'on', 'with', 'which', 'are', 'your', 'do', 'my', 'i', 'is', 'it', 'for']


In [7]:
# Bring every sequence to exactly max_len ids: shorter ones are padded at the end
# (zeros in the recorded output), longer ones are cut at the end.
# train_X is rebound from a list of lists to a 2-D array.
train_X = pad_sequences(train_X, maxlen=max_len, padding="post", truncating="post")
train_X[:5]

array([[    9,    48,  6683,  7219,   158,    55,  6107,    36,     4,
         1206,     6,     1,  8333,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0],
       [   11,    14,    24,    29,  3864,   498,     9,    35,    14,
         3672,    37,     5,  3089,    10,    44,  1846,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0],
       [   16,    26,  2002,   374,    70,    26,  2002,   374,   451,
         5546,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,    

In [8]:
# Replace every token id below 40 with 0 (the same value used for padding);
# positions are kept, so zeros can now appear in the middle of a sequence.
# NOTE(review): 40 is a magic number — the stop-word experiments elsewhere in this
# notebook list 30 words and use a strict `i>40` filter; confirm the intended cut-off.
train_X = np.where(train_X>=40, train_X, 0)
train_X[:5]


array([[    0,    48,  6683,  7219,   158,    55,  6107,     0,     0,
         1206,     0,     0,  8333,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0],
       [    0,     0,     0,     0,  3864,   498,     0,     0,     0,
         3672,     0,     0,  3089,     0,    44,  1846,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0],
       [    0,     0,  2002,   374,    70,     0,  2002,   374,   451,
         5546,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,    