In [5]:
import warnings
warnings.filterwarnings('ignore')

## configure & util

In [4]:
class param:
    '''
    there are total 100,000 samples in the train file
    split train data into train' & test' and use train' for cross validation
    '''
    train_num = 75000 
    test_num = 25000
    w2v_dim = 300

    seed = 2017

# record data column names
columns = ['Age', 'Gender', 'Education', 'Queries']

In [6]:
import time

class util:
    @staticmethod
    def log(stri):
        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        print(str(now) + ' ' + str(stri))

## tfidf stacking features

In [4]:
# coding=utf-8
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

############################ 定义评估函数 ############################
def micro_avg_f1(y_true, y_pred):
    return f1_score(y_true, y_pred, average='micro')


############################ 加载数据 ############################
df_all = pd.read_csv('./data/processed_train.csv', encoding='utf8')

############################ tfidf ############################
tfv = TfidfVectorizer(min_df=3, max_df=0.95, sublinear_tf=True)
x_sp = tfv.fit_transform(df_all['Queries'])

In [11]:
x_sp.shape

(100000, 285214)

In [None]:
import param
import util

In [25]:
############################ lr stack ############################
tr_num = param.train_num
num_class = len(pd.value_counts(df_all['Gender']))
n = 5

x = x_sp[:tr_num]
y = df_all['Gender'][:tr_num]
x_te = x_sp[tr_num:]
# y_te = df_all['Gender'][tr_num:]

stack = np.zeros((x.shape[0], num_class))
stack_te = np.zeros((x_te.shape[0], num_class))

skf = StratifiedKFold(n_splits=n, random_state=param.seed)

score_va = 0
# score_te = 0
i = 0
for tr, va in skf.split(x, y):
    util.log('stack:%d/%d' % ((i + 1), n))
    clf = LogisticRegression(C=2)
    clf.fit(x[tr], y[tr])
    y_pred_va = clf.predict_proba(x[va])
    y_pred_te = clf.predict_proba(x_te)
    util.log('va acc:%f' % micro_avg_f1(y[va], clf.predict(x[va])))
#     util.log('te acc:%f' % micro_avg_f1(y_te, clf.predict(x_te)))
    score_va += micro_avg_f1(y[va], clf.predict(x[va]))
#     score_te += micro_avg_f1(y_te, clf.predict(x_te))
    stack[va] += y_pred_va
    stack_te += y_pred_te
    i += 1
score_va /= n
# score_te /= n
util.log('va avg acc:%f' % score_va)
# util.log('te avg acc:%f' % score_te)
stack_te /= n
stack_all = np.vstack([stack, stack_te])
df_stack = pd.DataFrame(index=range(len(df_all)))
for i in range(stack_all.shape[1]):
    df_stack['tfidf_lr_{}'.format(i)] = stack_all[:, i]

df_stack.to_csv('./output/feature/tfidf/lr_prob_21w.csv', index=None, encoding='utf8')



2021-02-09 21:22:00 stack:1/5


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


2021-02-09 21:22:33 va acc:0.798733
2021-02-09 21:22:33 stack:2/5


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


2021-02-09 21:23:09 va acc:0.796600
2021-02-09 21:23:09 stack:3/5


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


2021-02-09 21:23:44 va acc:0.795733
2021-02-09 21:23:44 stack:4/5


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


2021-02-09 21:24:18 va acc:0.799400
2021-02-09 21:24:18 stack:5/5


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


2021-02-09 21:24:52 va acc:0.803800
2021-02-09 21:24:52 va avg acc:0.798853


In [23]:
df_stack.head()

Unnamed: 0,tfidf_lr_0,tfidf_lr_1,tfidf_lr_2
0,0.031184,0.415349,0.553467
1,0.019671,0.893681,0.086648
2,0.008075,0.895271,0.096654
3,0.00942,0.034293,0.956287
4,0.012034,0.051781,0.936184


In [27]:
############################ bnb stack ############################
from sklearn.naive_bayes import BernoulliNB

tr_num = param.train_num
num_class = len(pd.value_counts(df_all['Gender']))
n = 5

x = x_sp[:tr_num]
y = df_all['Gender'][:tr_num]
x_te = x_sp[tr_num:]
# y_te = df_all['Gender'][tr_num:]

stack = np.zeros((x.shape[0], num_class))
stack_te = np.zeros((x_te.shape[0], num_class))

skf = StratifiedKFold(n_splits=n, random_state=param.seed)

score_va = 0
# score_te = 0
i = 0
for tr, va in skf.split(x, y):
    util.log('stack:%d/%d' % ((i + 1), n))
    clf = BernoulliNB()
    clf.fit(x[tr], y[tr])
    y_pred_va = clf.predict_proba(x[va])
    y_pred_te = clf.predict_proba(x_te)
    util.log('va acc:%f' % micro_avg_f1(y[va], clf.predict(x[va])))
#     util.log('te acc:%f' % micro_avg_f1(y_te, clf.predict(x_te)))
    score_va += micro_avg_f1(y[va], clf.predict(x[va]))
#     score_te += micro_avg_f1(y_te, clf.predict(x_te))
    stack[va] += y_pred_va
    stack_te += y_pred_te
    i += 1
score_va /= n
# score_te /= n
util.log('va avg acc:%f' % score_va)
# util.log('te avg acc:%f' % score_te)
stack_te /= n
stack_all = np.vstack([stack, stack_te])
df_stack = pd.DataFrame(index=range(len(df_all)))
for i in range(stack_all.shape[1]):
    df_stack['tfidf_bnb_{}'.format(i)] = stack_all[:, i]

df_stack.to_csv('./output/feature/tfidf/bnb_prob_21w.csv', index=None, encoding='utf8')



2021-02-09 21:29:14 stack:1/5
2021-02-09 21:29:15 va acc:0.797600
2021-02-09 21:29:15 stack:2/5
2021-02-09 21:29:15 va acc:0.798000
2021-02-09 21:29:15 stack:3/5
2021-02-09 21:29:16 va acc:0.793533
2021-02-09 21:29:16 stack:4/5
2021-02-09 21:29:17 va acc:0.801533
2021-02-09 21:29:17 stack:5/5
2021-02-09 21:29:18 va acc:0.802600
2021-02-09 21:29:18 va avg acc:0.798653


In [28]:
############################ mnb stack ############################
from sklearn.naive_bayes import MultinomialNB

tr_num = param.train_num
num_class = len(pd.value_counts(df_all['Gender']))
n = 5

x = x_sp[:tr_num]
y = df_all['Gender'][:tr_num]
x_te = x_sp[tr_num:]
# y_te = df_all['Gender'][tr_num:]

stack = np.zeros((x.shape[0], num_class))
stack_te = np.zeros((x_te.shape[0], num_class))

skf = StratifiedKFold(n_splits=n, random_state=param.seed)

score_va = 0
# score_te = 0
i = 0
for tr, va in skf.split(x, y):
    util.log('stack:%d/%d' % ((i + 1), n))
    clf = MultinomialNB()
    clf.fit(x[tr], y[tr])
    y_pred_va = clf.predict_proba(x[va])
    y_pred_te = clf.predict_proba(x_te)
    util.log('va acc:%f' % micro_avg_f1(y[va], clf.predict(x[va])))
#     util.log('te acc:%f' % micro_avg_f1(y_te, clf.predict(x_te)))
    score_va += micro_avg_f1(y[va], clf.predict(x[va]))
#     score_te += micro_avg_f1(y_te, clf.predict(x_te))
    stack[va] += y_pred_va
    stack_te += y_pred_te
    i += 1
score_va /= n
# score_te /= n
util.log('va avg acc:%f' % score_va)
# util.log('te avg acc:%f' % score_te)
stack_te /= n
stack_all = np.vstack([stack, stack_te])
df_stack = pd.DataFrame(index=range(len(df_all)))
for i in range(stack_all.shape[1]):
    df_stack['tfidf_mnb_{}'.format(i)] = stack_all[:, i]

df_stack.to_csv('./output/feature/tfidf/mnb_prob_21w.csv', index=None, encoding='utf8')



2021-02-09 21:45:07 stack:1/5
2021-02-09 21:45:07 va acc:0.777800
2021-02-09 21:45:07 stack:2/5
2021-02-09 21:45:08 va acc:0.775933
2021-02-09 21:45:08 stack:3/5
2021-02-09 21:45:08 va acc:0.779000
2021-02-09 21:45:08 stack:4/5
2021-02-09 21:45:09 va acc:0.780800
2021-02-09 21:45:09 stack:5/5
2021-02-09 21:45:09 va acc:0.781200
2021-02-09 21:45:09 va avg acc:0.778947


## doc2vec stacking features

### doc2vec preprocess

In [32]:
# coding=utf-8
import codecs
import subprocess
from collections import namedtuple

import numpy as np
import pandas as pd
from gensim.models import Doc2Vec
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression


############################ 加载数据 ############################
df_all = pd.read_csv('./data/processed_train.csv', encoding='utf8').reset_index()


############################ 定义函数、类及变量 ############################
def run_cmd(cmd):
    print(cmd)
    process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    for t, line in enumerate(iter(process.stdout.readline, b'')):
        line = line.decode('utf8').rstrip()
        print(line)
    process.communicate()
    return process.returncode


SentimentDocument = namedtuple('SentimentDocument', 'words tags')


class Doc_list(object):
    def __init__(self, f):
        self.f = f
    def __iter__(self):
        for i,line in enumerate(codecs.open(self.f,encoding='utf8')):
            words = line.strip().split(' ')
            tags = [int(words[0][2:])]
            words = words[1:]
            yield SentimentDocument(words,tags)

In [33]:
############################ 准备数据 ############################
doc_f = codecs.open('./output/corpus/doc_for_d2v_21w.txt', 'w', encoding='utf8')
for i, contents in enumerate(df_all.iloc[:(param.train_num+param.test_num)]['Queries']):
    words = []
    for word in contents.split(' '):
        words.append(word)
    tags = [i]
    if i % 10000 == 0:
        util.log('iter = %d' % i)
    doc_f.write(u'_*{} {}\n'.format(i, ' '.join(words)))
doc_f.close()

2021-02-09 22:13:37 iter = 0
2021-02-09 22:13:38 iter = 10000
2021-02-09 22:13:39 iter = 20000
2021-02-09 22:13:40 iter = 30000
2021-02-09 22:13:41 iter = 40000
2021-02-09 22:13:42 iter = 50000
2021-02-09 22:13:43 iter = 60000
2021-02-09 22:13:44 iter = 70000
2021-02-09 22:13:45 iter = 80000
2021-02-09 22:13:46 iter = 90000


### dbow stacking doc2vec

In [34]:
############################ dbow d2v ############################
d2v = Doc2Vec(dm=0, size=300, negative=5, hs=0, min_count=3, window=30, sample=1e-5, workers=8, alpha=0.025, min_alpha=0.025)
doc_list = Doc_list('./output/corpus/doc_for_d2v_21w.txt')
d2v.build_vocab(doc_list)

df_lb = df_all['Gender']

# use fewer iterations
for i in range(5):
    util.log('pass: ' + str(i))
    #     run_cmd('shuf alldata-id.txt > alldata-id-shuf.txt')
    doc_list = Doc_list('./output/corpus/doc_for_d2v_21w.txt')
    d2v.train(doc_list, total_examples=d2v.corpus_count, epochs=d2v.iter)
    X_d2v = np.array([d2v.docvecs[i] for i in range(param.train_num+param.test_num)])
    scores = cross_val_score(LogisticRegression(C=3), X_d2v, df_lb, cv=5)
    util.log('dbow: ' + str(scores) + ' ' + str(np.mean(scores)))
d2v.save('./output/model/dbow_d2v_21w.model')
util.log('Save done!')

2021-02-09 22:20:01 pass: 0
2021-02-09 22:24:12 dbow: [0.8097  0.80665 0.81315 0.81525 0.813  ] 0.8115499999999999
2021-02-09 22:24:12 pass: 1
2021-02-09 22:28:14 dbow: [0.8168 0.812  0.8173 0.8208 0.8167] 0.8167199999999999
2021-02-09 22:28:14 pass: 2
2021-02-09 22:32:40 dbow: [0.81725 0.81195 0.81755 0.82075 0.81955] 0.81741
2021-02-09 22:32:40 pass: 3
2021-02-09 22:37:04 dbow: [0.81545 0.8133  0.81645 0.8206  0.8196 ] 0.8170800000000001
2021-02-09 22:37:04 pass: 4
2021-02-09 22:41:37 dbow: [0.81765 0.8118  0.81805 0.8221  0.8197 ] 0.8178599999999999


AttributeError: type object 'param' has no attribute 'data_path'

In [35]:
d2v.save('./output/model/dbow_d2v_21w.model')
util.log('Save done!')

2021-02-09 22:43:10 Save done!


In [42]:
# coding=utf-8
import numpy as np
import pandas as pd
from gensim.models import Doc2Vec
from keras.layers.core import Dense, Dropout, Activation
from keras.models import Sequential
from keras.utils import np_utils
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score


############################ 定义评估函数 ############################
def micro_avg_f1(y_true, y_pred):
    return f1_score(y_true, y_pred, average='micro')


############################ 加载数据 ############################
df_all = pd.read_csv('./data/processed_train.csv', encoding='utf8')

model = Doc2Vec.load('./output/model/dbow_d2v_21w.model')
x_sp = np.array([model.docvecs[i] for i in range(param.train_num+param.test_num)])


############################ dbowd2v stack ############################
np.random.seed(param.seed) # 固定种子，方便复现
df_stack = pd.DataFrame(index=range(len(df_all)))
tr_num = param.train_num
num_class = len(pd.value_counts(df_all['Gender']))
n = 5

x = x_sp[:tr_num]
y = df_all['Gender'][:tr_num]
x_te = x_sp[tr_num:]
y_te = df_all['Gender'][tr_num:]

feat = 'dbowd2v'
stack = np.zeros((x.shape[0], num_class))
stack_te = np.zeros((x_te.shape[0], num_class))

skf = StratifiedKFold(n_splits=n, random_state=param.seed)

score_va = 0
score_te = 0
i = 0
for tr, va in skf.split(x, y):
    util.log('stack:%d/%d' % ((i + 1), n))
    y_train = np_utils.to_categorical(y[tr], num_class)
    y_test = np_utils.to_categorical(y_te, num_class)
    model = Sequential()
    model.add(Dense(300, input_shape=(x[tr].shape[1],)))
    model.add(Dropout(0.1))
    model.add(Activation('tanh'))
    model.add(Dense(num_class))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='adadelta',
                  metrics=['accuracy'])
    history = model.fit(x[tr], y_train, shuffle=True,
                        batch_size=128, epochs=35,
                        verbose=2)
    y_pred_va = model.predict_proba(x[va])
    y_pred_te = model.predict_proba(x_te)
    util.log('va acc:%f' % micro_avg_f1(y[va], model.predict_classes(x[va])))
    util.log('te acc:%f' % micro_avg_f1(y_te, model.predict_classes(x_te)))
    score_va += micro_avg_f1(y[va], model.predict_classes(x[va]))
    score_te += micro_avg_f1(y_te, model.predict_classes(x_te))
    stack[va] += y_pred_va
    stack_te += y_pred_te
    i += 1
score_va /= n
score_te /= n
util.log('va avg acc:%f' % score_va)
util.log('te avg acc:%f' % score_te)
stack_te /= n
stack_all = np.vstack([stack, stack_te])
for l in range(stack_all.shape[1]):
    df_stack['{}_{}'.format(feat, l)] = stack_all[:, l]

df_stack.to_csv('./output/feature/dbowd2v/nn_prob_21w.csv', encoding='utf8', index=None)
util.log('Save dbowd2v stack done!')

2021-02-10 00:12:43 stack:1/5
Epoch 1/35
469/469 - 2s - loss: 1.1389 - accuracy: 0.3710
Epoch 2/35
469/469 - 1s - loss: 1.0051 - accuracy: 0.4667
Epoch 3/35
469/469 - 1s - loss: 0.9089 - accuracy: 0.5303
Epoch 4/35
469/469 - 1s - loss: 0.8433 - accuracy: 0.5710
Epoch 5/35
469/469 - 1s - loss: 0.7957 - accuracy: 0.6002
Epoch 6/35
469/469 - 1s - loss: 0.7605 - accuracy: 0.6243
Epoch 7/35
469/469 - 1s - loss: 0.7319 - accuracy: 0.6452
Epoch 8/35
469/469 - 1s - loss: 0.7101 - accuracy: 0.6630
Epoch 9/35
469/469 - 1s - loss: 0.6906 - accuracy: 0.6789
Epoch 10/35
469/469 - 1s - loss: 0.6739 - accuracy: 0.6956
Epoch 11/35
469/469 - 1s - loss: 0.6596 - accuracy: 0.7084
Epoch 12/35
469/469 - 1s - loss: 0.6473 - accuracy: 0.7189
Epoch 13/35
469/469 - 1s - loss: 0.6365 - accuracy: 0.7291
Epoch 14/35
469/469 - 1s - loss: 0.6272 - accuracy: 0.7355
Epoch 15/35
469/469 - 1s - loss: 0.6190 - accuracy: 0.7442
Epoch 16/35
469/469 - 1s - loss: 0.6111 - accuracy: 0.7519
Epoch 17/35
469/469 - 1s - loss: 0.

Epoch 30/35
469/469 - 1s - loss: 0.5599 - accuracy: 0.7904
Epoch 31/35
469/469 - 1s - loss: 0.5581 - accuracy: 0.7918
Epoch 32/35
469/469 - 1s - loss: 0.5562 - accuracy: 0.7941
Epoch 33/35
469/469 - 1s - loss: 0.5549 - accuracy: 0.7952
Epoch 34/35
469/469 - 1s - loss: 0.5536 - accuracy: 0.7953
Epoch 35/35
469/469 - 1s - loss: 0.5513 - accuracy: 0.7975
2021-02-10 00:15:14 va acc:0.800933
2021-02-10 00:15:14 te acc:0.805240
2021-02-10 00:15:15 stack:5/5
Epoch 1/35
469/469 - 1s - loss: 1.2053 - accuracy: 0.3307
Epoch 2/35
469/469 - 1s - loss: 1.0277 - accuracy: 0.4612
Epoch 3/35
469/469 - 1s - loss: 0.9106 - accuracy: 0.5397
Epoch 4/35
469/469 - 1s - loss: 0.8366 - accuracy: 0.5805
Epoch 5/35
469/469 - 1s - loss: 0.7874 - accuracy: 0.6096
Epoch 6/35
469/469 - 1s - loss: 0.7516 - accuracy: 0.6315
Epoch 7/35
469/469 - 1s - loss: 0.7236 - accuracy: 0.6513
Epoch 8/35
469/469 - 1s - loss: 0.7028 - accuracy: 0.6673
Epoch 9/35
469/469 - 1s - loss: 0.6835 - accuracy: 0.6849
Epoch 10/35
469/469 - 

### dm stacking doc2vec

In [36]:
############################ dm d2v ############################
d2v = Doc2Vec(dm=1, size=300, negative=5, hs=0, min_count=3, window=30, sample=1e-5, workers=8, alpha=0.025, min_alpha=0.025)
doc_list = Doc_list('./output/corpus/doc_for_d2v_21w.txt')
d2v.build_vocab(doc_list)

df_lb = df_all['Gender']

# change from 10 to 5
for i in range(5):
    util.log('pass: ' + str(i))
    #     run_cmd('shuf alldata-id.txt > alldata-id-shuf.txt')
    doc_list = Doc_list('./output/corpus/doc_for_d2v_21w.txt')
    d2v.train(doc_list, total_examples=d2v.corpus_count, epochs=d2v.iter)
    X_d2v = np.array([d2v.docvecs[i] for i in range(param.train_num+param.test_num)])
    scores = cross_val_score(LogisticRegression(C=3), X_d2v, df_lb, cv=5)
    util.log('dm: ' + str(scores) + ' ' + str(np.mean(scores)))
d2v.save('./output/model/dm_d2v_21w.model')
util.log('Save done!')

2021-02-09 22:45:32 pass: 0
2021-02-09 22:52:43 dm: [0.7127  0.72995 0.74365 0.73345 0.72595] 0.72914
2021-02-09 22:52:43 pass: 1
2021-02-09 23:10:41 dm: [0.73065 0.7332  0.74545 0.74175 0.73335] 0.73688
2021-02-09 23:10:41 pass: 2
2021-02-09 23:17:13 dm: [0.74635 0.74145 0.7538  0.7507  0.7435 ] 0.74716
2021-02-09 23:17:13 pass: 3
2021-02-09 23:23:54 dm: [0.7534  0.74835 0.76115 0.7577  0.75225] 0.75457
2021-02-09 23:23:54 pass: 4
2021-02-09 23:31:09 dm: [0.75815 0.7551  0.767   0.76385 0.7595 ] 0.7607200000000001
2021-02-09 23:31:18 Save done!


In [44]:
# coding=utf-8
import numpy as np
import pandas as pd
from gensim.models import Doc2Vec
from keras.layers.core import Dense, Dropout, Activation
from keras.models import Sequential
from keras.utils import np_utils
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score


############################ 定义评估函数 ############################
def micro_avg_f1(y_true, y_pred):
    return f1_score(y_true, y_pred, average='micro')


############################ 加载数据 ############################
df_all = pd.read_csv('./data/processed_train.csv', encoding='utf8')

model = Doc2Vec.load('./output/model/dm_d2v_21w.model')
x_sp = np.array([model.docvecs[i] for i in range(param.train_num+param.test_num)])

############################ dmd2v stack ############################
np.random.seed(param.seed) # 固定种子，方便复现
df_stack = pd.DataFrame(index=range(len(df_all)))
tr_num = param.train_num
num_class = len(pd.value_counts(df_all['Gender']))
n = 5

x = x_sp[:tr_num]
y = df_all['Gender'][:tr_num]
x_te = x_sp[tr_num:]
y_te = df_all['Gender'][tr_num:]

feat = 'dmd2v'
stack = np.zeros((x.shape[0], num_class))
stack_te = np.zeros((x_te.shape[0], num_class))

skf = StratifiedKFold(n_splits=n, random_state=param.seed)

score_va = 0
score_te = 0
i = 0
for tr, va in skf.split(x, y):
    util.log('stack:%d/%d' % ((i + 1), n))
    y_train = np_utils.to_categorical(y[tr], num_class)
    y_test = np_utils.to_categorical(y_te, num_class)
    model = Sequential()
    model.add(Dense(300, input_shape=(x[tr].shape[1],)))
    model.add(Dropout(0.1))
    model.add(Activation('tanh'))
    model.add(Dense(num_class))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='adadelta',
                  metrics=['accuracy'])
    history = model.fit(x[tr], y_train, shuffle=True,
                        batch_size=128, epochs=35,
                        verbose=2)
    y_pred_va = model.predict_proba(x[va])
    y_pred_te = model.predict_proba(x_te)
    util.log('va acc:%f' % micro_avg_f1(y[va], model.predict_classes(x[va])))
    util.log('te acc:%f' % micro_avg_f1(y_te, model.predict_classes(x_te)))
    score_va += micro_avg_f1(y[va], model.predict_classes(x[va]))
    score_te += micro_avg_f1(y_te, model.predict_classes(x_te))
    stack[va] += y_pred_va
    stack_te += y_pred_te
    i += 1
score_va /= n
score_te /= n
util.log('va avg acc:%f' % score_va)
util.log('te avg acc:%f' % score_te)
stack_te /= n
stack_all = np.vstack([stack, stack_te])
for l in range(stack_all.shape[1]):
    df_stack['{}_{}'.format(feat, l)] = stack_all[:, l]

df_stack.to_csv('./output/feature/dmd2v/nn_prob_21w.csv', encoding='utf8', index=None)
util.log('Save dmd2v stack done!')

2021-02-10 00:19:55 stack:1/5
Epoch 1/35
469/469 - 1s - loss: 1.1844 - accuracy: 0.3887
Epoch 2/35
469/469 - 1s - loss: 1.0095 - accuracy: 0.4839
Epoch 3/35
469/469 - 1s - loss: 0.9157 - accuracy: 0.5329
Epoch 4/35
469/469 - 1s - loss: 0.8619 - accuracy: 0.5601
Epoch 5/35
469/469 - 1s - loss: 0.8279 - accuracy: 0.5770
Epoch 6/35
469/469 - 1s - loss: 0.8008 - accuracy: 0.5950
Epoch 7/35
469/469 - 1s - loss: 0.7795 - accuracy: 0.6090
Epoch 8/35
469/469 - 1s - loss: 0.7631 - accuracy: 0.6185
Epoch 9/35
469/469 - 1s - loss: 0.7479 - accuracy: 0.6334
Epoch 10/35
469/469 - 1s - loss: 0.7364 - accuracy: 0.6414
Epoch 11/35
469/469 - 1s - loss: 0.7230 - accuracy: 0.6520
Epoch 12/35
469/469 - 1s - loss: 0.7132 - accuracy: 0.6591
Epoch 13/35
469/469 - 1s - loss: 0.7038 - accuracy: 0.6695
Epoch 14/35
469/469 - 1s - loss: 0.6968 - accuracy: 0.6745
Epoch 15/35
469/469 - 1s - loss: 0.6897 - accuracy: 0.6810
Epoch 16/35
469/469 - 1s - loss: 0.6828 - accuracy: 0.6871
Epoch 17/35
469/469 - 1s - loss: 0.

Epoch 30/35
469/469 - 1s - loss: 0.6388 - accuracy: 0.7280
Epoch 31/35
469/469 - 1s - loss: 0.6378 - accuracy: 0.7272
Epoch 32/35
469/469 - 2s - loss: 0.6363 - accuracy: 0.7298
Epoch 33/35
469/469 - 1s - loss: 0.6349 - accuracy: 0.7315
Epoch 34/35
469/469 - 1s - loss: 0.6339 - accuracy: 0.7306
Epoch 35/35
469/469 - 1s - loss: 0.6312 - accuracy: 0.7328
2021-02-10 00:22:38 va acc:0.742467
2021-02-10 00:22:39 te acc:0.738520
2021-02-10 00:22:40 stack:5/5
Epoch 1/35
469/469 - 2s - loss: 1.1194 - accuracy: 0.4228
Epoch 2/35
469/469 - 1s - loss: 0.9756 - accuracy: 0.5009
Epoch 3/35
469/469 - 1s - loss: 0.8971 - accuracy: 0.5415
Epoch 4/35
469/469 - 1s - loss: 0.8500 - accuracy: 0.5653
Epoch 5/35
469/469 - 1s - loss: 0.8194 - accuracy: 0.5832
Epoch 6/35
469/469 - 1s - loss: 0.7939 - accuracy: 0.5993
Epoch 7/35
469/469 - 1s - loss: 0.7742 - accuracy: 0.6142
Epoch 8/35
469/469 - 1s - loss: 0.7580 - accuracy: 0.6257
Epoch 9/35
469/469 - 1s - loss: 0.7457 - accuracy: 0.6354
Epoch 10/35
469/469 - 

## word2vec features

In [37]:
# coding=utf-8
from collections import defaultdict
import pandas as pd
from gensim.models import Word2Vec


############################ 加载数据 ############################
df_all = pd.read_csv('./data/processed_train.csv', encoding='utf8')


############################ w2v ############################
documents = df_all['Queries'].values
util.log('documents number %d' % len(documents))

texts = [[word for word in document.split(' ')] for document in documents]
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1
texts = [[token for token in text if frequency[token] >= 5] for text in texts]

util.log('Train Model...')
w2v = Word2Vec(texts, size=param.w2v_dim, window=5, iter=15, workers=12, seed=param.seed)
w2v.save('./output/model/w2v_21w.model')
util.log('Save done!')

2021-02-09 23:36:30 documents number 100000
2021-02-09 23:37:14 Train Model...
2021-02-09 23:51:17 Save done!


In [39]:
# coding=utf-8
from collections import defaultdict
import numpy as np
import pandas as pd
from gensim.models import Word2Vec


############################ 加载数据 & 模型 ############################
df_all = pd.read_csv('./data/processed_train.csv', encoding='utf8')
documents = df_all['Queries'].values
texts = [[word for word in document.split(' ')] for document in documents]
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1
texts = [[token for token in text if frequency[token] >= 5] for text in texts]

model = Word2Vec.load('./output/model/w2v_21w.model')


############################ w2v ############################
util.log('Start get w2v feat..')
w2v_feat = np.zeros((len(texts), param.w2v_dim))
w2v_feat_avg = np.zeros((len(texts), param.w2v_dim))
i = 0
for line in texts:
    num = 0
    for word in line:
        num += 1
        vec = model[word]
        w2v_feat[i, :] += vec
    w2v_feat_avg[i, :] = w2v_feat[i, :] / num
    i += 1
    if i % 10000 == 0:
        util.log(i)

df_w2v = pd.DataFrame(w2v_feat)
df_w2v.columns = ['w2v_' + str(i) for i in df_w2v.columns]
df_w2v.to_csv('./output/feature/w2v/w2v_21w.csv', encoding='utf8', index=None)
df_w2v_avg = pd.DataFrame(w2v_feat_avg)
df_w2v_avg.columns = ['w2v_avg_' + str(i) for i in df_w2v_avg.columns]
df_w2v_avg.to_csv('./output/feature/w2v/w2v_avg_21w.csv', encoding='utf8', index=None)

util.log('Save w2v and w2v_avg feat done!')

2021-02-09 23:53:25 Start get w2v feat..
2021-02-09 23:53:29 1000
2021-02-09 23:53:32 2000
2021-02-09 23:53:35 3000
2021-02-09 23:53:38 4000
2021-02-09 23:53:41 5000
2021-02-09 23:53:43 6000
2021-02-09 23:53:47 7000
2021-02-09 23:53:50 8000
2021-02-09 23:53:53 9000
2021-02-09 23:53:56 10000
2021-02-09 23:53:59 11000
2021-02-09 23:54:02 12000
2021-02-09 23:54:05 13000
2021-02-09 23:54:08 14000
2021-02-09 23:54:11 15000
2021-02-09 23:54:15 16000
2021-02-09 23:54:18 17000
2021-02-09 23:54:21 18000
2021-02-09 23:54:24 19000
2021-02-09 23:54:26 20000
2021-02-09 23:54:29 21000
2021-02-09 23:54:32 22000
2021-02-09 23:54:36 23000
2021-02-09 23:54:40 24000
2021-02-09 23:54:44 25000
2021-02-09 23:54:48 26000
2021-02-09 23:54:51 27000
2021-02-09 23:54:54 28000
2021-02-09 23:54:57 29000
2021-02-09 23:55:00 30000
2021-02-09 23:55:02 31000
2021-02-09 23:55:05 32000
2021-02-09 23:55:08 33000
2021-02-09 23:55:10 34000
2021-02-09 23:55:14 35000
2021-02-09 23:55:17 36000
2021-02-09 23:55:19 37000
2021-0

FileNotFoundError: [Errno 2] No such file or directory: './output/feature/w2v/w2v_21w.csv'

In [15]:
# coding=utf-8
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import f1_score


############################ 定义评估函数 ############################
def micro_avg_f1(preds, dtrain):
    print(preds)
    y_true = dtrain.get_label()
    return 'micro_avg_f1', f1_score(y_true, preds, average='micro')

# def micro_avg_f1(y_true, y_pred):
#     return f1_score(y_true, y_pred, average='micro')
# def micro_avg_f1(preds, dtrain):
#     y_true = dtrain.get_label()
#     return f1_score(y_true, preds, average='micro')


############################ 加载特征 & 标签 ############################
df_tfidf_lr = pd.read_csv('./output/feature/tfidf/lr_prob_21w.csv')
df_tfidf_bnb = pd.read_csv('./output/feature/tfidf/bnb_prob_21w.csv')
df_tfidf_mnb = pd.read_csv('./output/feature/tfidf/mnb_prob_21w.csv')
df_dbow_nn = pd.read_csv('./output/feature/dbowd2v/nn_prob_21w.csv')
df_dm_nn = pd.read_csv('./output/feature/dmd2v/nn_prob_21w.csv')
df_w2v = pd.read_csv('./output/feature/w2v/w2v_21w.csv')

df_lb = pd.read_csv('./data/processed_train.csv', usecols=['ID', 'Gender'])
print(df_lb.head())

                                 ID  Gender
0  22DD920316420BE2DF8D6EE651BA174B       1
1  43CC3AF5A8D6430A3B572337A889AFE4       1
2  E97654BFF5570E2CCD433EA6128EAC19       1
3  6931EFC26D229CCFCEA125D3F3C21E57       2
4  E780470C3BB0D340334BD08CDCC3C71A       2


In [23]:
############################ xgboost ############################
tr_num = param.train_num
df_sub = pd.DataFrame()
df_sub['ID'] = df_lb.iloc[tr_num:]['ID']
seed = param.seed

# n_trees = 1086  ##### ! #####
n_trees = 10
# esr = 100
evals = 10

df = pd.concat([df_tfidf_lr, df_tfidf_bnb, df_tfidf_mnb, df_dbow_nn, df_w2v], axis=1)
print(df.columns)
num_class = len(pd.value_counts(df_lb['Gender']))
x = df.iloc[:tr_num]
y = df_lb['Gender'][:tr_num].astype(int)
x_te = df.iloc[tr_num:]
y_te = df_lb['Gender'][tr_num:].astype(int)

print(x.shape)
print(y.shape)
print(y.unique())
print(y_te.unique())

max_depth = 7
min_child_weight = 1
subsample = 0.8
colsample_bytree = 0.8
gamma = 1
lam = 0

params = {
    'objective': 'multi:softmax',
#      'objective': 'multi:softprob',
    'booster': 'gbtree',
    'stratified': True,
    'num_class': num_class,
    'max_depth': max_depth,
    'min_child_weight': min_child_weight,
    'subsample': subsample,
    'colsample_bytree': colsample_bytree,
#     'gamma': gamma,
#     'lambda': lam,
    
    'eta': 0.02,
    'silent': 1,
    'seed': seed,
}

Index(['tfidf_lr_0', 'tfidf_lr_1', 'tfidf_lr_2', 'tfidf_bnb_0', 'tfidf_bnb_1',
       'tfidf_bnb_2', 'tfidf_mnb_0', 'tfidf_mnb_1', 'tfidf_mnb_2', 'dbowd2v_0',
       ...
       'w2v_290', 'w2v_291', 'w2v_292', 'w2v_293', 'w2v_294', 'w2v_295',
       'w2v_296', 'w2v_297', 'w2v_298', 'w2v_299'],
      dtype='object', length=312)
(75000, 312)
(75000,)
[1 2 0]
[1 2 0]


0        1
1        1
2        1
3        2
4        2
        ..
74995    1
74996    2
74997    2
74998    2
74999    2
Name: Gender, Length: 75000, dtype: int64

In [19]:
util.log('start to train xgb...')
dtrain = xgb.DMatrix(x, label=y)
dtest = xgb.DMatrix(x_te)
watchlist = [(dtrain, 'train')]
# bst = xgb.train(params, dtrain, n_trees, evals=watchlist, feval=micro_avg_f1, maximize=True,
#                 verbose_eval=evals)
bst = xgb.train(params, dtrain, n_trees, evals=watchlist, maximize=True,
                verbose_eval=evals)
# df_sub['Gender'] = bst.predict(dtest).astype(int)
# df_sub['ID'] = df_sub['ID'].astype(str)
# df_sub.to_json(param.data_path + '/output/result/1209-xgb-tfidf_lr_bnb_mnb+amt+dbowd2v_nn+w2v-r' + str(n_trees) + '.json', orient='records', lines=True)

2021-02-11 08:58:01 start to train xgb...
Parameters: { silent, stratified } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-mlogloss:1.08118
[9]	train-mlogloss:0.94817


ValueError: Wrong number of items passed 3, placement implies 1

In [None]:
micro_avg_f1(y_te, df_sub['Gender'])

In [None]:
# pred = bst.predict(xg_test)
# error_rate = np.sum(pred != test_Y) / test_Y.shape[0]
# print('Test error using softmax = {}'.format(error_rate))