In [None]:
# -*- coding: utf-8 -*-

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
import time

# Build TF-IDF features from the word-segmented text of the train and test
# sets and pickle them as an (x_train, y_train, x_test) tuple.
t_start = time.time()

# Location of the raw CSVs and of the pickled output.
DATA_DIR = 'C:/daguan_text_classification/new_data'
TFIDF_OUT_PATH = './data_tfidf.pkl'

# Only the word-level column ('word_seg') is used as a feature source, so the
# 'article' column is dropped immediately after loading.
df_train = pd.read_csv(DATA_DIR + '/train_set.csv').drop(columns='article')
df_test = pd.read_csv(DATA_DIR + '/test_set.csv').drop(columns='article')

# Labels are shifted down by one (presumably 1-based class ids -> 0-based;
# confirm against the data description).
y_train = (df_train['class'] - 1).values

# TF-IDF over unigrams and bigrams. The vocabulary and IDF weights are learned
# from the training text only; the test text is transformed with that fitted
# vectorizer.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=3, max_df=0.9, sublinear_tf=True)
x_train = vectorizer.fit_transform(df_train['word_seg'])
x_test = vectorizer.transform(df_test['word_seg'])

# Persist the features; the context manager closes the file even if dump() raises.
data = (x_train, y_train, x_test)
with open(TFIDF_OUT_PATH, 'wb') as fp:
    pickle.dump(data, fp)

t_end = time.time()
print("已将原始数据数字化为tfidf特征，共耗时：{}min".format((t_end-t_start)/60))

LSA features

In [None]:

from sklearn.decomposition import TruncatedSVD
import pickle
import time

# Reduce pickled TF-IDF features to LSA features with truncated SVD and
# pickle the result as an (x_train, y_train, x_test) tuple.
t_start = time.time()

# Fixed seed: TruncatedSVD's default solver is randomized, so without a seed
# the components differ from run to run.
RANDOM_SEED = 42
LSA_N_COMPONENTS = 200

# Load the TF-IDF features.
# NOTE(review): no cell visible in this notebook writes this exact file name —
# confirm which step produces it.
tfidf_path = './data_tfidf_selected_lsvc_l2_143w.pkl'
with open(tfidf_path, 'rb') as f_tfidf:
    x_train, y_train, x_test = pickle.load(f_tfidf)

# Dimensionality reduction: fit on the training matrix, then project the test
# matrix with the same fitted components.
print("lsa......")
lsa = TruncatedSVD(n_components=LSA_N_COMPONENTS, random_state=RANDOM_SEED)
x_train = lsa.fit_transform(x_train)
x_test = lsa.transform(x_test)

# Persist the reduced features; the context manager closes the file even if
# dump() raises.
data = (x_train, y_train, x_test)
with open('./data_s_lsvc_l2_143w_lsa.pkl', 'wb') as f_data:
    pickle.dump(data, f_data)

t_end = time.time()
print("lsa特征完成，共耗时：{}min".format((t_end-t_start)/60))

LDA features

In [None]:
# -*- coding: utf-8 -*-

#@brief : 将tf特征降维为lda特征，并将结果保存至本地


from sklearn.decomposition import LatentDirichletAllocation
import pickle
import time

# Reduce pickled term-frequency features to LDA topic features and pickle the
# result as an (x_train, y_train, x_test) tuple.
t_start = time.time()

# Fixed seed so the fitted topic model is reproducible across runs.
RANDOM_SEED = 42
LDA_N_COMPONENTS = 200

# 1. Load the tf features.
# NOTE(review): no cell visible in this notebook writes this file — confirm
# which step produces it.
tf_path = './tf_select_LSVC_l2644235.pkl'
with open(tf_path, 'rb') as f_tf:
    x_train, y_train, x_test = pickle.load(f_tf)

# 2. Dimensionality reduction: fit LDA on the training matrix, then transform
#    the test matrix with the same fitted model.
print("lda......")
lda = LatentDirichletAllocation(n_components=LDA_N_COMPONENTS, random_state=RANDOM_SEED)
x_train = lda.fit_transform(x_train)
x_test = lda.transform(x_test)

# 3. Save the LDA features locally; the context manager closes the file even
#    if dump() raises.
data = (x_train, y_train, x_test)
with open('./data_lda.pkl', 'wb') as f_data:
    pickle.dump(data, f_data)

t_end = time.time()
print("lda特征完成，共耗时：{}min".format((t_end-t_start)/60))


NMF features

In [None]:
# -*- coding: utf-8 -*-

#@brief : 将tfidf特征降维为nmf特征，并将结果保存至本地

from sklearn.decomposition import NMF
import pickle
import time

# Reduce pickled TF-IDF features to NMF features and pickle the result as an
# (x_train, y_train, x_test) tuple next to the input file.
t_start = time.time()

# Fixed seed so the factorization is reproducible across runs.
RANDOM_SEED = 42

# Load the TF-IDF features.
# NOTE(review): no cell visible in this notebook writes this file — confirm
# which step produces it.
tfidf_path = './word_seg_tfidf_(1, 3)-2036592-616882-192632-62375.pkl'
with open(tfidf_path, 'rb') as f_tfidf:
    x_train, y_train, x_test = pickle.load(f_tfidf)

# Dimensionality reduction: fit NMF on the training matrix, then transform the
# test matrix with the same fitted model.
num_features = 200
nmf = NMF(n_components=num_features, random_state=RANDOM_SEED)
x_train = nmf.fit_transform(x_train)
x_test = nmf.transform(x_test)

# Save the NMF features locally. The output name is the input path with its
# 4-character '.pkl' suffix replaced by '-nmf.pkl'.
data = (x_train, y_train, x_test)
data_path = tfidf_path[:-4] + '-nmf.pkl'
with open(data_path, 'wb') as f_data:
    pickle.dump(data, f_data)

# The end time is recorded, but this cell does not print progress or timing.
t_end = time.time()


In [None]:
# -*- coding: utf-8 -*-
"""
@brief : 将原始数据数字化为doc2vec特征，并将结果保存至本地
@author: Jian
"""
import pandas as pd
import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import time
import pickle

# Start the wall-clock timer; it is read again at the end of the cell to
# report the elapsed minutes.
t_start = time.time()

"""=====================================================================================================================
0 辅助函数 
"""
# Section 0: helper functions.


def sentence2list(sentence):
    """Split a whitespace-separated sentence into a list of tokens.

    Args:
        sentence: Text whose tokens are separated by whitespace.

    Returns:
        list: The tokens in their original order, with surrounding
        whitespace removed; an empty list for blank input.
    """
    return sentence.strip().split()


"""=====================================================================================================================
1 读取原始数据，并进行简单处理
"""
# Load both CSVs; the 'article' column is dropped because only 'word_seg' is
# used below.
df_train = pd.read_csv('../data/train_set.csv').drop(columns='article')
df_test = pd.read_csv('../data/test_set.csv').drop(columns='article')
# Stack train on top of test so one Doc2Vec model is trained on both. The row
# order (all train rows, then all test rows) is what the split below relies on.
df_all = pd.concat(objs=[df_train, df_test], axis=0, sort=True)
# Labels are shifted down by one (presumably 1-based class ids -> 0-based;
# confirm against the data description).
y_train = (df_train['class'] - 1).values

# Tokenize every document into a list of words.
df_all['word_list'] = df_all['word_seg'].apply(sentence2list)
texts = df_all['word_list'].tolist()

# ----------------------------------------------------------------------------
# 2. doc2vec
# ----------------------------------------------------------------------------
# Each document is tagged with its integer position in `texts`, so tag i is
# the i-th row of the stacked frame.
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(texts)]
model = Doc2Vec(documents, vector_size=200, window=5, min_count=3, workers=4, epochs=25)
# gensim 4 exposes the document vectors as `model.dv`; older releases only
# have `model.docvecs`. Prefer the new name when it exists.
docvecs = model.dv if hasattr(model, 'dv') else model.docvecs

# Split the learned vectors back into train / test using the actual row
# counts instead of hard-coded dataset sizes.
n_train = len(df_train)
n_all = len(texts)
x_train = np.array([docvecs[i] for i in range(n_train)])
x_test = np.array([docvecs[j] for j in range(n_train, n_all)])

# ----------------------------------------------------------------------------
# 3. Save the doc2vec features locally
# ----------------------------------------------------------------------------
# The context manager closes the file even if dump() raises.
data = (x_train, y_train, x_test)
with open('./data_doc2vec_25.pkl', 'wb') as f_data:
    pickle.dump(data, f_data)

t_end = time.time()
print("已将原始数据数字化为doc2vec特征，共耗时：{}min".format((t_end-t_start)/60))

LDA + LSA + doc2vec features

In [None]:
# -*- coding: utf-8 -*-

#@brief : lda/lsa/doc2vec三种特征进行特征融合，并将结果保存至本地

import numpy as np
import pickle
import time

# Concatenate the LDA, LSA and doc2vec feature matrices column-wise and pickle
# the fused result as an (x_train, y_train, x_test) tuple.
t_start = time.time()

# ----------------------------------------------------------------------------
# 1. Load the LDA / LSA / doc2vec features
# ----------------------------------------------------------------------------
# Each pickle holds an (x_train, y_train, x_test) tuple. The labels are taken
# from the LSA pickle only; the label slot of the other two is ignored.
with open('./data_lda.pkl', 'rb') as f1:
    x_train_1, _, x_test_1 = pickle.load(f1)

with open('./data_s_lsvc_l2_143w_lsa.pkl', 'rb') as f2:
    x_train_2, y_train, x_test_2 = pickle.load(f2)

with open('./data_doc2vec_25.pkl', 'rb') as f3:
    x_train_3, _, x_test_3 = pickle.load(f3)

# Column-wise concatenation: np.concatenate raises if the three blocks do not
# have the same number of rows.
x_train = np.concatenate((x_train_1, x_train_2, x_train_3), axis=1)
x_test = np.concatenate((x_test_1, x_test_2, x_test_3), axis=1)

# ----------------------------------------------------------------------------
# 2. Save the fused features locally
# ----------------------------------------------------------------------------
# The context manager closes the file even if dump() raises.
data = (x_train, y_train, x_test)
with open('./data_ensemble.pkl', 'wb') as fp:
    pickle.dump(data, fp)

t_end = time.time()
print("已将原始数据数字化为融合的特征，共耗时：{}min".format((t_end-t_start)/60))

In [None]:
# -*- coding: utf-8 -*-

#@简介：将data_ensemble特征转换为稀疏矩阵，并将其合并到tfidf

import pickle
from scipy import sparse
from scipy.sparse import hstack

"""读取ensemble特征"""
# Load the dense ensemble features as an (x_train, y_train, x_test) tuple.
with open('./data_ensemble.pkl', 'rb') as f_ensemble:
    x_train_ens, y_train, x_test_ens = pickle.load(f_ensemble)

# Convert the dense numpy arrays to CSR sparse matrices. Train and test use
# the same format so they are stacked and stored consistently.
x_train_ens_s = sparse.csr_matrix(x_train_ens)
x_test_ens_s = sparse.csr_matrix(x_test_ens)

# Load the TF-IDF features; the label slot is ignored because y_train was
# already read from the ensemble pickle.
with open('./data_tfidf_select_LSVC_l2_17107.pkl', 'rb') as f_tfidf:
    x_train_tfidf, _, x_test_tfidf = pickle.load(f_tfidf)

# Stack the two sparse blocks column-wise. The output format is requested
# explicitly so both results are CSR regardless of the input formats.
x_train_spar = hstack([x_train_ens_s, x_train_tfidf], format='csr')
x_test_spar = hstack([x_test_ens_s, x_test_tfidf], format='csr')

# Save the merged sparse features locally; the context manager closes the
# file even if dump() raises.
data = (x_train_spar, y_train, x_test_spar)
with open('./data_ensemble_spar.pkl', 'wb') as f:
    pickle.dump(data, f)

In [None]:
# -*- coding: utf-8 -*-

#@简介：根据已有的特征，使用多项式方法构造出更多特征

import pickle
import time
from sklearn.preprocessing import PolynomialFeatures

# Expand an existing feature matrix with pairwise interaction terms and pickle
# the result as an (x_train, y_train, x_test) tuple.
t_start = time.time()

# Load the original features.
features_path = './data_s_lsvc_l2_143w_lsa.pkl'
with open(features_path, 'rb') as f:
    x_train, y_train, x_test = pickle.load(f)

# Build additional features. `degree` caps the polynomial order; with
# interaction_only=True and include_bias=False this requests interaction terms
# only and no constant column.
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
x_train_new = poly.fit_transform(x_train)
x_test_new = poly.transform(x_test)

# Save the constructed features to the current directory. The input's '.pkl'
# suffix is stripped before '_constr.pkl' is appended, so the output is named
# '<input name>_constr.pkl' rather than '<input name>.pkl_constr.pkl'.
data = (x_train_new, y_train, x_test_new)
features_name = features_path.split('/')[-1]
if features_name.endswith('.pkl'):
    features_name = features_name[:-len('.pkl')]
features_constr_path = features_name + '_constr.pkl'
with open(features_constr_path, 'wb') as f_data:
    pickle.dump(data, f_data)

t_end = time.time()
print("构造特征完成，共耗时：{}min".format((t_end-t_start)/60))


In [None]:
# -*- coding: utf-8 -*-

#@简介：对特征进行嵌入式选择

import time
import pickle
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC

# Embedded feature selection: fit a linear SVM on the training features, keep
# the columns SelectFromModel retains, and pickle the reduced matrices as an
# (x_train, y_train, x_test) tuple.
t_start = time.time()

# Fixed seed so the fitted model, and therefore the selected columns, are
# reproducible across runs.
RANDOM_SEED = 42

# Load the features (path of the TF-IDF feature pickle).
features_path = './data_tfidf_select_LSVC_l2_901288_select_LSVC_l2_279950.pkl'
with open(features_path, 'rb') as fp:
    x_train, y_train, x_test = pickle.load(fp)

# Run the selection. The model is fitted on the training data only; the same
# fitted selector is then applied to both train and test.
alo_name = 'LSVC_l2'
lsvc = LinearSVC(penalty='l2', C=1.0, dual=True, random_state=RANDOM_SEED).fit(x_train, y_train)
slt = SelectFromModel(lsvc, prefit=True)
x_train_s = slt.transform(x_train)
x_test_s = slt.transform(x_test)

# Save the selected features next to the input file. The output name is the
# input path without its '.pkl' suffix, followed by the selector name and the
# number of retained columns. Stripping the suffix (instead of splitting on
# every '.') keeps this correct for paths that contain other dots.
num_features = x_train_s.shape[1]
base_path = features_path[:-len('.pkl')] if features_path.endswith('.pkl') else features_path
data_path = base_path + '_select_' + alo_name + '_' + str(num_features) + '.pkl'
with open(data_path, 'wb') as data_f:
    pickle.dump((x_train_s, y_train, x_test_s), data_f)

t_end = time.time()
print("特征选择完成，选择{}个特征，共耗时{}min".format(num_features, (t_end-t_start)/60))