# 本文件说明

- nlp 文本特征

# 基本设置

In [1]:
import jieba
import os

%matplotlib inline
import numpy as np
import pandas as pd

import warnings
# Suppress every warning category from this point on; note that this also
# hides deprecation notices emitted by the libraries used below.
warnings.filterwarnings('ignore')

# Project-local helper; presumably configures pandas for Chinese text
# display -- TODO confirm against toolkits.setup.specific_func.
from toolkits.setup.specific_func import set_ch_pd
set_ch_pd()

# load data

In [2]:
from sklearn.datasets import fetch_20newsgroups

# To work with every newsgroup instead of a subset, drop the `categories`
# argument, e.g.:
#   newsgroup_train = fetch_20newsgroups(subset='train')

# Restrict the corpus to five `comp.*` newsgroups.
categories = [
    'comp.graphics',
    'comp.os.ms-windows.misc',
    'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware',
    'comp.windows.x',
]

# Fetch the train split first, then the test split, for those categories.
newsgroup_train, newsgroup_test = (
    fetch_20newsgroups(subset=split_name, categories=categories)
    for split_name in ('train', 'test')
)

# 文本特征

## Method 1. HashingVectorizer
- 规定feature个数
- 因为把所有词哈希到了10000个桶里，即10000维feature，非零元素约占1%，矩阵还不算太稀疏。而实际上用TfidfVectorizer统计可得到上万维的feature（本文选取的5个类别约为6.6万维），统计的全部样本是13w多维，就是一个相当稀疏的矩阵了。

In [3]:
# newsgroup_train.data holds the raw documents; they have to be converted into
# numeric feature vectors in order to model the text data.
from sklearn.feature_extraction.text import HashingVectorizer

# alternate_sign=False keeps the hashed term counts non-negative. It is the
# documented replacement for `non_negative=True`, which scikit-learn deprecated
# in 0.19 and removed in 0.21.
vectorizer = HashingVectorizer(stop_words='english', alternate_sign=False,
                               n_features=10000)
fea_train = vectorizer.fit_transform(newsgroup_train.data)
# The hashing vectorizer learns nothing from the data, so the test split is
# only transformed (never fitted).
fea_test = vectorizer.transform(newsgroup_test.data)

# Both matrices have shape [n_samples, n_features].
print('Size of fea_train:' + repr(fea_train.shape))
print('Size of fea_test:' + repr(fea_test.shape))
# For reference: all categories together give 11314 documents and 130107
# features when a vocabulary-based vectorizer is used.
# The reported percentage is the share of non-zero entries in fea_train.
print('The average feature sparsity is {0:.3f}%'.format(fea_train.nnz/float(fea_train.shape[0]*fea_train.shape[1])*100))



Size of fea_train:(2936, 10000)
Size of fea_train:(1955, 10000)
The average feature sparsity is 1.002%


## Method 2. CountVectorizer+TfidfTransformer

In [5]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer

# Learn the vocabulary from the training documents only.
count_v1 = CountVectorizer(stop_words='english', max_df=0.5)
counts_train = count_v1.fit_transform(newsgroup_train.data)
print("the shape of train is "+repr(counts_train.shape))

# The second CountVectorizer shares the vocabulary learned on the training
# data, so train and test matrices have identical columns. With a fixed
# vocabulary nothing needs to be fitted on the test split: only transform it.
count_v2 = CountVectorizer(vocabulary=count_v1.vocabulary_)
counts_test = count_v2.transform(newsgroup_test.data)
print("the shape of test is "+repr(counts_test.shape))

tfidftransformer = TfidfTransformer()

# Fit the IDF weights on the training counts once and reuse them for the test
# counts. Refitting on the test split would leak test statistics and make the
# two feature spaces weighted differently.
tfidf_train = tfidftransformer.fit_transform(counts_train)
tfidf_test = tfidftransformer.transform(counts_test)

the shape of train is (2936, 66432)
the shape of test is (1955, 66432)


# 保存本文件

In [None]:
if 1:  # set to 0 to skip the HTML export when running the whole notebook
    import os
    import datetime as dt

    def output_HTML(read_file, output_file):
        """Convert a notebook file to HTML and write it to disk as UTF-8.

        Parameters
        ----------
        read_file : str
            Path of the source '.ipynb' notebook.
        output_file : str
            Path of the '.html' file to create (overwritten if it exists).
        """
        from nbconvert import HTMLExporter
        import codecs
        import nbformat
        exporter = HTMLExporter()
        # read_file is '.ipynb', output_file is '.html'
        output_notebook = nbformat.read(read_file, as_version=4)
        output, _resources = exporter.from_notebook_node(output_notebook)
        # Use a context manager so the file handle is flushed and closed
        # even if the write fails.
        with codecs.open(output_file, 'w', encoding='utf-8') as html_out:
            html_out.write(output)

    html_file_folder = 'html_files'
    if not os.path.exists(html_file_folder):
        os.makedirs(html_file_folder)

    today = dt.datetime.now().strftime('%Y%m%d')
    current_file = '8_nlp.ipynb'
    # Build the path with os.path.join instead of a hard-coded backslash so
    # the export lands inside html_file_folder on every operating system.
    output_file = os.path.join(
        html_file_folder,
        '%s_%s.html' % (os.path.splitext(current_file)[0], today))
    output_HTML(current_file, output_file)