In [1]:
import pickle
import numpy as np
import gc; gc.enable()   
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from multiprocessing import Pool
from concurrent.futures import ThreadPoolExecutor as thread_pool
import itertools
from copy import deepcopy as cp
from scipy.sparse import hstack, csr_matrix
import os

class CustomCounterVectorizer():
    def __init__(self, save_folder, norm=True, **counter_vectorizer_params):
        self.save_folder = save_folder
        if not os.path.exists(save_folder):
            os.makedirs(save_folder)
    
        self.default_params = counter_vectorizer_params
        self.norm = norm
        self.min_df = counter_vectorizer_params['min_df'] if 'min_df' in counter_vectorizer_params else 1
        self.max_features = counter_vectorizer_params['max_features'] if 'max_features' in counter_vectorizer_params else 2**20
        
        print('Params: \nnorm =', self.norm, '\nmin_df =', self.min_df, '\nmax_features =', self.max_features)
        print('CounterVectorizer Params:\n', counter_vectorizer_params)
        print('')
    
    def collect_vocab(self, X):
        cv = CountVectorizer(**self.default_params)
        self.analyze_func = cv.build_analyzer()
        self.voc_pool = set()
        
        '''
        cpu_count = 4
        with thread_pool(max_workers=cpu_count) as executor:
            for i, res in tqdm(enumerate(executor.map(self.analyze_func, X))):
                self.voc_pool.update(res)
        '''
        for x in X:
            self.voc_pool.update(self.analyze_func(x))
        
        print('Vocabulary collection done. Vocabulary size =', len(self.voc_pool))  
        print('\n')
        #print(X, '\n\tafter analyzed ==>', self.voc_pool)
    
    def counter_vectorize(self, data):
        X = data[0]
        vocs = data[1]
        params = cp(self.default_params)
        params.update({'vocabulary': list(vocs)})
        cv = CountVectorizer(**params)
        res = cv.fit_transform(X)
        return cv, res
        
    def minibatch_counter_vectorize(self, X):
        
        chunksize = 2**20
        self.voc_pool = list(self.voc_pool)
        voc_chunks = [self.voc_pool[i*chunksize: (i+1)*chunksize] for i in range(len(self.voc_pool)//chunksize + 1)] 
        print('Total batches to execute:', len(voc_chunks))
        
        cpu_count = 4
        for i, voc_chunk in enumerate(voc_chunks):
            #if i == 0:
            #    continue
                
            cv, res = self.counter_vectorize((X, voc_chunk))
            processed_feature_names = cv.get_feature_names()
            cond_filter = np.array(np.clip(res.getnnz(axis=0) - self.min_df, 0, 1), dtype=bool)
            remaining_feature_names = np.array(processed_feature_names)[cond_filter]
            res = res[:, cond_filter]; gc.collect()
            print('Processed features #:', len(processed_feature_names), ', Remaining features #: ', len(remaining_feature_names), '\n')
    
            if res is None:
                print('No results remain!')
                continue
            
            with open('{}text_features_{}.pickle'.format(self.save_folder, i+1), 'wb') as handle:
                pickle.dump(res, handle, protocol=pickle.HIGHEST_PROTOCOL)
                print('Features Saved.')
            
            with open('{}text_feature_names_{}.pickle'.format(self.save_folder, i+1), 'wb') as handle:
                pickle.dump(remaining_feature_names, handle, protocol=pickle.HIGHEST_PROTOCOL)
                print('Feature Names Saved.')
                
            del res, processed_feature_names, cond_filter, remaining_feature_names
            gc.collect()
    
                
    def fit_transform(self, X, y=None):
        self.collect_vocab(X)
        self.minibatch_counter_vectorize(X)
        
        chunksize = 2**20
        self.voc_pool = list(self.voc_pool)
        voc_chunks = [self.voc_pool[i*chunksize: (i+1)*chunksize] for i in range(len(self.voc_pool)//chunksize + 1)] 
        print('Total batches to execute:', len(voc_chunks))
        
        cpu_count = 4
        final_res = None
        for i, _ in enumerate(voc_chunks):
            print('processing', i)
            with open('{}text_features_{}.pickle'.format(self.save_folder, i+1), 'rb') as handle:
                if i == 0:
                    final_res = pickle.load(handle)
                else:
                    old_final_res = final_res
                    final_res = hstack([final_res, pickle.load(handle)]).tocsr()
                    del old_final_res; gc.collect()
                    cond_filter = np.array(np.clip(final_res.getnnz(axis=0) - self.min_df, 0, 1), dtype=bool)
                    final_res = final_res[:, cond_filter]; gc.collect()
                    
                while final_res.shape[1] > self.max_features:
                    self.min_df += 1
                    print('Updated min df =', self.min_df)
                    cond_filter = np.array(np.clip(final_res.getnnz(axis=0) - self.min_df, 0, 1), dtype=bool)
                    final_res = final_res[:, cond_filter]; gc.collect()
                        
        print('Final shape:', final_res.shape)
        with open('{}text_features_all.pickle'.format(self.save_folder), 'wb') as handle:
            pickle.dump(final_res, handle, protocol=pickle.HIGHEST_PROTOCOL)
            print('All Features Merged and Saved.')
        del final_res; gc.collect()

params = {
    'ngram_range': (2, 4), 
    'analyzer': 'char_wb',
    'min_df': 5,
    'max_df': .85,
    'max_features': 2**18,
    'lowercase': True,
    'dtype': np.float32    
}

with open('desc_text_feature.pickle', 'rb') as handle:
    text_feature = pickle.load(handle)
ccv = CustomCounterVectorizer('0530_CV_CHAR_WB_NGRAM_24/', norm=True, **params)
ccv.fit_transform(text_feature)

Params: 
norm = True 
min_df = 5 
max_features = 262144
CounterVectorizer Params:
 {'ngram_range': (2, 4), 'analyzer': 'char_wb', 'min_df': 5, 'max_df': 0.85, 'max_features': 262144, 'lowercase': True, 'dtype': <class 'numpy.float32'>}

Vocabulary collection done. Vocabulary size = 1302267


Total batches to execute: 2
Processed features #: 1048576 , Remaining features #:  345792 

Features Saved.
Feature Names Saved.
Processed features #: 253691 , Remaining features #:  83595 

Features Saved.
Feature Names Saved.
Total batches to execute: 2
processing 0
Updated min df = 6
Updated min df = 7
Updated min df = 8
Updated min df = 9
Updated min df = 10
processing 1
Updated min df = 11
Updated min df = 12
Updated min df = 13
Updated min df = 14
Updated min df = 15
Final shape: (2011862, 261585)
All Features Merged and Saved.


In [1]:
comm_feature = 'all_features.pickle'
text_feature = 'CV_CHAR_WB_NGRAM_24/text_features_all.pickle'

In [2]:
import pickle
import pandas as pd
import numpy as np
import gc; gc.enable()
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix, hstack, save_npz
from sklearn.preprocessing import normalize

In [3]:
with open(comm_feature, 'rb') as handle1:
    comm_feature = pickle.load(handle1).astype(np.float32)
    print(type(comm_feature))
    
with open(text_feature, 'rb') as handle2:
    text_feature = pickle.load(handle2)
    text_feature = normalize(text_feature)
    print(type(text_feature))

<class 'scipy.sparse.csr.csr_matrix'>
<class 'scipy.sparse.csr.csr_matrix'>


In [4]:
all_features = hstack([comm_feature, text_feature]).tocsr()

In [5]:
all_features = all_features.astype(np.float64)

In [6]:
all_features.shape

(2011862, 290474)

In [7]:
save_npz('all_features_cv_charwb24.npz', all_features)