In [None]:
import pandas as pd
import numpy as np
import gc; gc.enable()
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pickle
from scipy.sparse import csr_matrix, hstack

import nltk; nltk.download("stopwords")
from nltk.corpus import stopwords                
from nltk.stem.snowball import RussianStemmer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize

In [None]:
# Input locations. `files` lists every CSV whose text is streamed when fitting
# the vectorizers; `train_path` / `test_path` single out the labelled train
# file and the test file respectively.
train_path = 'data/train.csv'
# Previously this pointed at 'data/train.csv' (copy-paste slip), which made
# `test_len` equal to `train_len` and made every `f == test_path` comparison
# match the train file instead of the test file.
test_path = 'data/test.csv'
files = [train_path, test_path, 'data/train_active.csv', 'data/test_active.csv']

In [None]:
# Row counts of the train and test files. Only the `item_id` column is parsed,
# which keeps the read cheap; the number of rows is all that is needed here.
train_len = len(pd.read_csv(train_path, usecols=['item_id']))
test_len = len(pd.read_csv(test_path, usecols=['item_id']))

In [None]:
stemmer = RussianStemmer(ignore_stopwords=False)

# Characters treated as token boundaries (ordinary whitespace is handled by
# str.split() below).
_TOKEN_DELIMITERS = " \t\r.,!?^+-*/@~:;/\\\"\'&{}[]()#$%"
# Maps every delimiter character to a space so a plain split() can tokenize.
_DELIMITER_TABLE = str.maketrans({ch: " " for ch in _TOKEN_DELIMITERS})
# Built once: avoids calling stopwords.words() and scanning a list per token.
_RUSSIAN_STOPWORDS = frozenset(stopwords.words('russian'))


def clean_text(txt):
    """Lower-case, tokenize, drop stop words / 1-char tokens, and stem.

    The previous implementation passed the delimiter characters to
    ``str.split`` as a single separator string. ``str.split(sep)`` only splits
    on the *entire* ``sep`` sequence, so the text was effectively never
    tokenized and the stemmer received the whole document as one "word".
    Each delimiter character is now treated as a boundary of its own.

    Parameters
    ----------
    txt : object
        Raw text; coerced with ``str()`` so non-string values are accepted.

    Returns
    -------
    str
        Stemmed tokens joined by single spaces (empty string if none remain).
    """
    tokens = str(txt).lower().translate(_DELIMITER_TABLE).split()
    return u" ".join(
        stemmer.stem(token)
        for token in tokens
        if len(token) > 1 and token not in _RUSSIAN_STOPWORDS
    )

In [None]:
def process_chunk(df, is_title):
    """Fill missing text in a chunk and, for descriptions, clean it.

    Parameters
    ----------
    df : pandas.DataFrame
        Chunk containing a ``title`` column (when ``is_title`` is true) or a
        ``description`` column (otherwise). The column is updated on ``df``
        itself.
    is_title : bool
        Selects which text column is processed. Titles only get their missing
        values filled; descriptions are additionally passed through
        ``clean_text``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    if is_title:
        # Assign the result back instead of `df['title'].fillna(..., inplace=True)`:
        # the chained in-place form acts on a temporary Series and does not
        # update `df` when pandas Copy-on-Write is active.
        df['title'] = df['title'].fillna('unknowntitle')
    else:
        df['description'] = df['description'].fillna('unknowndescription')
        print('text cleaning!')
        df['description'] = [clean_text(text) for text in tqdm(df['description'].values)]

    return df

In [None]:
def text_generator(is_title, is_partial=False):
    """Stream processed text values from the CSVs listed in ``files``.

    Files are read in chunks so the whole corpus never has to sit in memory.
    Each chunk goes through ``process_chunk`` and its text values are yielded
    one by one, in file order.

    Parameters
    ----------
    is_title : bool
        Yield the ``title`` column when true, otherwise ``description``.
    is_partial : bool, default False
        When true, stop once the file equal to ``test_path`` has been read in
        full, so later entries of ``files`` are not streamed. This relies on
        ``test_path`` naming the test file.

    Yields
    ------
    str
        One processed text value per row.
    """
    target_col = 'title' if is_title else 'description'
    usecols = [target_col, 'activation_date']

    for f in files:
        print('Processing file:', f)

        for chunk in pd.read_csv(f, usecols=usecols, chunksize=2000000, parse_dates=['activation_date']):
            if f == train_path:
                # NOTE(review): the sort is applied per chunk, so rows are only
                # globally ordered if the train file fits in a single chunk.
                chunk = chunk.sort_values('activation_date').reset_index(drop=True)
                print('Index reset!')

            chunk = chunk.drop('activation_date', axis=1)
            chunk = process_chunk(chunk, is_title); gc.collect()

            for s in tqdm(chunk[target_col].values):
                yield s

            del chunk; gc.collect()

        # Previously a `break` sat inside the chunk loop: it only left that
        # inner loop (after the first chunk) and the outer loop carried on with
        # the remaining files. Returning here, after the whole file has been
        # consumed, actually ends the partial stream.
        if is_partial and f == test_path:
            return

In [None]:
# Bag-of-words counts for titles: the vocabulary is learned from the full text
# stream, then only the partial stream (is_partial=True) is transformed.
count_vectorizer_title = CountVectorizer(
    min_df=25,
    lowercase=True,
    stop_words=stopwords.words('russian'),
)
count_vectorizer_title.fit(text_generator(is_title=True))
title_feature = count_vectorizer_title.transform(
    text_generator(is_title=True, is_partial=True)
)

# Persist the sparse matrix, then release it before the next stage runs.
with open('complete_title_count_vec.pickle', 'wb') as handle:
    pickle.dump(title_feature, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print('title text features saved')
    del title_feature
    gc.collect()

In [None]:
# TF-IDF over description unigrams and bigrams, capped at 30000 features.
# As with titles, fitting uses the full text stream and transforming uses the
# partial one (is_partial=True).
tfidf_vec = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 2),
    lowercase=True,
    stop_words=stopwords.words('russian'),
    max_df=0.9,
    max_features=30000,
    sublinear_tf=True,
    smooth_idf=True,
    norm='l2',
)
tfidf_vec.fit(text_generator(is_title=False))
desc_tf_idf_features = tfidf_vec.transform(
    text_generator(is_title=False, is_partial=True)
)

print('TFIDF transformation done.')
# Persist the sparse matrix, then release it before the next stage runs.
with open('complete_desc_tfidf_vec.pickle', 'wb') as handle:
    pickle.dump(desc_tf_idf_features, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print('desc text features saved')
    del desc_tf_idf_features
    gc.collect()

In [None]:
# Reload the two feature matrices written by this notebook. Using `with`
# closes each file promptly; the previous `pickle.load(open(...))` calls left
# both handles open.
with open('complete_title_count_vec.pickle', 'rb') as handle:
    title_counts = pickle.load(handle)
with open('complete_desc_tfidf_vec.pickle', 'rb') as handle:
    desc_tfidf = pickle.load(handle)

# Title counts are L2-normalised per row before being stacked column-wise
# next to the description TF-IDF matrix (built with norm='l2').
all_text_sparse = hstack([
    csr_matrix(normalize(title_counts, norm='l2', axis=1)),
    csr_matrix(desc_tfidf),
]).tocsr()
del title_counts, desc_tfidf; gc.collect()

# The first `train_len` rows are saved as the train matrix and everything
# after them as the test matrix.
with open('train_complete_text_sparse_vec.pickle', 'wb') as handle:
    pickle.dump(all_text_sparse[:train_len,:], handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('test_complete_text_sparse_vec.pickle', 'wb') as handle:
    pickle.dump(all_text_sparse[train_len:,:], handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Reduce the stacked sparse text features to 150 dense components.
# The method name was misspelled `fit_tranform`, which raises AttributeError.
# `random_state` is fixed so the randomized SVD gives the same components on
# every run.
all_text_dense = TruncatedSVD(n_components=150, random_state=0).fit_transform(all_text_sparse)

# Same row split as the sparse features: the first `train_len` rows go to the
# train file and the remainder to the test file.
with open('train_complete_text_dense_vec.pickle', 'wb') as handle:
    pickle.dump(all_text_dense[:train_len,:], handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('test_complete_text_dense_vec.pickle', 'wb') as handle:
    pickle.dump(all_text_dense[train_len:,:], handle, protocol=pickle.HIGHEST_PROTOCOL)