In [1]:
import pandas as pd
import numpy as np
import gc; gc.enable()
import matplotlib.pyplot as plt
import seaborn as sns
import string
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk
from nltk.corpus import stopwords
import scipy
import lightgbm as lgb
from sklearn.decomposition import NMF
from copy import deepcopy as cp

%env JOBLIB_TEMP_FOLDER=temp

env: JOBLIB_TEMP_FOLDER=temp


In [2]:
usecols = ['activation_date', 'description']

In [3]:
train = pd.read_csv('data/train.csv', parse_dates=['activation_date'], usecols=usecols).sort_values(['activation_date']).reset_index(drop=True)
test = pd.read_csv('data/test.csv', parse_dates=['activation_date'], usecols=usecols)
train.head()

Unnamed: 0,description,activation_date
0,Зимний комбенизон для малыша,2017-03-15
1,Твёрдый пластик,2017-03-15
2,,2017-03-15
3,"Продам, цена указана за все что на фото!/\nВсе...",2017-03-15
4,"Спортивный костюм. Размер 48 - 50, мало б/у. Н...",2017-03-15


In [4]:
train_len = len(train)

In [5]:
for df in [train, test]:
    df['description'].fillna('unknowndescription', inplace=True)

In [6]:
from nltk.corpus import stopwords                
from nltk.stem.snowball import RussianStemmer
stemmer = RussianStemmer(ignore_stopwords=False)
def clean_text(txt):
    words = str(txt).lower().strip().split(" \t\r.,!?^+-*/@~:;/\\\"\'&{}[]()#$%") #str(txt).split(" ") #
    words = [stemmer.stem(wrd) for wrd in words \
                if wrd not in stopwords.words('russian') and len(wrd) > 1]
    txt = u" ".join(words)
    return txt

train['description'] = train['description'].apply(clean_text); print('train description clean text done') 
test['description'] = test['description'].apply(clean_text); print('test description clean text done')  

train description clean text done
test description clean text done


In [8]:
'''
count_vectorizer_desc = CountVectorizer(max_df=0.95, min_df=2,
                                        max_features=15000,
                                        stop_words=stopwords.words('russian'))
'''
count_vectorizer_desc = TfidfVectorizer(stop_words=stopwords.words('russian'), 
                max_df=0.95, min_df=2,
                smooth_idf=False,
                sublinear_tf=True,
                max_features=15000)


desc_counts = count_vectorizer_desc.fit_transform(train['description'].append(test['description']))

train_desc_counts = desc_counts[:train_len]
test_desc_counts = desc_counts[train_len:]

del train, test; gc.collect()

25

In [9]:
from sklearn.model_selection import ParameterGrid

In [10]:
nmf = NMF(n_components=10, solver='mu', random_state=411)
nmf_features = nmf.fit_transform(desc_counts)

In [11]:
nmf_features.shape

(2011862, 10)

In [12]:
nmf_features[:3,:]

array([[  0.00000000e+00,   3.05850260e-04,   2.36557387e-04,
          0.00000000e+00,   0.00000000e+00,   1.50118352e-14,
          2.26818574e-04,   0.00000000e+00,   0.00000000e+00,
          4.61121312e-04],
       [  0.00000000e+00,   7.43872466e-55,   2.22401801e-05,
          0.00000000e+00,   3.99799082e-09,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   2.56303215e-09,
          9.67677640e-04],
       [  5.41536183e-02,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00]])

In [13]:
import pickle
with open('nmf_features.pickle', 'wb') as handle:
    pickle.dump(nmf_features, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [14]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

In [15]:
n_top_words = 30
tf_feature_names = count_vectorizer_desc.get_feature_names()
print_top_words(nmf, tf_feature_names, n_top_words)

Topic #0: unknowndescription ящиком качественными ква кв кб каши кашемировое кашемир каша качеству качеством качество качественных качественным квадратная качественный качественные качественную качественной качественное качественного качественно качественная качествен качестве качества качеств квадрат квадратной
Topic #1: продам состоянии платье идеальном куртку мальчика новую срочно хорошем новый цена девочку рабочем девочки костюм коляску телефон туфли комбинезон пальто велосипед очень осень весна новое рост цвет детскую лет года
Topic #2: состояние отличное хорошее идеальное хорош отличн очень нового носили новой качество комплект новых кожа пару фото рост мало осень работает натуральная идеальн мальчика куртка цвет продаю полный весна одевали девочки
Topic #3: состоян отличное хорошее идеальное отл пальто сапоги кроссовки туфли мальчика платье девочку ботинки весна бу джинсы кожа осень костюм куртку хор куртка девочки нормальное натуральная рабочее хорошом босоножки сапожки хорошим

## English version:

Topic #0: unknowndescription box quality kv kv kb porridge cashmere cashmere porridge quality quality quality quality qualitative square quality high-quality quality high-quality quality high-quality quality quality quality quality qualities square square

Topic #1: selling state perfect jacket boy new urgently good new price girl working girl costume pushchair phone shoes jumpsuit bicycle very autumn spring new growth color baby years of the year

Topic #2: condition excellent good perfect good excellent new very worn new quality set new leather couple photo growth little autumn works natural perfect boy jacket color sell full spring dressed girls

Topic #3: good excellent ideal coat coat boots sneakers shoes boy dress girl shoes spring bu jeans skin autumn suit jacket chorus jacket girls normal natural working well sandals boots good

Topic #4: good condition very sell jacket car sold bicycle running car stroller car phone worn auto sofa things set little self-organizing season year table baby watch bed questions technical bag kit box

Topic #5: excellent condition sell jacket very growth couple worn little overalls dressed boy kit autumn suit years car times sold cm jeans girls set coat used running season car spring

Topic #6: size new cm 44 42 46 dress leather 40 38 48 approached 37 natural shoes very color 36 insole new 39 new 50 length boots sneakers jacket boots sell jeans

Topic #7: new absolutely practical dress absolutely shoes leather bag jacket skirt jeans natural set pants suit female sneakers dressed cap black boots boots 37 sneakers sweater bike cloak coat sandals vest vest

Topic #8: bargain possible relevant relevant issues phone sell small urgently exchange is sold to a real buyer phone inspection car new car auto running new way reasonable hood engine plot rubber good requires a box

Topic #9: price apartment new for sale cm house rub sell apartment apartment close to 10 repair windows very new house call home kitchen pc plot ul school 100 rooms years shops have a city