In [15]:
import pandas as pd
import numpy as np
import gc; gc.enable()
import matplotlib.pyplot as plt
import seaborn as sns
import string
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk
from nltk.corpus import stopwords
import scipy
import lightgbm as lgb
from sklearn.decomposition import LatentDirichletAllocation
from copy import deepcopy as cp

%env JOBLIB_TEMP_FOLDER=temp

env: JOBLIB_TEMP_FOLDER=temp


In [2]:
usecols = ['activation_date', 'description']

In [3]:
train = pd.read_csv('data/train.csv', parse_dates=['activation_date'], usecols=usecols).sort_values(['activation_date']).reset_index(drop=True)
test = pd.read_csv('data/test.csv', parse_dates=['activation_date'], usecols=usecols)
train.head()

Unnamed: 0,description,activation_date
0,Зимний комбенизон для малыша,2017-03-15
1,Твёрдый пластик,2017-03-15
2,,2017-03-15
3,"Продам, цена указана за все что на фото!/\nВсе...",2017-03-15
4,"Спортивный костюм. Размер 48 - 50, мало б/у. Н...",2017-03-15


In [4]:
train_len = len(train)

In [5]:
for df in [train, test]:
    df['description'].fillna('unknowndescription', inplace=True)

In [6]:
from nltk.corpus import stopwords                
from nltk.stem.snowball import RussianStemmer
stemmer = RussianStemmer(ignore_stopwords=False)
def clean_text(txt):
    words = str(txt).lower().strip().split(" \t\r.,!?^+-*/@~:;/\\\"\'&{}[]()#$%") #str(txt).split(" ") #
    words = [stemmer.stem(wrd) for wrd in words \
                if wrd not in stopwords.words('russian') and len(wrd) > 1]
    txt = u" ".join(words)
    return txt

train['description'] = train['description'].apply(clean_text); print('train description clean text done') 
test['description'] = test['description'].apply(clean_text); print('test description clean text done')  

train description clean text done
test description clean text done


In [7]:
count_vectorizer_desc = CountVectorizer(max_df=0.95, min_df=2,
                                        max_features=15000,
                                        stop_words=stopwords.words('russian'))
'''
TfidfVectorizer(stop_words=stopwords.words('russian'), 
                                        sublinear_tf=True,
                                        max_features=15000)
'''

desc_counts = count_vectorizer_desc.fit_transform(train['description'].append(test['description']))

train_desc_counts = desc_counts[:train_len]
test_desc_counts = desc_counts[train_len:]

del train, test; gc.collect()

25

In [8]:
from sklearn.model_selection import ParameterGrid

In [9]:
def fit_and_try(default_params, try_params):
    params_list = ParameterGrid(try_params)

    for i, param in enumerate(params_list):
        used_param = cp(default_params)
        used_param.update(param)

        lda = LatentDirichletAllocation(**used_param)

        lda.fit(train_desc_counts)
        print(str(param), '==========> perplexity =', lda.perplexity(test_desc_counts))
        
        del lda; gc.collect()

In [3]:
default_params = {
    'n_components': 10, 
    'max_iter': 5,
    'learning_method': 'online',
    'learning_offset': 50.,
    'random_state': 411, 
    'n_jobs': 3
}

try_params = {
    'n_components': [5, 10, 20, 40]
}
fit_and_try(default_params, try_params)

NameError: name 'cp' is not defined

In [12]:
default_params = {
    'n_components': 20, 
    'max_iter': 5,
    'learning_method': 'online',
    'learning_offset': 70.,
    'learning_decay': .7,
    'random_state': 411, 
    'n_jobs': 3
}

try_params = {
    'max_iter': [10, 30, 60]
}
fit_and_try(default_params, try_params)



OSError: [Errno 28] No space left on device

In [25]:
lda = LatentDirichletAllocation(n_components=10, max_iter=100,
                                learning_method='online', 
                                random_state=411, n_jobs=1)
lda_features = lda.fit_transform(desc_counts)

In [26]:
lda_features.shape

(2011862, 10)

In [27]:
lda.perplexity(desc_counts)

3413.030831206428

In [28]:
lda_features[:3,:]

array([[ 0.27500001,  0.025     ,  0.025     ,  0.025     ,  0.025     ,
         0.27499999,  0.275     ,  0.025     ,  0.025     ,  0.025     ],
       [ 0.36666666,  0.03333333,  0.03333333,  0.03333333,  0.03333333,
         0.03333333,  0.36666667,  0.03333333,  0.03333333,  0.03333333],
       [ 0.05      ,  0.05      ,  0.55      ,  0.05      ,  0.05      ,
         0.05      ,  0.05      ,  0.05      ,  0.05      ,  0.05      ]])

In [29]:
import pickle
with open('lda_features.pickle', 'wb') as handle:
    pickle.dump(lda_features, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [30]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

In [31]:
n_top_words = 30
tf_feature_names = count_vectorizer_desc.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

Topic #0: руб 100 шт продам новая 200 полный комплект 500 50 коляска 300 150 12 нов хлопок сумка мес 30 продаются 600 самовывоз 110 кровать 800 цвет 250 70 40 часы
Topic #1: торг возможен года автомобиль куртка обмен авто осень машина 000 продаю весна год цена двигатель салон резина стоит сигнализация комплектация автомобиля пробег км темно продажа музыка договорная птс месяц зима
Topic #2: unknowndescription гб комплекте этаж 11 память усб процессор фи wи ипhоне тип камера видео диск интел ремонта 16 состояние около чехол характеристики питания продаем ноутбук блуетоотh жесткий 10 hд количество
Topic #3: состоянии хорошем отличном продам новый торг состоян идеальном телефон продается памяти 42 комплект экран рабочем поддержка кроссовки могу вещи коробка подарок очень разрешение аккумулятор документы красивый полностью абсолютно сим 86
Topic #4: дом квартира кв рядом квартиру продается дома окна участок доме кухня продам район города районе комнаты ремонт школа магазины квартире ул вод

## English version:

* Topic #0: rub 100 pieces selling new 200 complete set 500 50 stroller 300 150 12 new cotton bag month 30 sold 600 self-export 110 bed 800 color 250 70 40 hours

* Topic #1: auction is possible Year car car exchange car autumn car 000 sell spring year price engine interior rubber is signaling equipment car make mileage dark sale music contract price pts month winter

* Topic #2: unknowndescription gb bundle floor 11 memory usb processor fi wi iphone type camera video drive intel repair 16 condition about case cover power specifications sell laptop hardhath hard 10 hd number

* Topic #3: condition good excellent selling new bargaining is the ideal phone sold memory 42 set screen working support sneakers can things box gift very resolution battery documents beautiful completely absolutely sim 86

* Topic #4: house apartment apartment near apartment for sale house window house plot kitchen sell district city district rooms repair school shops apartment apartment street water is accessibility garden gas area step garage parking bargain

* Topic #5: phone work issues natural urgently repair any 25 installation all call furniture service cost time possibility to install light table cabinet requires work to hand over furnishings documents 15 store work free of charge your

* Topic #6: cm mm length kg height weight material width wheel dimensions child safety 80 characteristics easily 12 color diameter size type very 10 20 23 m2 60 has time 30

* Topic #7: size condition very sell cm new excellent sell dress good color growth skin color 44 give boy new girls 46 photos bargain suit suit worn perfect years jumpsuit new company

* Topic #8: price 00 delivery availability of rubles photo call 10 works warranty write shop 19 store possible 20 russia can shop prices choice black order our range 1000 our goods sizes day

* Topic #9: this is also the connection to call the new time coat bought selling set of work excellent new allows replacement which hands of the month which quality 74 oil absolutely 54 experience thanks to 90 so excellent has