In [2]:
import numpy as np
import pandas as pd
import os

import re
from datetime import datetime as time
from ast import literal_eval
from sklearn.model_selection import train_test_split

import artm
from gensim.models import Phrases
from gensim.models.phrases import Phraser

import gc
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
!ls -lah data/*.csv.bz2

-rw-rw-r-- 1 vtrokhymenko vtrokhymenko 491K Jan 15 18:18 data/69-.paralel.csv.bz2
-rw-rw-r-- 1 vtrokhymenko vtrokhymenko  48M Jan 15 18:21 data/Russia.csv.bz2
-rw-rw-r-- 1 vtrokhymenko vtrokhymenko  12M Jan 15 18:21 data/Sil_struktur.csv.bz2
-rw-rw-r-- 1 vtrokhymenko vtrokhymenko  44K Jan 15 18:19 data/biblioteka.csv.bz2
-rw-rw-r-- 1 vtrokhymenko vtrokhymenko  25M Jan 15 18:19 data/bivs.SSR.csv.bz2
-rw-rw-r-- 1 vtrokhymenko vtrokhymenko 6.1M Jan 15 18:18 data/biznes.csv.bz2
-rw-rw-r-- 1 vtrokhymenko vtrokhymenko 2.9M Jan 15 18:21 data/cennosti.csv.bz2
-rw-rw-r-- 1 vtrokhymenko vtrokhymenko  28M Jan 15 18:21 data/economic.csv.bz2
-rw-rw-r-- 1 vtrokhymenko vtrokhymenko  16M Jan 15 18:19 data/internet.i.smi.csv.bz2
-rw-rw-r-- 1 vtrokhymenko vtrokhymenko 8.6M Jan 15 18:18 data/iz.jizni.csv.bz2
-rw-rw-r-- 1 vtrokhymenko vtrokhymenko 125K Jan 15 18:18 data/kultprosvet.csv.bz2
-rw-rw-r-- 1 vtrokhymenko vtrokhymenko  20M Jan 15 18:18 data/kultura.csv.bz2
-rw-rw-r-- 1 vtrokhymenko v

In [4]:
name_dataset = 'orujie'

### load data

In [5]:
%%time
df = pd.read_csv(f'data/{name_dataset}.csv.bz2')
print(df.shape)

(3, 7)
CPU times: user 5.62 ms, sys: 440 µs, total: 6.06 ms
Wall time: 19 ms


In [6]:
df

Unnamed: 0,1914-lenta/,url,edition,topics,title,text,topic
0,2015-11-27 12:00:00,https://lenta.ru/news/2015/11/27/by_internet/,,Оружие / Вооружение,СМИ сообщили о приобретении парижскими террори...,"По информации следственных органов ФРГ, которы...",Оружие
1,2015-09-15 12:00:00,https://lenta.ru/news/2015/09/15/armata/,,Оружие / Вооружение,Российская армия получит на вооружение 2300 та...,Российская армия получит 2300 танков на базе б...,Оружие
2,2017-11-11 12:00:00,https://lenta.ru/news/2017/11/11/order66/,,Оружие / Вооружение,Россия потратит 700 миллионов рублей на защиту...,Минобороны потратит 735 миллионов рублей на за...,Оружие


### preprocess

* get dates

In [7]:
%%time

df['date_raw'] = df['url'].apply(re.compile('(\d{4})/(\d{2})/(\d{2})').findall)

df['year'] = df['date_raw'].apply(lambda x: int(x[0][0]))
df['month'] = df['date_raw'].apply(lambda x: int(x[0][1]))

CPU times: user 1.88 ms, sys: 0 ns, total: 1.88 ms
Wall time: 1.85 ms




* clean text

In [8]:
import preprocessing_tools as pr

In [9]:
%%time
df.text = df.text.apply(pr.clean_text)

CPU times: user 682 µs, sys: 861 µs, total: 1.54 ms
Wall time: 1.53 ms


In [10]:
df.head(2)

Unnamed: 0,1914-lenta/,url,edition,topics,title,text,topic,date_raw,year,month
0,2015-11-27 12:00:00,https://lenta.ru/news/2015/11/27/by_internet/,,Оружие / Вооружение,СМИ сообщили о приобретении парижскими террори...,по информации следственных органов фрг которые...,Оружие,"[(2015, 11, 27)]",2015,11
1,2015-09-15 12:00:00,https://lenta.ru/news/2015/09/15/armata/,,Оружие / Вооружение,Российская армия получит на вооружение 2300 та...,российская армия получит танков на базе броне...,Оружие,"[(2015, 09, 15)]",2015,9


* cleanedNames

In [11]:
from cleanedNames import KeyProc
cn = KeyProc()

In [12]:
%%time
df.text = df.text.apply(cn.replaceKeywords)

CPU times: user 5.64 ms, sys: 0 ns, total: 5.64 ms
Wall time: 5.62 ms


* lemmatization

In [13]:
%%time
df['lemmatized_text'] = df.text.apply(pr.lemmatization)

CPU times: user 65.3 ms, sys: 0 ns, total: 65.3 ms
Wall time: 64.4 ms


* add docID

In [14]:
df['docID'] = list(range(df.shape[0]))

In [15]:
df = df[['docID','year','month','lemmatized_text']]
df.head(2)

Unnamed: 0,docID,year,month,lemmatized_text
0,0,2015,11,"[информация, следственный, орган, фрг, который..."
1,1,2015,9,"[российский, армия, получить, танк, база, брон..."


In [16]:
print(df.shape)
df.isnull().sum()

(3, 4)


docID              0
year               0
month              0
lemmatized_text    0
dtype: int64

In [17]:
df.year.min()

2015

## model

* prepare

In [18]:
%%time

vwpath = f'data/vwpath/{name_dataset}_input_bigartm.vw'

with open(vwpath, 'w') as fp:
    for text, did in df[['lemmatized_text', 'docID']].values:
        fp.write('{} | {}\n'.format(did, ' '.join(text)))

CPU times: user 1.32 ms, sys: 252 µs, total: 1.57 ms
Wall time: 1.36 ms


In [19]:
id_date = df[['docID', 'year', 'month']].copy()

* collect batchs and collect the dictionary

In [20]:
%%time

batches_path = f'data/batches/{name_dataset}'

if not os.path.exists(batches_path):
    print('create folder...\n')
    os.makedirs(batches_path)
    
batch_vectorizer = artm.BatchVectorizer(data_path=vwpath,
                                        data_format='vowpal_wabbit',
                                        target_folder=batches_path)

print(batch_vectorizer)

create folder...

artm.BatchVectorizer(data_path="data/batches/orujie", num_batches=1)
CPU times: user 22.7 ms, sys: 54.4 ms, total: 77.1 ms
Wall time: 14.9 ms


In [21]:
dictionary = artm.Dictionary()
dictionary.gather(data_path=batches_path)

In [22]:
dictionary.filter(min_tf=10, max_df_rate=0.1) #int(4.5e5)

artm.Dictionary(name=3f2e9f08-4522-476a-b3e7-5e3d8cade221, num_entries=0)

In [23]:
dictionary.save_text(f'data/dicts/dict_{name_dataset}.txt')

### fit model

In [25]:
num_topics = 5
model = artm.ARTM(num_topics=num_topics, dictionary=dictionary, show_progress_bars=True)

InvalidOperationException: Dictionary '3f2e9f08-4522-476a-b3e7-5e3d8cade221' has no entries

In [None]:
model.scores.add(artm.PerplexityScore(name='PerplexityScore',dictionary=dictionary))
model.scores.add(artm.SparsityThetaScore(name='SparsityThetaScore'))
model.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore'))

# regularizers
model.regularizers.add(artm.SmoothSparsePhiRegularizer(name='SparsePhi', tau=-0.1))
model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='SparseTheta', tau=-0.5))
model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='DecorrelatorPhi', tau=1.5e+5))

model.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=3)    

In [None]:
print(f"SparsityThetaScore: {model.score_tracker['SparsityThetaScore'].last_value}")
print(f"SparsityPhiScore: {model.score_tracker['SparsityPhiScore'].last_value}")
print(f"PerplexityScore: {model.score_tracker['PerplexityScore'].last_value}")

## pick up amount topics

In [None]:
%%time
model_list = []
step, perplexity, sTheta, sPhi = [], [], [], []

for i in range(2,20+1,1):
    print(f'#topic {i}.........')
    step.append(i)
    date_start = time.now()
    
    num_topics = i
    model = artm.ARTM(num_topics=num_topics, dictionary=dictionary)
    
    # scores
    model.scores.add(artm.PerplexityScore(name='PerplexityScore',dictionary=dictionary))
    model.scores.add(artm.SparsityThetaScore(name='SparsityThetaScore'))
    model.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore'))

    # regularizers
    model.regularizers.add(artm.SmoothSparsePhiRegularizer(name='SparsePhi', tau=-0.1))
    model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='SparseTheta', tau=-0.5))
    model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='DecorrelatorPhi', tau=1.5e+5))
    
    model.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=50)
    model_list.append(model)
    
    sparsityTheta = model.score_tracker['SparsityThetaScore'].last_value
    sTheta.append(sparsityTheta)
    sparsityPhi = model.score_tracker['SparsityPhiScore'].last_value
    sPhi.append(sparsityPhi)
    perpl = model.score_tracker['PerplexityScore'].last_value
    perplexity.append(perpl)
    
    print(f'\tSparsityThetaScore: {sparsityTheta}')
    print(f'\tSparsityPhiScore: {sparsityPhi}')
    print(f'\tPerplexityScore: {perpl}')
    
    print('\ttime:',time.now()-date_start)
    
newTime = time.now() #

In [None]:
#model_list

In [None]:
plt.plot(step, sPhi, color='k',linewidth=3, marker='x')
ax = plt.gca()
ax.set_xticks(step)
plt.grid()
plt.title('SparsityPhiScore')

In [None]:
index_sphi = []
[index_sphi.append(float(str(i)[:3]) ) for i in sPhi]
    
index_sphi = list(set(index_sphi))
index_sphi.sort()
index_sphi

In [None]:
d_sphi = {}

for j in range(len(index_sphi)):
    k=0
    for i in sPhi:
        if j==range(len(index_sphi))[-1]:
            if (i>=index_sphi[j]):
                k+=1
        else:
            if (i>=index_sphi[j]) & (i<index_sphi[j+1]):
                k+=1
    d_sphi[index_sphi[j]]=k

d_sphi

In [None]:
plt.plot(step, sTheta, color='g',linewidth=3, marker='x')
ax = plt.gca()
ax.set_xticks(step)
plt.grid()

plt.title('SparsityThetaScore')

In [None]:
index_theta = []
[index_theta.append(float(str(i)[:3])) for i in sTheta]
    
index_theta = list(set(index_theta))
index_theta.sort()
index_theta

In [None]:
d_stheta = {}

for j in range(len(index_theta)):
    k=0
    for i in sTheta:
        if j==range(len(index_theta))[-1]:
            if (i>=index_theta[j]):
                k+=1
        else:
            if (i>=index_theta[j]) & (i<index_theta[j+1]):
                k+=1
    d_stheta[index_theta[j]]=k

d_stheta

In [None]:
plt.plot(step, perplexity, color='b',linewidth=3, marker='x')
ax = plt.gca()
ax.set_xticks(step)
plt.grid()
plt.title('PerplexityScore')

In [None]:
perplexity

### select the best model

In [None]:
print(f'd_sphi:   {d_sphi}')
print(f'd_stheta: {d_stheta}')

In [None]:
best_model = model_list[-1]

### print top words for each topics

In [None]:
phi = best_model.get_phi()
phi['word'] = phi.index
#phi.head()

for col in phi.columns:
    if col != 'word':
        print(f"{col}: {phi[[col, 'word']].sort_values(by=col, ascending=False)['word'].values[:10]}")
        #print('\t',phi[[col, 'word']].sort_values(by=col, ascending=False)['word'].values[:10])

### get topics-docs matrix

In [None]:
theta = best_model.transform_sparse(batch_vectorizer=batch_vectorizer, eps=0.001)
print(f'shape theta: {theta[0].shape}')

plt.figure(figsize=(20, 20))
plt.imshow(theta[0][:, 2000:2500].toarray());

### dense theta matrix

In [None]:
theta = best_model.transform(batch_vectorizer=batch_vectorizer)
theta = theta.T
theta.head()

## DataViz

In [None]:
joined = id_date.join(theta)
joined.head()

In [None]:
topics = ['topic_{}'.format(i) for i in range(num_topics)]
gb = joined.groupby(['year', 'month'])[topics].sum()
print(f'gb.shape: {gb.shape}')
gb.head()

### straightforward matrix heatmap

In [None]:
plt.figure(figsize=(10, 40))
plt.imshow(gb[topics])

## pygal

In [None]:
import pygal
from IPython.display import SVG, HTML

In [None]:
html_pygal = """
<!DOCTYPE html>
<html>
  <head>
  <script type="text/javascript" src="http://kozea.github.com/pygal.js/javascripts/svg.jquery.js"></script>
  <script type="text/javascript" src="http://kozea.github.com/pygal.js/javascripts/pygal-tooltips.js"></script>
    <!-- ... -->
  </head>
  <body>
    <figure>
      {pygal_render}
    </figure>
  </body>
</html>
"""

In [None]:
line_chart = pygal.StackedLine(fill=True, show_dots=False)
line_chart.title = 'Topics'
for topic in topics:
    line_chart.add(topic, gb[topic].values)
    
HTML(html_pygal.format(pygal_render=line_chart.render(is_unicode=True)))

In [None]:
line_chart.render_to_file(f'data/visualization/{name_dataset}_pygal.svg')

## joypy

In [None]:
import joypy # !pip install joypy

In [None]:
print(joined.shape)
joined.head()

In [None]:
joined['year_month'] = joined.apply(lambda x: str(int(x.year)) +'-'+ str(int(x.month)) + '-01', axis=1)
joined.reset_index(inplace=True)

joined.head()

In [None]:
joined_melt = joined.melt('year_month', var_name='topic', value_name='text')
print(joined_melt.shape)
joined_melt.head()

In [None]:
# select only topic_*
joined_melt_new = joined_melt[(joined_melt.topic!='docID') & (joined_melt.topic!='index')
                              & (joined_melt.topic!='year') & (joined_melt.topic!='month')]
print(joined_melt_new.shape)
joined_melt_new.topic.value_counts()

In [None]:
labels = sorted(joined_melt_new['year_month'].unique())

fig, axes = joypy.joyplot(joined_melt_new, by='topic', column='text', figsize=(20, 10),xlabelsize=20,ylabelsize=20,
                          overlap=0.4, fill=True, linecolor="k", linewidth=2,
                          kind='values', fade=True, xrot=90, x_range=[i for i in range(len(labels))],
                          background='white');

ticks_labels = {i:t for i, t in enumerate(labels)}
axes[-1].set_xticks([k for k, v in ticks_labels.items() if k % 6 == 0])
ticks = axes[-1].get_xticks()
axes[-1].set_xticklabels([ticks_labels.get(i, 'stop') for i in ticks]);

In [None]:
fig.savefig(f'data/visualization/{name_dataset}_joypy')