# LDA clustering for articles from pit.pl

## Performance problems

In [1]:
import pyMorfologik
from pyMorfologik import Morfologik
from pyMorfologik.parsing import ListParser
import nltk

In [2]:
import numpy as np
import pandas as pd
import re
from copy import deepcopy
from sklearn import feature_extraction


In [3]:
# DataFrame with the scraped articles, read from a CSV file given by a relative path.
scraping = pd.read_csv('dane-bez-param.csv')

In [4]:
# Keep only the plain text scraped from each URL (the 'text' column).
documents = scraping['text']
len(documents)

7900

In [5]:
# The text that follows an author's signature is not needed for the analysis, so
# everything from the signature to the end of the document is replaced by a space.
# This must run before newline characters (\n) are removed, because the pattern
# uses them to locate the signature.
import re

# Signature variant handled here: "Name Surname", an optional description and a
# "*.pl" domain name. Written as a raw string so that "\s" and "\." are regex
# escapes rather than invalid string escapes; the pattern itself is unchanged.
# NOTE(review): [A-Z][a-z] only matches ASCII letters, so names containing Polish
# diacritics are not matched by this pattern - confirm whether that is intended.
AUTHOR_WITH_DOMAIN_RE = r"\n+_?[A-Z][a-z]+\s[A-Z][a-z]+\s*\n?[A-Za-z\s]+\.pl.*(\n.*)*"
articles = [re.sub(AUTHOR_WITH_DOMAIN_RE, " ", str(a)) for a in documents]


In [6]:
def words_num(set):
    """Return the sum of ``len()`` over every element of ``set``.

    For a collection of strings this is the total number of characters; for a
    collection of token lists it is the total number of tokens.
    """
    return sum(len(item) for item in set)
def set_reduction(set1,set2):
    """Print how much smaller ``set2`` is than ``set1``.

    Sizes are measured with ``words_num``. The message reports the absolute
    difference and the size of ``set2`` as a fraction of ``set1``.

    Parameters
    ----------
    set1 : collection used as the baseline.
    set2 : collection compared against the baseline.

    Returns
    -------
    None
    """
    # Measure each collection once; the sizes are reused in both message fields.
    size_before = words_num(set1)
    size_after = words_num(set2)
    # An empty baseline has no meaningful ratio; report NaN instead of raising.
    ratio = size_after / size_before if size_before else float("nan")
    print('Zbiór zmniejszył się o: {} słów. Wynosi: {:.5f} początkowego '
          .format(size_before - size_after, ratio))
set_reduction(documents, articles)

Zbiór zmniejszył się o: 2481106 słów. Wynosi: 0.93614 początkowego 


In [7]:
# Author's signature without a domain name: "Name X. Surname" followed by an empty
# line; the signature and the first line after it are replaced by a space.
# Raw string: identical pattern, without invalid "\w" / "\s" / "\." string escapes.
articles = [re.sub(r"\n+_?[A-Z]\w+\s[A-Z]\.?\s[A-Z]\w+\s*\n\n.*", " ", str(a)) for a in articles]


In [8]:
set_reduction(documents, articles)

Zbiór zmniejszył się o: 2502962 słów. Wynosi: 0.93557 początkowego 


In [9]:
# Remove local paths (anything starting with "/") and global URLs (anything starting
# with "htt", up to the next whitespace), replacing each with a space.
# Raw string: identical pattern, without invalid "\-" / "\." / "\s" string escapes.
articles = [re.sub(r"/[a-zA-Z0-9_\-\./]+|htt[^\s]*", " ", str(a)) for a in articles]

In [10]:
set_reduction(documents, articles)

Zbiór zmniejszył się o: 2992233 słów. Wynosi: 0.92298 początkowego 


In [11]:
# Replace newlines and punctuation / special characters with a space, in one pass.
# Each matched character becomes exactly one space, so string lengths do not change.
# The class keeps "|" because the previous pattern, which separated its characters
# with "|" inside the brackets, matched a literal pipe as well.
articles = [re.sub(r"[\n*|.,:;?!#()\-]", " ", str(a)) for a in articles]


In [12]:
# The reported size is the same as after the URL step: words_num() adds up len() of
# each string (characters), and the cleanup above swaps every matched character for
# exactly one space.
set_reduction(documents, articles)

Zbiór zmniejszył się o: 2992233 słów. Wynosi: 0.92298 początkowego 


In [13]:
# For articles without an author's signature: drop the tail of the article (creation
# date, comments and self-promotional text), i.e. everything from "data utworzenia"
# or "Komentarze" up to the next newline. Newlines were already replaced with spaces,
# so in practice this removes the rest of the document.
# The result is stored separately in articles2 so the effect can be measured first.

articles2 = [re.sub("data utworzenia.*|Komentarze.*", "", str(a)) for a in articles]

In [14]:
set_reduction(documents, articles2)

Zbiór zmniejszył się o: 5653975 słów. Wynosi: 0.85446 początkowego 


In [15]:
# Collect, for every article that contains one, the trailing fragment that starts at
# "data utworzenia" or "Komentarze", so the removed text can be inspected.
trailer_re = re.compile("(data utworzenia.*|Komentarze.*)")
matches = (trailer_re.search(str(text)) for text in articles)
reduction = [match.group(1) for match in matches if match]

In [16]:
# Remove from each article everything from "data utworzenia" or "Komentarze" up to the next newline.
articles = [re.sub("data utworzenia.*|Komentarze.*", "", str(a)) for a in articles]

## Stemming

In [17]:
# Stop words are read from a local text file (header=None: the first line is data,
# not a column name). Presumably one Polish stop word per line - verify against the file.
stopwords = pd.read_csv("stopwords-pl.txt", header=None)

In [18]:
# Reduce the loaded frame to its first column and convert it to a plain Python list.
stopwords = stopwords.iloc[:, 0].tolist()


In [19]:
parser = ListParser()
stemmer = Morfologik()
# NOTE(review): nltk.word_tokenize needs NLTK tokenizer data to be installed
# (nltk.download()) - confirm it is available in the target environment.

# Set membership is a hash lookup; testing every token against the stop-word list
# scanned the whole list for each token of each article.
stopword_set = set(stopwords)

# Lower-case and strip digit runs in one pass, then tokenize and drop stop words.
articles = [re.sub(r"\d+", "", a.lower()) for a in articles]
articles = [nltk.word_tokenize(a, language='polish') for a in articles]
articles = [[w for w in a if w not in stopword_set] for a in articles]




In [20]:
from collections import Counter
def counting(set):
    """Count how often each distinct item occurs and return the result as a table.

    Parameters
    ----------
    set : iterable of hashable items. It is consumed exactly once, so one-shot
        iterators and generators are supported as well as lists.

    Returns
    -------
    pandas.DataFrame
        One row per distinct item: column 0 holds the item, column 1 its count.
        Rows are ordered by count, descending; items with equal counts keep the
        order of their first occurrence. Empty input gives an empty frame.
    """
    # most_common() already yields (item, count) pairs sorted by descending count
    # with a stable sort, so no manual dict rebuild or second pass is needed.
    return pd.DataFrame(Counter(set).most_common())


In [21]:
import pickle

# Load the previously stemmed articles. The `with` block closes the file handle as
# soon as the object has been read.
# NOTE: pickle.load can execute arbitrary code - only load files you produced yourself.
with open('stemmed2.obj', 'rb') as file_temp:
    articles_stemmed = pickle.load(file_temp)

In [22]:
# Build one flat list of unambiguous words for the whole corpus.
# Each entry of an article is indexed as entry[0] and entry[1]; presumably these are
# the original token and its candidate base forms - verify against the notebook
# that produced stemmed2.obj.
unamb_corpora = []
for article in articles_stemmed:
    unamb = []
    for entry in article:
        original, candidates = entry[0], entry[1]
        if len(candidates) == 1:
            # Exactly one candidate: take it.
            unamb += candidates
        elif len(candidates) == 0:
            # No candidate: fall back to entry[0]. A string must be appended as a
            # whole, because `list += str` would add it character by character.
            if isinstance(original, str):
                unamb.append(original)
            else:
                unamb += original
        # Entries with more than one candidate are ambiguous and are skipped.
    unamb_corpora += unamb  # full match for all corpora

## Dividing the set into ambiguous and unambiguous words

## Joining and preparing data for the model

In [23]:
#reading parts of stemmed_arts from other jupyter notebooks

def _load_pickle(path):
    """Unpickle and return the object stored at ``path``, closing the file afterwards."""
    # NOTE: pickle.load can execute arbitrary code - only load trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)

st_a1 = _load_pickle('st_a1.obj')
st_a2 = _load_pickle('st_a2.obj')
st_a3 = _load_pickle('st_a3.obj')
st_a4 = _load_pickle('st_a4.obj')
st_a5 = _load_pickle('st_a5.obj')


In [24]:
# Concatenate the five partial results, in order, into a single corpus.
stemmed_arts = st_a1 + st_a2 + st_a3 + st_a4 + st_a5

In [25]:
len(stemmed_arts)

7900

In [26]:
# The vectorizers need one plain string per document, whereas stemmed_arts holds a
# list per article: join the items of each list with single spaces.
docs = [" ".join(map(str, tokens)) for tokens in stemmed_arts]
len(docs)

7900

## Setting up the models

In [27]:
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD, NMF
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer


# Two vectorizations are prepared: tf-idf weighting and plain term counts.
# Working assumption: tf-idf may work better for articles with narrow topics.
# Both use the same settings (max_df=0.95, min_df=2) and the same stop-word list.
# NOTE(review): get_feature_names() is not available in recent scikit-learn releases
# (get_feature_names_out() replaces it) - confirm against the installed version.
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words=stopwords)
tfidf = tfidf_vectorizer.fit_transform(docs)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# Plain term counts.
cv_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words=stopwords)
cv = cv_vectorizer.fit_transform(docs)
cv_feature_names = cv_vectorizer.get_feature_names()


In [28]:
no_topics = 15
# Fixed seed: online LDA and the randomized SVD solver are both stochastic, so
# without it the topics change on every run.
RANDOM_STATE = 42

# tf-idf document-term matrix used by both models below.
vectorizer = TfidfVectorizer(min_df=5, max_df=0.9, stop_words=stopwords)
data = vectorizer.fit_transform(docs)

# LDA model
lda_model = LatentDirichletAllocation(
    n_components=no_topics, max_iter=10, learning_method='online',
    random_state=RANDOM_STATE)
lda_tfidf = lda_model.fit_transform(data)

# LSI model
lsi_model = TruncatedSVD(n_components=no_topics, random_state=RANDOM_STATE)
lsi_tfidf = lsi_model.fit_transform(data)


In [29]:
print(lda_tfidf[0])
print(lsi_tfidf[0])

[ 0.00592673  0.00592673  0.00592673  0.00592673  0.00592673  0.00592673
  0.91702574  0.00592673  0.00592673  0.00592673  0.00592673  0.00592673
  0.00592673  0.00592673  0.00592673]
[ 0.0421758   0.03043607 -0.00292599  0.05051659  0.03408432  0.24304825
 -0.14179037 -0.17725029 -0.01706335 -0.01934152 -0.03210418 -0.11124058
 -0.05168739  0.32430181  0.04203248]


In [30]:
def display_topics(model, vectorizer, no_top_words=10):
    """Print the highest-weighted terms of every topic of a fitted model.

    Parameters
    ----------
    model : object with a ``components_`` attribute; each row is treated as one
        topic and must support ``argsort()``.
    vectorizer : object providing ``get_feature_names_out()`` or
        ``get_feature_names()``; the returned names are indexed by term position.
    no_top_words : int, default 10
        Number of terms printed per topic.

    Returns
    -------
    None. For each topic a "Topic <index>:" line is printed, followed by the list
    of its top terms in order of decreasing weight.
    """
    # Prefer the newer accessor when the vectorizer has it, otherwise use the old one.
    get_names = getattr(vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    # Fetch the vocabulary once; calling the accessor for every printed term
    # rebuilt the full name list each time.
    feature_names = [str(name) for name in get_names()]

    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        # argsort is ascending, so walk it backwards to get the largest weights first.
        top_indices = topic.argsort()[:-no_top_words - 1:-1]
        print([feature_names[i] for i in top_indices])

# Number of terms printed per topic; passed explicitly so that changing it here
# actually changes the output.
no_top_words = 10
print("LDA z tf-idf: ")
display_topics(lda_model, vectorizer, no_top_words)
print("--" *10)

print("LSI z tf-idf: ")
display_topics(lsi_model, vectorizer, no_top_words)

LDA z tf-idf: 
Topic 0:
['fila', 'artykułów', 'korekta', 'korygować', 'deklaracja', 'faktura', 'ranking', 'podatek', 'podatnik', 'termin']
Topic 1:
['ratalny', 'tygodniowo', 'godzina', 'praca', 'artykułów', 'kilometrówka', 'grzywna', 'czas', 'święto', 'skarbowy']
Topic 2:
['rekomendacja', 'wzorzec', 'składka', 'wkład', 'fax', 'bezterminowy', 'działalność', 'kredyt', 'ciągły', 'osoba']
Topic 3:
['kocioł', 'stalowy', 'aluminium', 'żeliwo', 'obrobić', 'folia', 'spiekać', 'żelazostop', 'staliwo', 'drążyć']
Topic 4:
['wyjaśnienie', 'upoważnienia', 'przeznaczenia', 'wykroczeniem', 'zakończony', 'wiążąca', 'określenia', 'księdze', 'przełożonego', 'zastrzeżenia']
Topic 5:
['korporacja', 'artykułów', 'konsument', 'skarga', 'sprzedawca', 'odstąpić', 'wierzyciel', 'sąd', 'podróże', 'złoty']
Topic 6:
['pity', 'artykułów', 'stawki', 'podróże', 'robót', 'ul', 'bazie', 'rozliczenia', 'podatek', 'pita']
Topic 7:
['przywóz', 'wywóz', 'zwolnić', 'złoty', 'składka', 'kwota', 'podatek', 'brutto', 'najemca

In [31]:
no_topics = 15
# Fixed seed: online LDA and the randomized SVD solver are both stochastic, so
# without it the topics change on every run.
RANDOM_STATE = 42

# Term-count document-term matrix used by both models below.
cv_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words=stopwords)
data2 = cv_vectorizer.fit_transform(docs)

# LDA model
lda_model2 = LatentDirichletAllocation(
    n_components=no_topics, max_iter=10, learning_method='online',
    random_state=RANDOM_STATE)
lda_cv = lda_model2.fit_transform(data2)

# LSI model
lsi_model2 = TruncatedSVD(n_components=no_topics, random_state=RANDOM_STATE)
lsi_cv = lsi_model2.fit_transform(data2)


In [32]:
# Number of terms printed per topic; passed explicitly so that changing it here
# actually changes the output.
no_top_words = 10
print("LDA z Count Vectorizer: ")
display_topics(lda_model2, cv_vectorizer, no_top_words)
print("--" *10)

print("LSI z Count Vectorizer: ")
display_topics(lsi_model2, cv_vectorizer, no_top_words)

LDA z Count Vectorizer: 
Topic 0:
['pity', 'dacie', 'wynagrodzenia', 'zdaniem', 'akt', 'rozliczenia', 'usługi', 'muszą', 'wcześniej', 'proc']
Topic 1:
['dacie', 'składka', 'działalność', 'pow', 'przedsiębiorca', 'koszty', 'akt', 'odsetki', 'zatrudnionej', 'gospodarczy']
Topic 2:
['pity', 'dacie', 'nowego', 'rozliczenia', 'ramach', 'akt', 'dokonanego', 'warunki', 'stanowi', 'określonych']
Topic 3:
['pity', 'dacie', 'polskich', 'rozliczenia', 'jednego', 'polskiego', 'kwietnia', 'przychodów', 'zatrudnionym', 'proc']
Topic 4:
['artykułów', 'podróże', 'dzień', 'pity', 'podatkowy', 'rozliczenie', 'mniej', 'doświadczenie', 'gazeta', 'prowadzący']
Topic 5:
['dacie', 'podatek', 'ustawa', 'artykuł', 'sprzedaż', 'usta', 'towar', 'nieruchomość', 'interpretacja', 'wartość']
Topic 6:
['pity', 'plac', 'podatkowy', 'doświadczenie', 'informacja', 'skarbowy', 'wynagrodzenia', 'rozliczenia', 'kontrola', 'numer']
Topic 7:
['stawki', 'artykułów', 'pity', 'podróże', 'dacie', 'poglądów', 'odliczenia', 'opoda

In [33]:
# Topic weights of the tf-idf LDA model for the documents in `data`; indexed by document position below.
doc_topic = lda_model.transform(data)


In [34]:
# For the first ten documents, print the index of the highest-weighted topic
# followed by the value in the first column of the scraped frame for that row.
for doc_idx in range(10):
    dominant_topic = doc_topic[doc_idx].argmax()
    print("doc: {} topic: {}".format(doc_idx, dominant_topic))
    print(scraping.iloc[doc_idx, 0])


doc: 0 topic: 6


# Małżeństwo i dzieci


doc: 1 topic: 6


# PIT-36 działalność gospodarcza


doc: 2 topic: 6


# Kalkulator płacowy


doc: 3 topic: 6


# Kalkulator stawek netto i brutto z faktur VAT


doc: 4 topic: 6


# Kalkulator składek ZUS


doc: 5 topic: 6


# Kalkulator kilometrówki


doc: 6 topic: 6


# Kalkulator wynagrodzeń netto i brutto


doc: 7 topic: 6


# Złożenie zeznania


doc: 8 topic: 6


# Adresy administracji skarbowej


doc: 9 topic: 6


# Adresy administracji skarbowej




## Data visualization

In [35]:
from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, LabelSet
output_notebook()

In [37]:
# Reduce the transposed count matrix (one row per vocabulary term) to two dimensions.
svd = TruncatedSVD(n_components=2)
words_2 = svd.fit_transform(data2.T)

# One row per term: its 2-D coordinates and the term itself.
df = pd.DataFrame(
    {
        'x': words_2[:, 0],
        'y': words_2[:, 1],
        'word': cv_vectorizer.get_feature_names(),
    },
    columns=['x', 'y', 'word'],
)
source = ColumnDataSource(ColumnDataSource.from_df(df))

# Scatter plot of the terms, each labelled with its text slightly above the point.
plot = figure(plot_width=600, plot_height=600)
plot.circle("x", "y", size=3, source=source)
labels = LabelSet(x='x', y='y', text='word', y_offset=8, source=source, text_color='red', text_align='center')
plot.add_layout(labels)
show(plot, notebook_handle=True)

## LDA visualization

In [39]:
# Interactive pyLDAvis panel for the tf-idf LDA model, with mds='tsne' selected
# for the topic layout.
# NOTE(review): newer pyLDAvis releases appear to expose this module under a different
# name (pyLDAvis.lda_model) - confirm against the installed version.
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model, data, vectorizer, mds='tsne')
panel

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]


In [49]:
import random

# Draw a random sample of documents with a private, seeded generator, so the same
# documents are shown on every run and the global random state is left untouched.
SAMPLE_SEED = 42
SAMPLE_SIZE = 10
doc_count = doc_topic.shape[0]
# Never ask for more documents than exist (random.sample would raise ValueError).
p = random.Random(SAMPLE_SEED).sample(range(doc_count), min(SAMPLE_SIZE, doc_count))

# For each sampled document, print the index of its highest-weighted topic followed
# by the value in the first column of the scraped frame for that row.
for n in p:
    topic_most_pr = doc_topic[n].argmax()
    print("doc: {} topic: {}".format(n,topic_most_pr))
    print(scraping.iloc[n,0])


doc: 2811 topic: 6


# Małżeństwo i dzieci


doc: 2913 topic: 6


# PITy roczne 2007


doc: 6653 topic: 6


# Prasa


doc: 3616 topic: 6


# Nie chcesz płacić podatku - załóż hodowlę akwariową


doc: 1973 topic: 6


# PIT-y roczne 2012


doc: 517 topic: 6


# Prasa


doc: 6281 topic: 6


# Równoległa praca i nauka w stażu urlopowym


doc: 7150 topic: 6


# Prasa


doc: 3996 topic: 6


# VAT


doc: 2083 topic: 6


# PITy roczne 2010




## Conclusions from the stemming method