In [None]:
import nltk
from bs4 import BeautifulSoup

## Branje podatkov

In [None]:
def read_from_file(path="../data/novice.txt"):
    """Read news articles from a blank-line-separated text file.

    Each record consists of consecutive non-empty lines: the first line is
    the title, the second the body, and every further line is one comment.
    An empty line terminates the current record.

    Args:
        path: Location of the UTF-8 encoded input file. Defaults to the
            corpus path that used to be hard-coded here.

    Returns:
        A list of ``(title, body, comments)`` tuples, where ``comments`` is a
        list of strings.
    """
    data = []
    title = ""
    body = ""
    comments = []
    # The context manager closes the file even if reading raises.
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.replace("\n", "")
            if line == "":
                # Blank line: the current record is complete.
                data.append((title, body, comments))
                title = ""
                body = ""
                comments = []
            elif title == "":
                title = line
            elif body == "":
                body = line
            else:
                comments.append(line)
    # A file that does not end with a blank line would otherwise silently
    # lose its final article.
    if title != "":
        data.append((title, body, comments))
    return data

In [None]:
# Load every (title, body, comments) record from the corpus file.
news = read_from_file()

## Lematizacija

Klic programa za lematizacijo. Lematiziramo naslove, telo novic in komentarje. Celica izpiše izhodno kodo zadnjega klica programa:

0: uspešno

1: neuspešno

In [None]:
import os
# Write every title, body and comment as its own paragraph (separated by a
# blank line) into the tagger's input files. The context managers guarantee
# that the files are flushed and closed before the tagger reads them, even if
# one of the writes raises.
with open("../ObeliksLatest/in_title.txt", "w", encoding="utf-8") as f_title, \
        open("../ObeliksLatest/in_body.txt", "w", encoding="utf-8") as f_body, \
        open("../ObeliksLatest/in_comment.txt", "w", encoding="utf-8") as f_comment:
    for n in news:
        f_title.write(n[0] + "\n\n")
        f_body.write(n[1] + "\n\n")
        for c in n[2]:
            f_comment.write(c + "\n\n")
        # Marks the end of one article's comments so that they can be
        # regrouped per article when the tagger output is read back.
        f_comment.write("DELIMITER\n\n")

# NOTE: this changes the kernel's working directory for all later cells.
os.chdir("../ObeliksLatest")
# Only the exit status of the last call is shown as the cell output.
os.system("PosTaggerTag -lem:LemmatizerModel.bin -v -o -t in_title.txt TaggerModel.bin out_title.xml")
os.system("PosTaggerTag -lem:LemmatizerModel.bin -v -o -t in_body.txt TaggerModel.bin out_body.xml")
os.system("PosTaggerTag -lem:LemmatizerModel.bin -v -o -t in_comment.txt TaggerModel.bin out_comment.xml")

Branje lematiziranih besed.

In [None]:
def read_lemmatized(path):
    """Read lemmatized text and return one lemma string per paragraph.

    Args:
        path: Path to a UTF-8 encoded XML file made of ``<p>`` paragraphs
            that contain ``<w>`` word elements with a ``lemma`` attribute.

    Returns:
        A list with one string per ``<p>`` element, in document order. Each
        string holds the lemmas of that paragraph's ``<w>`` elements joined
        by single spaces.

    Raises:
        KeyError: If a ``<w>`` element has no ``lemma`` attribute.
    """
    # Read through a context manager so the file handle is closed right away
    # instead of being left for the garbage collector.
    with open(path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f.read(), "lxml")
    return [
        " ".join(str(word["lemma"]) for word in paragraph.find_all("w"))
        for paragraph in soup.find_all("p")
    ]

def read_lemmatized_comments(path):
    """Read lemmatized comments and group them per article.

    Every ``<p>`` element is one comment. A paragraph containing a word whose
    lemma is ``delimiter`` (case-insensitive) closes the current group and is
    itself not stored as a comment.

    Args:
        path: Path to a UTF-8 encoded XML file made of ``<p>`` paragraphs
            that contain ``<w>`` word elements with a ``lemma`` attribute.

    Returns:
        A list of groups, one per delimiter encountered. Each group is a list
        of comment strings; a comment string is its lemmas, each followed by
        a single space. Comments after the last delimiter are not returned.

    Raises:
        KeyError: If a ``<w>`` element has no ``lemma`` attribute.
    """
    # Read through a context manager so the file handle is closed right away
    # instead of being left for the garbage collector.
    with open(path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f.read(), "lxml")
    result = []
    current_group = []
    for paragraph in soup.find_all("p"):
        lem = ""
        is_delimiter = False
        for word in paragraph.find_all("w"):
            if word["lemma"].lower() == "delimiter":
                # End of one article's comments: store the group, start anew.
                result.append(current_group)
                current_group = []
                is_delimiter = True
            else:
                lem += word["lemma"] + " "
        if not is_delimiter:
            current_group.append(lem)
    return result

# Parse the three lemmatized XML files back into Python lists.
titles_lemmatized = read_lemmatized("../ObeliksLatest/out_title.xml")
body_lemmatized = read_lemmatized("../ObeliksLatest/out_body.xml")
comments_lemmatized = read_lemmatized_comments("../ObeliksLatest/out_comment.xml")

# Show the number of entries in each list, one count per line.
print(
    len(titles_lemmatized),
    len(body_lemmatized),
    len(comments_lemmatized),
    sep="\n",
)

## Clustering

In [None]:
import re
def tokenize(text):
    """Split text into lower-cased word tokens that contain a letter.

    Args:
        text: Input string.

    Returns:
        A list of lower-cased tokens in order of appearance. Tokens without
        any letter (e.g. numbers, raw punctuation) are left out.
    """
    # First tokenize by sentence, then by word, so that punctuation is caught
    # as its own token.
    tokens = [
        word.lower()
        for sent in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sent)
    ]
    # str.isalpha() is Unicode-aware; the former ASCII-only pattern [a-zA-Z]
    # rejected tokens made up solely of letters such as "č", "š" or "ž".
    return [token for token in tokens if any(ch.isalpha() for ch in token)]

In [None]:
def character_ngram(text, n=3):
    """Split text into overlapping character n-grams.

    The text is lower-cased and the characters ``.``, ``,``, ``?`` and ``!``
    are removed first. Spaces are kept, so n-grams may span word boundaries.

    Args:
        text: Input string.
        n: Length of each n-gram. Defaults to 3, the value that used to be
            hard-coded.

    Returns:
        A list of every length-``n`` substring of the cleaned text, in order.
        The list is empty when the cleaned text is shorter than ``n``.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")
    text = text.lower().replace(".", "").replace(",", "").replace("?", "").replace("!", "")
    return [text[i:i + n] for i in range(len(text) - n + 1)]

In [None]:
# Short sample text used to inspect the tokenizer output by eye.
text = "Janez si umiva zobe. Potem pa bo šel gledat risanke. Mislim, da je na sporedu čarobni Bakugan iz vesolja."
# Alternative: uncomment to inspect the word tokenizer instead.
#tokenize(text)
# Last expression of the cell, so the list of character trigrams is displayed.
character_ngram(text)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(
                        max_df=0.8, 
                        max_features=200000,
                        min_df=0.2, 
                        stop_words='english', 
                        use_idf=True, 
                        tokenizer=character_ngram, 
                        ngram_range=(1,3))

# Fit the vectorizer to synopses texts
news_titles = [title for title,_,_ in news]
#for news in lemmatized:
#    news_text.append(" ".join([str(x) for x in news]))

%time tfidf_matrix = tfidf_vectorizer.fit_transform(body_lemmatized) 

print("TF-IDF matrix shape: {}".format(tfidf_matrix.shape))
print(tfidf_matrix)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine distance between the rows of the TF-IDF matrix
# (1 - cosine similarity).
dist = 1 - cosine_similarity(tfidf_matrix)
# Last expression of the cell, so the distance matrix is displayed.
dist

In [None]:
from sklearn.cluster import KMeans

num_clusters = 3
km = KMeans(n_clusters=num_clusters)

# Perform clustering
%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()
print(news_titles)
print("Clusters: {}".format(clusters))

In [None]:
from scipy.cluster.hierarchy import ward, dendrogram
from scipy.spatial.distance import squareform
import matplotlib.pyplot as plt

# Define the linkage matrix using Ward clustering on the pre-computed distances.
# ward() interprets a 2-D array as raw observation vectors, so the square
# distance matrix has to be converted to condensed form first. clip() removes
# tiny negative values caused by floating-point error and checks=False
# tolerates a diagonal that is not exactly zero.
# NOTE: Ward linkage formally assumes Euclidean distances.
linkage_matrix = ward(squareform(dist.clip(min=0), checks=False))

fig, ax = plt.subplots(figsize=(15, 20)) # set size
# Draw onto the axes created above; the trailing ";" hides the returned dict.
dendrogram(linkage_matrix, orientation="right", labels=news_titles, ax=ax);

# Hide the x-axis ticks and tick labels. Booleans are required here: the
# strings 'off' used previously are not treated as False by current matplotlib.
ax.tick_params(
    axis='x',            # changes apply to the x-axis
    which='both',        # both major and minor ticks are affected
    bottom=False,        # ticks along the bottom edge are off
    top=False,           # ticks along the top edge are off
    labelbottom=False)   # labels along the bottom edge are off

plt.tight_layout() #show plot with tight layout

# To save the figure, uncomment the savefig line and move it above plt.show().
plt.show()
#plt.savefig('ward_clusters.png', dpi=200) #save figure as ward_clusters
plt.close()