# Pierwsze modele

In [13]:
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer
from sklearn.metrics.cluster import homogeneity_score, completeness_score, v_measure_score
from get_nice_text import *

import pandas as pd
import re
import numpy as np

from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering, DBSCAN, MiniBatchKMeans, MeanShift, SpectralClustering
from sklearn.mixture import GaussianMixture

In [2]:
def train_fast(df, Model, labels, **kwargs):
    """
    Trenuje Model na sparse matrix df
    @example
    train fast(df, KMeans, n_clusters=8)
    """
    trans = TfidfTransformer()
    df_trans = trans.fit_transform(df)
    model = Model(**kwargs)
    lab = model.fit_predict(df_trans)
    
    ret = {}
    
    ret["homogeneity_score"] = homogeneity_score(labels, lab)
    ret["completeness_score"] = completeness_score(labels, lab)
    ret["v_measure_score"] = v_measure_score(labels, lab)
    
    return lab, ret

In [3]:
df = get_nice_text()
labels = get_labels(True)
np.random.seed(123)

Naiwne podejście - sam wordbag dajmy do pogrupowania, Biblię mergujemy

In [37]:
cv = CountVectorizer()
df_count = cv.fit_transform(df)

model = KMeans(n_clusters=5, random_state=123)

lab = model.fit_predict(df_count)
ret = {}
    
ret["homogeneity_score"] = homogeneity_score(labels, lab)
ret["completeness_score"] = completeness_score(labels, lab)
ret["v_measure_score"] = v_measure_score(labels, lab)
ret

{'homogeneity_score': 0.3320610459866629,
 'completeness_score': 0.4986041642194423,
 'v_measure_score': 0.39863718443904317}

Bardzo słabo. Dodajmy TfidfTransformer.

In [38]:
cv = CountVectorizer()
df_count = cv.fit_transform(df)

lab, scores = train_fast(df_count, KMeans, get_labels(True), n_clusters=5, random_state = 123)
scores

{'homogeneity_score': 0.5504130721916395,
 'completeness_score': 0.5575716186373115,
 'v_measure_score': 0.5539692201909778}

Lepiej, przekroczyliśmy losowość 0.5. Weżmy pod uwagę stopwords

In [41]:
cv = CountVectorizer(stop_words = 'english')
df_count = cv.fit_transform(df)

lab, scores = train_fast(df_count, KMeans, get_labels(True), n_clusters=5, random_state = 123)
scores

{'homogeneity_score': 0.5499682967296581,
 'completeness_score': 0.6450909053251002,
 'v_measure_score': 0.5937438845329811}

Zobaczmy jeszcze czy mergowanie Bibli coś daje

In [28]:
cv = CountVectorizer(stop_words = 'english')
df_count = cv.fit_transform(df)

lab, scores = train_fast(df_count, KMeans, get_labels(False), n_clusters=8, random_state = 123)
scores

{'homogeneity_score': 0.5073841435249417,
 'completeness_score': 0.4465988909388149,
 'v_measure_score': 0.475054980208431}

Jak widać tak

# Entities
Jako vocabulary dodajmy tylko enitities

In [71]:
def get_entities(data):

    entities_list = []
    
    for i in range(len(data)):
        
        chapter = str(data[i])
        doc = nlp(chapter) 

        for ent in doc.ents: 
            entities_list.append(ent.text)
    
    return entities_list

In [87]:
import spacy 
nlp = spacy.load('en_core_web_sm') 
ent = get_entities(df)

In [96]:
from collections import defaultdict 
  
LABELS_DICT = defaultdict(set) 

for word in df:
    doc = nlp(str(word))

    for e in doc.ents:
        LABELS_DICT[e.label_].add(e.text)

In [110]:
LABELS_DICT.keys()
all_nlp = []
for key in LABELS_DICT.keys():
    all_nlp += list(LABELS_DICT[key])
    
#remove duplicates
all_nlp = list(set(all_nlp))

In [111]:
cv = CountVectorizer(vocabulary=all_nlp)
df_count = cv.fit_transform(df)

model = KMeans(n_clusters=5, random_state=123)

lab = model.fit_predict(df_count)
ret = {}
    
ret["homogeneity_score"] = homogeneity_score(labels, lab)
ret["completeness_score"] = completeness_score(labels, lab)
ret["v_measure_score"] = v_measure_score(labels, lab)
ret

{'homogeneity_score': 0.11897558584368266,
 'completeness_score': 0.22483275054993732,
 'v_measure_score': 0.15560767661491615}

Słabo, dodajmy stop_words

In [112]:
cv = CountVectorizer(vocabulary=all_nlp, stop_words='english')
df_count = cv.fit_transform(df)

model = KMeans(n_clusters=5, random_state=123)

lab = model.fit_predict(df_count)
ret = {}
    
ret["homogeneity_score"] = homogeneity_score(labels, lab)
ret["completeness_score"] = completeness_score(labels, lab)
ret["v_measure_score"] = v_measure_score(labels, lab)
ret

{'homogeneity_score': 0.14612903898906285,
 'completeness_score': 0.39871032617470714,
 'v_measure_score': 0.21387278718898287}

Niewiele lepiej

## Inne modele
### AgglomerativeClustering
z argumentem linkage{“ward”, “complete”, “average”, “single”}

In [6]:
cv = CountVectorizer(stop_words = 'english')
df_count = cv.fit_transform(df)

In [8]:
def train_fast(df, Model, labels, **kwargs):
    """
    Trenuje Model na sparse matrix df
    @example
    train fast(df, KMeans, n_clusters=8)
    """
    trans = TfidfTransformer()
    df_trans = trans.fit_transform(df)
    model = Model(**kwargs)
    lab = model.fit_predict(df_trans.toarray())
    
    ret = {}
    
    ret["homogeneity_score"] = homogeneity_score(labels, lab)
    ret["completeness_score"] = completeness_score(labels, lab)
    ret["v_measure_score"] = v_measure_score(labels, lab)
    
    return lab, ret

In [12]:
cv = CountVectorizer(stop_words = 'english')
df_count = cv.fit_transform(df)

lab, scores = train_fast(df_count, AgglomerativeClustering, get_labels(True), n_clusters=5)
scores

{'homogeneity_score': 0.572923993295591,
 'completeness_score': 0.6047451991605944,
 'v_measure_score': 0.5884046838430234}

In [11]:
lab, scores = train_fast(df_count, AgglomerativeClustering, get_labels(True), n_clusters=5, linkage =  "complete")
scores

{'homogeneity_score': 0.3970246607100051,
 'completeness_score': 0.4687726141023961,
 'v_measure_score': 0.42992578858479946}

In [10]:
lab, scores = train_fast(df_count, AgglomerativeClustering, get_labels(True), n_clusters=5, linkage =  "average")
scores

{'homogeneity_score': 0.20293471924776588,
 'completeness_score': 0.6412603791279626,
 'v_measure_score': 0.3083031286332579}

In [9]:
lab, scores = train_fast(df_count, AgglomerativeClustering, get_labels(True), n_clusters=5, linkage =  "single")
scores

{'homogeneity_score': 0.007543664124885699,
 'completeness_score': 0.22724173007689247,
 'v_measure_score': 0.014602571788471418}

Coraz gorzej

### DBSCAN

In [51]:
lab, scores = train_fast(df_count, DBSCAN, get_labels(True), eps = 0.4, metric = 'manhattan')
scores

{'homogeneity_score': -4.2374126293898904e-16,
 'completeness_score': 1.0,
 'v_measure_score': -8.474825258779785e-16}

In [48]:
lab

array([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1

Cluster labels for each point in the dataset given to fit(). Noisy samples are given the label -1.

## no dobra, a jak będziemy grupować po 8?
### AgglomerativeClustering

In [33]:
cv = CountVectorizer(stop_words = 'english')
df_count = cv.fit_transform(df)

lab, scores = train_fast(df_count, AgglomerativeClustering, get_labels(False), n_clusters=8)
scores

{'homogeneity_score': 0.5594447643120913,
 'completeness_score': 0.5330019854326977,
 'v_measure_score': 0.5459033498665863}

In [34]:
lab, scores = train_fast(df_count, AgglomerativeClustering, get_labels(False), n_clusters=8, linkage =  "complete")
scores

{'homogeneity_score': 0.40259838528734326,
 'completeness_score': 0.35210201942854397,
 'v_measure_score': 0.37566086778954255}

In [35]:
lab, scores = train_fast(df_count, AgglomerativeClustering, get_labels(False), n_clusters=8, linkage =  "average")
scores

{'homogeneity_score': 0.2370086420254117,
 'completeness_score': 0.5045610676796211,
 'v_measure_score': 0.32251946622039124}

In [36]:
lab, scores = train_fast(df_count, AgglomerativeClustering, get_labels(False), n_clusters=8, linkage =  "single")
scores

{'homogeneity_score': 0.02146117023707067,
 'completeness_score': 0.4280269739509705,
 'v_measure_score': 0.040872979066504986}

Still bad, ale minimalnie lepiej niż po 5

### GaussianMixture

In [15]:
lab, scores = train_fast(df_count, GaussianMixture, get_labels(False), n_components=8)
scores

KeyboardInterrupt: 

Niestety nie zbiega

### MiniBatchKMeans

In [41]:
lab, scores = train_fast(df_count, MiniBatchKMeans, get_labels(False), n_clusters=8)
scores

{'homogeneity_score': 0.5986593612799581,
 'completeness_score': 0.5945854116019115,
 'v_measure_score': 0.5966154318468916}

In [37]:
lab, scores = train_fast(df_count, MiniBatchKMeans, get_labels(True), n_clusters=5)
scores

{'homogeneity_score': 0.5265192377286467,
 'completeness_score': 0.5382273166930658,
 'v_measure_score': 0.5323089055007681}

## Część 2 - Eksperymenty

In [1]:
from collections import Counter
from textblob import Word

from get_nice_text import *

In [2]:
data = get_nice_text()

Example: 

In [3]:
import spacy 
  
nlp = spacy.load('en_core_web_sm') 
  
sentence = str(data[0])
  
doc = nlp(sentence) 
  
for ent in doc.ents: 
    print(ent.text, ent.label_) 

Buddha PERSON
Rahula PERSON
The Buddha Rahula GPE
Rahula The Buddha PERSON
Rahula GPE
The Buddha WORK_OF_ART
Rahula PERSON
Rahula PERSON
Gratified PERSON
Rahula PERSON


Funkcja do szybkiego wyciągnięcia entities

In [4]:
def get_entities(data):

    def unique(arr): 
    
        un = []

        for elem in arr: 
            if  elem not in un : 
                un.append(elem)

        return un

    entities_list = []
    for i in range(len(data)):
        chapter = str(data[i])
        doc = nlp(chapter) 

        for ent in doc.ents: 
            extracted_ent = [ent.text, ent.label_]
            entities_list.append(extracted_ent)
    
    out = unique(entities_list)
    
    return out
        

In [5]:
gp = get_entities(data)

In [6]:
gp[:40]

[['Buddha', 'PERSON'],
 ['Rahula', 'PERSON'],
 ['The Buddha Rahula', 'GPE'],
 ['Rahula The Buddha', 'PERSON'],
 ['Rahula', 'GPE'],
 ['The Buddha', 'WORK_OF_ART'],
 ['Gratified', 'PERSON'],
 ['Kosambi', 'GPE'],
 ['Simsapa', 'PERSON'],
 ['Birth', 'PERSON'],
 ['five', 'CARDINAL'],
 ['one', 'CARDINAL'],
 ['two', 'CARDINAL'],
 ['three', 'CARDINAL'],
 ['twelve', 'CARDINAL'],
 ['four', 'CARDINAL'],
 ['Unprovoked', 'GPE'],
 ['Savatthi', 'ORG'],
 ['Monks', 'NORP'],
 ['The Buddha  Clinging', 'WORK_OF_ART'],
 ['MahaKotthita', 'ORG'],
 ['MahaKotthita Sariputta', 'PRODUCT'],
 ['earth', 'LOC'],
 ['Sariputta', 'PRODUCT'],
 ['the internal water property', 'ORG'],
 ['phlegm', 'PERSON'],
 ['the external water property', 'ORG'],
 ['windy', 'PERSON'],
 ['Sister Dhammadinna', 'PERSON'],
 ['six', 'CARDINAL'],
 ['Consciousness   Consciousness', 'ORG'],
 ['Dhamma', 'ORG'],
 ['First', 'ORDINAL'],
 ['Release', 'PRODUCT'],
 ['Dispassion', 'ORG'],
 ['Consciousness    Fabrications    ', 'ORG'],
 ['One', 'CARDINAL'

### Lemmatization - converting to root word

In [7]:
str(data[300])[:500]

'ing   We have thought of ourselves  perhaps  as creatures moving upon this earth  rather helpless  at the mercy of storm and hunger and our enemies  We are to think of ourselves as immortals  dwelling in the Light  encompassed and sustained by spiritual powers  The steady effort to hold this thought will awaken dormant and unrealized powers  which will unveil to us the nearness of the Eternal   '

In [8]:
tmp = " ".join([Word(word).lemmatize() for word in str(data[300]).split()])
tmp[:500]

'ing We have thought of ourselves perhaps a creature moving upon this earth rather helpless at the mercy of storm and hunger and our enemy We are to think of ourselves a immortal dwelling in the Light encompassed and sustained by spiritual power The steady effort to hold this thought will awaken dormant and unrealized power which will unveil to u the nearness of the Eternal'

### Stemming - getting "base" of word

In [9]:
str(data[500])[:500]

' set before thy face      And put a knife to thy throat  if it be so that thou have thy soul in thy own power      Be not desirous of his meats  in which is the bread of deceit      Labour not to be rich  but set bounds to thy prudence      Lift not up thy eyes to riches which thou canst not have  because they shall make themselves wings like those of an eagle  and shall fly towards heaven      Eat not with an envious man  and desire not his meats      Because  like a soothsayer  and diviner  he'

In [11]:
tmp = " ".join([Word(word).stem() for word in str(data[500]).split()])
tmp[:499]

'set befor thi face and put a knife to thi throat if it be so that thou have thi soul in thi own power Be not desir of hi meat in which is the bread of deceit labour not to be rich but set bound to thi prudenc lift not up thi eye to rich which thou canst not have becaus they shall make themselv wing like those of an eagl and shall fli toward heaven eat not with an enviou man and desir not hi meat becaus like a soothsay and divin he thinketh that which he knoweth not eat and drink will he say to '

In [12]:
tmp = [Word(word).stem() for word in str(data[500]).split()]
tmp[:10]

['set', 'befor', 'thi', 'face', 'and', 'put', 'a', 'knife', 'to', 'thi']

In [13]:
base = str(data[500]).split()
stemmed = tmp
base[:10], stemmed[:10]

(['set', 'before', 'thy', 'face', 'And', 'put', 'a', 'knife', 'to', 'thy'],
 ['set', 'befor', 'thi', 'face', 'and', 'put', 'a', 'knife', 'to', 'thi'])

In [14]:
def get_ending(base, stemmed): 
    """
    returns list of endings, so it might be different length from original vectors (when there is no ending)
    """
    
    
    out = []
    
    for i in range(len(base)): 
        st = stemmed[i]
        bs = base[i]
        diff = len(bs) - len(st)
        
        if diff > 0 : 
            out.append(base[i][-diff:])
    
    return(out)

In [15]:
get_ending(base, stemmed)[:10]

['e', 'ous', 's', 's', 's', 'e', 's', 'es', 'e', 'es']

In [16]:
def count_endings(data):
    n = len(data)
    
    dicts = [0 for i in range(n)]
    for i in range(n):
        chapter = data[i]
        stemmed = [Word(word).stem() for word in str(chapter).split()]
        base = str(chapter).split()
        
        endings = get_ending(base, stemmed)
        
        # makes dictionary
        counted = Counter(endings)
        dicts[i] = counted
        
    return(dicts)

In [17]:
list_of_endings = count_endings(data)
list_of_endings[:5]

[Counter({'ion': 30,
          's': 51,
          'ed': 19,
          'er': 1,
          'lful': 5,
          'ful': 17,
          'ences': 8,
          'ely': 1,
          'e': 9,
          'ing': 7,
          'ng': 3,
          'eable': 1,
          'ly': 2,
          'es': 2,
          'ated': 1,
          'atives': 3}),
 Counter({'e': 11,
          'ed': 5,
          's': 20,
          'ing': 8,
          'es': 3,
          'ous': 3,
          'ment': 2,
          'ion': 8,
          'ation': 3}),
 Counter({'e': 16,
          's': 36,
          'ity': 4,
          'ion': 11,
          'ful': 8,
          'ng': 5,
          'ation': 4,
          'd': 2,
          'ting': 1,
          'ed': 5,
          'ing': 3,
          'ates': 1,
          'es': 2,
          'ered': 1,
          'ment': 1,
          'fulness': 1,
          'ative': 1,
          'ions': 1}),
 Counter({'e': 31,
          'ing': 9,
          'ment': 1,
          'ation': 4,
          's': 30,
          'ed': 11,
   

### mamy narzędzia, co z tym zrobić? Modele! 

In [19]:
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score
from get_nice_text import *
import pandas as pd
import re
import numpy as np
from sklearn.cluster import KMeans
from collections import Counter
from textblob import Word
import spacy 
nlp = spacy.load('en_core_web_sm') 
  

In [20]:
data = get_nice_text()

### Stemming 

In [21]:
stemmed = [0 for i in range(len(data))]
for i in range(len(data)):
    tmp = " ".join([Word(word).stem() for word in str(data[i]).split()])
    stemmed[i] = tmp

In [22]:
len(stemmed)

590

In [23]:
cv = CountVectorizer(stop_words = 'english')
df_count = cv.fit_transform(stemmed)

In [24]:
count_vect_df = pd.DataFrame(df_count.todense(), columns=cv.get_feature_names())

Przekształcając z macierzy rzadkiej otrzymujemy: 

In [26]:
count_vect_df

Unnamed: 0,aac,aaron,abandon,abas,abash,abat,abateth,aberr,abhor,abhorreth,...,yesterday,yield,yieldeth,yoga,yoke,young,yourselv,youth,zeal,zorobabel
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
585,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
586,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
587,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
588,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Co z zakończeniami? Użyjemy ich!

In [27]:
def get_ending(base, stemmed): 
    """
    returns list of endings, so it might be different length from original vectors (when there is no ending)
    """


    out = []

    for i in range(len(base)): 
        st = stemmed[i]
        bs = base[i]
        diff = len(bs) - len(st)

        if diff > 0 : 
            out.append(base[i][-diff:])

    return(out)

def count_endings(data):
    
    n = len(data)
    
    dicts = [0 for i in range(n)]
    for i in range(n):
        chapter = data[i]
        stemmed = [Word(word).stem() for word in str(chapter).split()]
        base = str(chapter).split()
        
        endings = get_ending(base, stemmed)
        
        # makes dictionary
        counted = Counter(endings)
        dicts[i] = counted
        
    return(dicts)

In [28]:
def endings(data):
    n = len(data)

    endings = [0 for i in range(n)] 

    for i in range(n): 
        chapter = data[i]
        stemmed = [Word(word).stem() for word in str(chapter).split()]
        base = str(chapter).split()

        ending = get_ending(base, stemmed)
        tmp = " ".join(ending)

        endings[i] = tmp
    
    return endings


In [29]:
endings = endings(data)

In [30]:
cv = CountVectorizer(stop_words='english')
endings_count = cv.fit_transform(endings)
count_vect_endings = pd.DataFrame(endings_count.todense(), columns=cv.get_feature_names())

In [31]:
whole_df = pd.concat([count_vect_df ,count_vect_endings], axis = 1) 
whole_df

Unnamed: 0,aac,aaron,abandon,abas,abash,abat,abateth,aberr,abhor,abhorreth,...,ous,ously,ousness,ped,ping,pings,red,ring,ted,ting
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,3,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
585,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
586,0,0,0,0,0,0,1,0,0,0,...,3,0,0,0,0,0,0,0,0,0
587,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
588,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [32]:
from scipy import sparse

Mając macierz rzadką możemy zrobić model

In [34]:
df = sparse.csr_matrix(whole_df)

In [35]:
trans = TfidfTransformer()
x = trans.fit_transform(df)

In [36]:
model = KMeans(n_clusters=8, random_state = 123)
lab = model.fit_predict(x)

In [37]:
lab

array([4, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 4, 4, 3, 4,
       4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 4, 3,
       3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 7,
       4, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 4, 0, 0, 0,
       0, 0, 0, 7, 7, 7, 7, 7, 7, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       6, 4, 4, 4, 1, 6, 7, 4, 4, 4, 1, 1, 4, 6, 4, 4, 6, 4, 4, 4, 6, 2,
       4, 4, 7, 7, 7, 2, 0, 0, 0, 4, 4, 1, 1, 0, 4, 0, 4, 0, 0, 0, 0, 7,
       0, 0, 4, 4, 4, 3, 4, 0, 0, 0, 0, 4, 4, 1, 0, 4, 4, 0, 4, 1, 1, 7,
       1, 7, 7, 0, 0, 4, 0, 4, 7, 0, 4, 0, 4, 1, 4, 4, 0, 0, 0, 4, 1, 4,
       1, 4, 4, 0, 0, 0, 0, 4, 0, 7, 7, 1, 1, 0, 0, 4, 2, 7, 7, 7, 7, 7,
       1, 4, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 7, 7,
       3, 1, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 2, 2, 2,

In [38]:
labels = get_labels(merge_Bible=False)

In [39]:
from sklearn.metrics.cluster import homogeneity_score, completeness_score, v_measure_score
print("homogeneity_score = {}".format(homogeneity_score(labels, lab)))
print("completeness_score = {}".format(completeness_score(labels, lab)))
print("v_measure_score = {}".format(v_measure_score(labels, lab)))

homogeneity_score = 0.5023963511990462
completeness_score = 0.4841485155634507
v_measure_score = 0.49310367090697976


Dodajmy Entities 

In [40]:
def get_entities(data):

    def unique(arr): 
    
        un = []

        for elem in arr: 
            if  elem not in un : 
                un.append(elem)

        return un

    entities_list = [[] for i in range(590)] # number of chapters
    for i in range(len(data)):
        chapter = str(data[i])
        doc = nlp(chapter) 

        entities_in_chapter = []
        for ent in doc.ents: 
            entities_in_chapter.append(ent.text)
            
        for j in range(len(entities_in_chapter)) : 
            ent = entities_in_chapter[j]
            ent = "".join(ent.split())
            entities_in_chapter[j] = ent
        
        
        entities_list[i] =  " ".join(entities_in_chapter)
        
            
    out = entities_list
    
    return out

In [41]:
ge = get_entities(data)

In [42]:
cv = CountVectorizer(stop_words='english')
entities_count = cv.fit_transform(ge)
count_vect_entities = pd.DataFrame(entities_count.todense(), columns=cv.get_feature_names())

Jeżeli jakieś entity już się znalazło we wcześniejszej ramce danych, to go nie dodamy.

In [43]:
nice_cols = []
for i in range(len(count_vect_entities.columns)):
    nice_cols.append(count_vect_entities.columns[i] not in list(whole_df.columns))

In [44]:
count_vect_entities.loc[:,nice_cols]

Unnamed: 0,absolute,absolutetruth,achiketas,aday,adorethee,afewyears,ages,ahundred,ahundredyears,allday,...,vinepower,virtue,vomens,whatsoever,windy,wisdomchapter,workmaster,works,yamas,years
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
585,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
586,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
587,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
588,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Jak można zauważyć kilku członowe entities są sklejone w jeden człon. W końcu słoto to tylko liczba (w macierzy rzadkiej)

In [47]:
absolute_whole_df = pd.concat([whole_df ,count_vect_entities], axis = 1) 
absolute_whole_df

Unnamed: 0,aac,aaron,abandon,abas,abash,abat,abateth,aberr,abhor,abhorreth,...,works,xi,xii,yama,yamas,yea,years,yesterday,yoga,zorobabel
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
585,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
586,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
587,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
588,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [48]:
df = sparse.csr_matrix(absolute_whole_df)
trans = TfidfTransformer()
x = trans.fit_transform(df)
model = KMeans(n_clusters=8, random_state = 123)
lab2 = model.fit_predict(x)
lab2

array([6, 5, 5, 5, 5, 5, 7, 7, 5, 7, 7, 6, 5, 6, 5, 7, 5, 5, 5, 7, 5, 5,
       7, 7, 7, 7, 6, 7, 6, 5, 6, 5, 5, 5, 6, 6, 6, 6, 6, 6, 5, 6, 6, 5,
       5, 6, 3, 7, 3, 3, 3, 3, 3, 3, 3, 3, 5, 6, 7, 3, 3, 3, 3, 3, 3, 3,
       3, 6, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 6, 3, 3, 3, 3, 3, 3,
       6, 3, 3, 3, 3, 3, 3, 6, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 6, 3, 6, 3, 3, 2, 6, 6, 3, 6, 3, 3, 3, 6, 6, 6, 6, 6,
       6, 7, 6, 7, 6, 6, 7, 6, 6, 6, 6, 1, 2, 2, 6, 2, 2, 2, 6, 6, 2, 2,
       2, 2, 2, 2, 2, 2, 6, 2, 2, 2, 2, 2, 6, 2, 2, 2, 2, 6, 6, 6, 2, 2,
       6, 6, 6, 6, 6, 1, 2, 6, 6, 2, 6, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 4,
       6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 6,
       6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 4, 1, 4, 6, 6, 6, 6, 7, 6, 4, 6,
       4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 1, 6, 4, 4, 4, 4,
       4, 7, 4, 6, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       6, 4, 7, 1, 1, 1, 6, 1, 1, 7, 7, 7, 1, 1, 1,

In [49]:
from sklearn.metrics.cluster import homogeneity_score, completeness_score, v_measure_score
print("homogeneity_score = {}".format(homogeneity_score(labels, lab2)))
print("completeness_score = {}".format(completeness_score(labels, lab2)))
print("v_measure_score = {}".format(v_measure_score(labels, lab2)))

homogeneity_score = 0.6294831591759592
completeness_score = 0.5764558150334008
v_measure_score = 0.6018036324109727


In [50]:
df = sparse.csr_matrix(absolute_whole_df)
trans = TfidfTransformer()
x = trans.fit_transform(df)
model = KMeans(n_clusters=5, random_state = 123)
lab2 = model.fit_predict(x)

labels= get_labels(True)

print("homogeneity_score = {}".format(homogeneity_score(labels, lab2)))
print("completeness_score = {}".format(completeness_score(labels, lab2)))
print("v_measure_score = {}".format(v_measure_score(labels, lab2)))

homogeneity_score = 0.5561627871233038
completeness_score = 0.6410034069721171
v_measure_score = 0.5955768599806133


## Podsumowanie i przyszłe prace
Mamy już narzędzia do tworzenia modeli, pozostało nam wybrać najlepszy poprzez wybrane już metryki. 