In [51]:
from nltk.corpus import gutenberg
from nltk.corpus import stopwords
from nltk import bigrams
from math import log 
import heapq
from operator import itemgetter
import statistics

Давайте попробуем проанализировать роман Джейн Остин "Эмма"
Возьмём только предложения из корпуса проекта Гутенберг.

In [71]:
emma = gutenberg.sents('austen-emma.txt')
emma_sents = []
def filterFunc(a):
    v = ['[',']','--']
    return not (a in v)

for sent in emma:
    emma_sents.append(list(filter(filterFunc, sent)))
# Term frequencies cache
D = {}

Будем смотреть на среднее значение tf-idf.

In [75]:
def tf(term, sentence):
    return sentence.count(term)/len(sentence)

def idf(term, documents):
    N = len(documents)
    d = D.get(term)
    if d==None:
        d = sum([1 if d.count(term)>0 else 0 for d in documents])
        D[term]=d
    return log(N/d)

def tf_idf(term, sentence, documents):
    return tf(term, sentence)*idf(term, documents)

def process_sentence(s):
    if len(s)<2:
        return 0
    terms = set(s)
    mean_ti = statistics.mean([tf_idf(w, s, emma_sents) for w in terms])
    return mean_ti

In [76]:
def calculate_means():
    mean_tis = []
    for i, s in enumerate(emma_sents):
        mean_tis.append((i, process_sentence(s)))
    return mean_tis
N = 5
def print_top(m):
    top = heapq.nlargest(N, m, itemgetter(1))
    top_indexes = [i[0] for i in top]
    for i in sorted(top_indexes):
        print(emma_sents[i])

In [77]:
print_top(calculate_means())

['CHAPTER', 'IV']
['CHAPTER', 'V']
['VOLUME', 'II']
['VOLUME', 'III']
['CHAPTER', 'XIX']


Ха! Ну что ж, разумно. Попробуем удалить названия глав и частей.

In [80]:
index_to_remove = []
#Also remove author name 
index_to_remove.append(0)
for i, sent in enumerate(emma_sents):
    if len(sent)<1:
        continue
    if sent[0] in ['CHAPTER', 'VOLUME', 'FINIS']:
        index_to_remove.append(i)
        
for idx in reversed(index_to_remove):
    del emma_sents[idx]

In [81]:
print_top(calculate_means())

['Nonsense', '!']
['Alas', '!']
['Heavens', '!']
['Alas', '!']
['Weston', '.]']


Ага! Средняя tf-idf мало чего нам даёт т.к. не учитывает длинну самого предложения. Попробуем считать суммы.

In [82]:
def process_sentence(s):
    terms = set(s)
    mean_ti = sum([tf_idf(w, s, emma_sents) for w in terms])
    return mean_ti

In [90]:
for el in print_top(calculate_means()):
    print(" ".join(el))

['Former', 'provocations', 'reappeared', '.']
['Absolutely', 'insufferable', '!']
['then', ',', 'don', "'", 't', 'speak', 'it', ',', 'don', "'", 't', 'speak', 'it', ',"', 'she', 'eagerly', 'cried', '.']
['MY', 'DEAR', 'MADAM', ',']
['"`', 'Smallridge', "!'"]


TypeError: 'NoneType' object is not iterable

Это уже похоже на роман нравов. Давайте провернём ещё одну итерацию и добавим биграммы.

In [84]:
def bigam_count(sentense, term):
    count=0
    for i, el in enumerate(sentense):
        if el==term[0]:
            if i < len(sentense)-1 and sentense[i+1]==term[1]:
                count+=1
    return count

def btf(term, sentence):
    return bigam_count(sentence, term)/len(list(bigrams(sentence)))

def bidf(term, documents):
    N = len(documents)
    d = D.get(term)
    if d==None:
        d = sum([1 if bigam_count(d, term)>0 else 0 for d in documents])
        D[term]=d
    return log(N/d)

def btf_idf(term, sentence, documents):
    return btf(term, sentence)*bidf(term, documents)

In [85]:
def process_sentence(s):
    if len(s)<2:
        return 0
    terms = set(s)
    mean_ti = sum([tf_idf(w, s, emma_sents) for w in terms])
    mean_ti += sum([btf_idf(bi, s, emma_sents) for bi in bigrams(s)])
    return mean_ti

In [86]:
print_top(calculate_means())

['Former', 'provocations', 'reappeared', '.']
['Absolutely', 'insufferable', '!']
['then', ',', 'don', "'", 't', 'speak', 'it', ',', 'don', "'", 't', 'speak', 'it', ',"', 'she', 'eagerly', 'cried', '.']
['MY', 'DEAR', 'MADAM', ',']
['"`', 'Smallridge', "!'"]


К сожалению, в моём корпусе некоторые предложения были разбиты пополам но то, что выдача выглядит более информативной очевидно!
Интересно посмотреть, какие токены получили самый высокий скор.

In [89]:
sorted(D.items(), key=itemgetter(1), reverse=True)[:10]

[('.', 5509),
 (',', 4499),
 ('to', 3120),
 ('and', 2721),
 ('the', 2697),
 ('of', 2655),
 ('a', 2172),
 ('I', 2116),
 ('not', 1769),
 ('was', 1744)]

In [106]:
filter(lambda x: isinstance(p, tuple), sorted(D.items(), key=itemgetter(1), reverse=True))

TypeError: 'filter' object is not subscriptable