In [1]:
import pandas as pd
import nltk
from sklearn.feature_extraction.text import CountVectorizer
import textsplit
from textsplit.tools import get_penalty, get_segments, P_k
from textsplit.algorithm import split_optimal, split_greedy, get_total

In [15]:
# Input article to segment.
# NOTE(review): absolute, machine-specific path — the notebook will not run on another machine without editing it.
FILE_PATH = '/Users/sauyee/repos/git/text-segmentation/data/article_1.txt'

# Use pre-trained embeddings

In [2]:
# Pickled pre-trained embeddings; element 0 is later used as the row index and element 1 as the data of model_df.
# NOTE(review): unpickling can execute arbitrary code — only load this file from a trusted source.
# NOTE(review): absolute, machine-specific path.
embeddings_tuple = pd.read_pickle('/Users/sauyee/repos/git/text-segmentation/polyglot-en.pkl')

In [3]:
# Embedding lookup table: rows are labelled by embeddings_tuple[0] and hold the values from embeddings_tuple[1].
model_df = pd.DataFrame(embeddings_tuple[1], index=embeddings_tuple[0])

In [4]:
model_df.shape

(100004, 64)

# Preprocessing / tokenization

In [5]:
def get_sentence_tokenizer():
    """Fetch the NLTK 'punkt' resource and return the English sentence tokenizer.

    Returns:
        The object loaded from 'tokenizers/punkt/english.pickle'.
    """
    # Download first so the resource path below can be resolved.
    nltk.download('punkt')
    english_punkt = nltk.data.load('tokenizers/punkt/english.pickle')
    return english_punkt


In [6]:
sentence_tokenizer = get_sentence_tokenizer()

[nltk_data] Downloading package punkt to /Users/sauyee/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
def preprocess_text(path):
    """Read a text file and flatten it into a single string with line-break markers.

    The following substitutions are applied, in this order:
      1. 'Yahoo!' -> 'Yahoo' and 'K.K.' -> 'KK' (presumably so their punctuation
         is not mistaken for a sentence end by the tokenizer — confirm).
      2. Each '\\n\\n' pair is collapsed to a single '\\n' (one pass only, so
         longer runs of newlines are shortened but not fully collapsed).
      3. Every remaining '\\n' becomes ' ¤', marking where a line break was.

    Args:
        path: Path of the UTF-8 encoded text file to read.

    Returns:
        The processed text as one string.
    """
    # Explicit UTF-8: the '¤' marker and typographic quotes in the articles are
    # non-ASCII, so relying on the platform default encoding can fail or garble them.
    with open(path, 'rt', encoding='utf-8') as f:
        raw = f.read()
    return (
        raw.replace('Yahoo!', 'Yahoo')
        .replace('K.K.', 'KK')
        .replace('\n\n', '\n')
        .replace('\n', ' ¤')
    )


def get_sentenced_vectors(text, sentence_analyzer, model_df):
    """Turn a text into one vector per sentence.

    The text is split with ``sentence_analyzer.tokenize``; each sentence is
    converted to word counts over the vocabulary ``model_df.index``, and the
    count matrix is multiplied with the embedding table, i.e. every sentence
    vector is the count-weighted sum of the rows of ``model_df``.

    Args:
        text: The text to split and vectorize.
        sentence_analyzer: Object exposing ``tokenize(text)`` that returns the sentences.
        model_df: Embedding table whose index is the vocabulary.

    Returns:
        The product of the sentence-by-vocabulary count matrix and ``model_df``
        (one row per sentence).
    """
    sentences = sentence_analyzer.tokenize(text)
    counter = CountVectorizer(vocabulary=model_df.index)
    word_counts = counter.transform(sentences)
    return word_counts.dot(model_df)

def get_optimal_segmentation_and_text(sentenced_text, sentence_vectors, penalty, seg_limit=250):
    """Compute the optimal segmentation and group the sentences accordingly.

    Prints a one-line summary (sentence count, segment count, average
    sentences per segment) as a side effect.

    Args:
        sentenced_text: List of sentences, aligned row-for-row with ``sentence_vectors``.
        sentence_vectors: One vector per sentence, passed to ``split_optimal``.
        penalty: Penalty value passed to ``split_optimal``.
        seg_limit: Maximum number of sentences in a segment, forwarded to
            ``split_optimal``. Defaults to 250, the value previously hard-coded.

    Returns:
        A tuple ``(optimal_segmentation, segmented_text)``: the result of
        ``split_optimal`` and the sentences grouped by ``get_segments``.
    """
    optimal_segmentation = split_optimal(sentence_vectors, penalty, seg_limit=seg_limit)
    segmented_text = get_segments(sentenced_text, optimal_segmentation)
    n_sentences, n_segments = len(sentenced_text), len(segmented_text)
    print('%d sentences, %d segments, avg %4.2f sentences per segment' % (
        n_sentences, n_segments, n_sentences / n_segments))
    return optimal_segmentation, segmented_text

In [25]:
text = preprocess_text(FILE_PATH)

In [26]:
text

'SoftBank is launching of a $5 billion fund that will invest in technology start-ups across Latin America, the company announced Thursday. ¤The new fund, named the SoftBank Innovation Fund, will be run by former Sprint CEO and Bolivian native Marcelo Claure. SoftBank has already committed $2 billion to the fund, though no decision has been made yet on where its headquarters will be. ¤The fund’s size is unprecedented in Latin America as it totals the combined venture capital investments of 2017 and 2018, according to Venturesource data cited by The Wall Street Journal. ¤“Latin America is on the cusp of becoming one of the most important economic regions in the world, and we anticipate significant growth in the decades ahead,” said Masayoshi Son, chairman and CEO of SoftBank, in a statement. ¤SoftBank said the new fund will focus investments on industries like e-commerce, health care and digital financial services, among others. The fund will also invest all throughout Latin America, inc

In [27]:
# Keep the sentence list itself: it is needed later for get_segments and for building split_ref.
sentenced_text = sentence_tokenizer.tokenize(text)

In [28]:
sentenced_text

['SoftBank is launching of a $5 billion fund that will invest in technology start-ups across Latin America, the company announced Thursday.',
 '¤The new fund, named the SoftBank Innovation Fund, will be run by former Sprint CEO and Bolivian native Marcelo Claure.',
 'SoftBank has already committed $2 billion to the fund, though no decision has been made yet on where its headquarters will be.',
 '¤The fund’s size is unprecedented in Latin America as it totals the combined venture capital investments of 2017 and 2018, according to Venturesource data cited by The Wall Street Journal.',
 '¤“Latin America is on the cusp of becoming one of the most important economic regions in the world, and we anticipate significant growth in the decades ahead,” said Masayoshi Son, chairman and CEO of SoftBank, in a statement.',
 '¤SoftBank said the new fund will focus investments on industries like e-commerce, health care and digital financial services, among others.',
 'The fund will also invest all thro

In [82]:
# get_sentenced_vectors re-tokenizes `text` internally, so its rows line up with
# sentenced_text only because the same tokenizer and the same text are used here.
sentenced_vectors = get_sentenced_vectors(text, sentence_tokenizer, model_df)

In [83]:
sentenced_vectors.shape  # (number of sentences, embedding dimension)

(258, 64)

In [120]:
# segment_len=25 is a hand-picked value; presumably the desired average number of
# sentences per segment — TODO confirm against the textsplit documentation.
penalty = get_penalty([sentenced_vectors], segment_len=25)

In [121]:
sentenced_vectors.shape

(258, 64)

In [122]:
penalty

6.024941538420649

In [123]:
optimal_segmentation, segmented_text = get_optimal_segmentation_and_text(sentenced_text, sentenced_vectors, penalty)

258 sentences, 12 segments, avg 21.50 sentences per segment


In [124]:
# NOTE(review): the saved output of this cell appears to come from a different document
# than the one previewed in the `text` cell — restart the kernel and run all cells
# before trusting the reported results.
segmented_text

[['Thank you very much, director sir .',
  'Ladies and Gentlemen!',
  '¤Before my speech, I would like to show you a video which happenning in Hangzhou about our Hangzhou SME Summit ,can I ?',
  '¤  ¤Thank you!',
  '¤  ¤After 3 days, you know, in house meeting and flying all the way here ,and I didn’t feel not quite well today ,but when I see the entrepreneurs ,when I came to the SMES ,I always feel excited.'],
 ['Because when I join the SME Conference, I see from the eyes of the dreams ,the passion ,the hope ; when I join the Fortune 500 Conference, I am the CEO ,I see the numbers ,I see the revenues, I see the KPIs， I see the bloody competition.'],
 ['But among the SMES ,you see "I have a dream ,I want to do something."',
  'and that happens all the time and that make me excited all the time .Today I think last year every people say "Wa, the economic is in trouble ,we are all dead ,what are we going to do ?"',
  'Today everybody seems to be happy and say the economy come back .Is rea

In [125]:
# Reference boundaries: indices of the sentences containing the '¤' marker
# that preprocess_text substitutes for each line break.
split_ref = [idx for idx, sent in enumerate(sentenced_text) if '¤' in sent]

In [126]:
# Score the predicted splits against the '¤'-derived reference splits with textsplit's P_k.
print(P_k(split_ref, optimal_segmentation.splits, len(sentenced_text)))

0.4777327935222672


In [127]:
len(split_ref)

10