In [1]:
import pandas as pd
import nltk
from sklearn.feature_extraction.text import CountVectorizer
import textsplit
from textsplit.tools import get_penalty, get_segments, P_k
from textsplit.algorithm import split_optimal, split_greedy, get_total

In [15]:
FILE_PATH = '/Users/sauyee/repos/git/text-segmentation/data/article_1.txt'

# Use pre-trained embeddings

In [2]:
embeddings_tuple = pd.read_pickle('/Users/sauyee/repos/git/text-segmentation/polyglot-en.pkl')

In [3]:
model_df = pd.DataFrame(embeddings_tuple[1], index=embeddings_tuple[0])

In [4]:
model_df.shape

(100004, 64)

# Preprocessing / tokenization

In [5]:
def get_sentence_tokenizer():
    nltk.download('punkt')
    return nltk.data.load('tokenizers/punkt/english.pickle')


In [6]:
sentence_tokenizer = get_sentence_tokenizer()

[nltk_data] Downloading package punkt to /Users/sauyee/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
def preprocess_text(path):
    with open(path, 'rt') as f:
        text = f.read().replace('Yahoo!', 'Yahoo')\
            .replace('K.K.', 'KK')\
            .replace('\n\n', '\n')\
            .replace('\n', ' ¤')
    return text


def get_sentenced_vectors(text, sentence_analyzer, model_df):
    sentenced_text = sentence_analyzer.tokenize(text)
    vecr = CountVectorizer(vocabulary=model_df.index)
    return vecr.transform(sentenced_text).dot(model_df)

def get_optimal_segmentation_and_text(sentenced_text, sentence_vectors, penalty):
    optimal_segmentation = split_optimal(sentence_vectors, penalty, seg_limit=250)
    # seg_limit is maximum number of sentences in a segment. optional
    segmented_text = get_segments(sentenced_text, optimal_segmentation)
    print('%d sentences, %d segments, avg %4.2f sentences per segment' % (
        len(sentenced_text), len(segmented_text), len(sentenced_text) / len(segmented_text)))
    return optimal_segmentation, segmented_text

In [25]:
text = preprocess_text(FILE_PATH)

In [26]:
text

'SoftBank is launching of a $5 billion fund that will invest in technology start-ups across Latin America, the company announced Thursday. ¤The new fund, named the SoftBank Innovation Fund, will be run by former Sprint CEO and Bolivian native Marcelo Claure. SoftBank has already committed $2 billion to the fund, though no decision has been made yet on where its headquarters will be. ¤The fund’s size is unprecedented in Latin America as it totals the combined venture capital investments of 2017 and 2018, according to Venturesource data cited by The Wall Street Journal. ¤“Latin America is on the cusp of becoming one of the most important economic regions in the world, and we anticipate significant growth in the decades ahead,” said Masayoshi Son, chairman and CEO of SoftBank, in a statement. ¤SoftBank said the new fund will focus investments on industries like e-commerce, health care and digital financial services, among others. The fund will also invest all throughout Latin America, inc

In [27]:
sentenced_text = sentence_tokenizer.tokenize(text)

In [28]:
sentenced_text

['SoftBank is launching of a $5 billion fund that will invest in technology start-ups across Latin America, the company announced Thursday.',
 '¤The new fund, named the SoftBank Innovation Fund, will be run by former Sprint CEO and Bolivian native Marcelo Claure.',
 'SoftBank has already committed $2 billion to the fund, though no decision has been made yet on where its headquarters will be.',
 '¤The fund’s size is unprecedented in Latin America as it totals the combined venture capital investments of 2017 and 2018, according to Venturesource data cited by The Wall Street Journal.',
 '¤“Latin America is on the cusp of becoming one of the most important economic regions in the world, and we anticipate significant growth in the decades ahead,” said Masayoshi Son, chairman and CEO of SoftBank, in a statement.',
 '¤SoftBank said the new fund will focus investments on industries like e-commerce, health care and digital financial services, among others.',
 'The fund will also invest all thro

In [29]:
sentenced_vectors = get_sentenced_vectors(text, sentence_tokenizer, model_df)

In [32]:
print("Number of senteces:", sentenced_vectors.shape[0])

Number of senteces: 10


In [35]:
penalty = get_penalty([sentenced_vectors], segment_len=5)

In [36]:
sentenced_vectors.shape

(10, 64)

In [37]:
penalty

3.9755154891134907

In [38]:
optimal_segmentation, segmented_text = get_optimal_segmentation_and_text(sentenced_text, sentenced_vectors, penalty)

10 sentences, 4 segments, avg 2.50 sentences per segment


In [39]:
segmented_text

[['SoftBank is launching of a $5 billion fund that will invest in technology start-ups across Latin America, the company announced Thursday.'],
 ['¤The new fund, named the SoftBank Innovation Fund, will be run by former Sprint CEO and Bolivian native Marcelo Claure.',
  'SoftBank has already committed $2 billion to the fund, though no decision has been made yet on where its headquarters will be.'],
 ['¤The fund’s size is unprecedented in Latin America as it totals the combined venture capital investments of 2017 and 2018, according to Venturesource data cited by The Wall Street Journal.',
  '¤“Latin America is on the cusp of becoming one of the most important economic regions in the world, and we anticipate significant growth in the decades ahead,” said Masayoshi Son, chairman and CEO of SoftBank, in a statement.'],
 ['¤SoftBank said the new fund will focus investments on industries like e-commerce, health care and digital financial services, among others.',
  'The fund will also inves

In [40]:
split_ref = []
for i, sentence in enumerate(sentenced_text):
  if '¤' in sentence:
    split_ref.append(i)

In [46]:
print(P_k(split_ref, optimal_segmentation.splits, len(sentenced_text)))

0.0


In [43]:
print("Number of paragraphs in ref text", len(split_ref))

Number of paragraphs in ref text 7


In [45]:
print("Number of paragraphs in hyp text", len(optimal_segmentation.splits))

Number of paragraphs in hyp text 3


In [47]:
optimal_segmentation.splits

[1, 3, 5]

In [48]:
split_ref

[1, 3, 4, 5, 7, 8, 9]