In [1]:
import pandas as pd
import nltk
from sklearn.feature_extraction.text import CountVectorizer
import textsplit
from textsplit.tools import get_penalty, get_segments, P_k
from textsplit.algorithm import split_optimal, split_greedy, get_total

In [88]:
FILE_PATH = '/Users/sauyee/repos/git/text-segmentation/data/article_3.txt'

# Use pre-trained embeddings

In [2]:
embeddings_tuple = pd.read_pickle('/Users/sauyee/repos/git/text-segmentation/polyglot-en.pkl')

In [3]:
model_df = pd.DataFrame(embeddings_tuple[1], index=embeddings_tuple[0])

In [4]:
model_df.shape

(100004, 64)

# Preprocessing / tokenization

In [5]:
def get_sentence_tokenizer():
    nltk.download('punkt')
    return nltk.data.load('tokenizers/punkt/english.pickle')


In [6]:
sentence_tokenizer = get_sentence_tokenizer()

[nltk_data] Downloading package punkt to /Users/sauyee/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [50]:
def preprocess_text(path):
    with open(path, 'rt') as f:
        text = f.read().replace('Yahoo!', 'Yahoo')\
            .replace('K.K.', 'KK')\
            .replace('\n\n', '\n')\
            .replace('\n', ' ¤')
    return text

def get_sentenced_vectors(text, sentence_analyzer, model_df):
    sentenced_text = sentence_analyzer.tokenize(text)
    vecr = CountVectorizer(vocabulary=model_df.index)
    return vecr.transform(sentenced_text).dot(model_df)

def get_optimal_segmentation_and_text(sentenced_text, sentence_vectors, penalty):
    optimal_segmentation = split_optimal(sentence_vectors, penalty, seg_limit=250)
    # seg_limit is maximum number of sentences in a segment. optional
    segmented_text = get_segments(sentenced_text, optimal_segmentation)
    print('%d sentences, %d segments, avg %4.2f sentences per segment' % (
        len(sentenced_text), len(segmented_text), len(sentenced_text) / len(segmented_text)))
    return optimal_segmentation, segmented_text

In [89]:
text = preprocess_text(FILE_PATH)

In [90]:
text

'Money talks in the startup community, especially when SoftBank comes knocking with the megabucks of its Vision Fund. ¤Despite the public outcry around the firm’s dependence on money from Saudi Arabia in the wake of that country’s assassination of Washington Post journalist Jamal Khashoggi, deal flow for SoftBank’s  Vision Fund appears to be back to normal. ¤The $100 billion megafund has done 21 deals over the last two quarters; that’s more than in the other quarters of the previous year combined, according to data from Crunchbase, thanks to an uptick from Asia. Since the October 2 murder, there have been 11 investments in U.S. companies, seven in Asia, two in Europe and one in Latin America. Just this week, the fund completed a near $1.5 billion investment in Southeast Asia-based ride-hailing company Grab. ¤While U.S. and European firms have more options, and therefore, perhaps deserve more scrutiny, SoftBank’s cash is increasingly the only game in town for startups in Asia, where the

In [91]:
sentenced_text = sentence_tokenizer.tokenize(text)

In [92]:
sentenced_text

['Money talks in the startup community, especially when SoftBank comes knocking with the megabucks of its Vision Fund.',
 '¤Despite the public outcry around the firm’s dependence on money from Saudi Arabia in the wake of that country’s assassination of Washington Post journalist Jamal Khashoggi, deal flow for SoftBank’s  Vision Fund appears to be back to normal.',
 '¤The $100 billion megafund has done 21 deals over the last two quarters; that’s more than in the other quarters of the previous year combined, according to data from Crunchbase, thanks to an uptick from Asia.',
 'Since the October 2 murder, there have been 11 investments in U.S. companies, seven in Asia, two in Europe and one in Latin America.',
 'Just this week, the fund completed a near $1.5 billion investment in Southeast Asia-based ride-hailing company Grab.',
 '¤While U.S. and European firms have more options, and therefore, perhaps deserve more scrutiny, SoftBank’s cash is increasingly the only game in town for startu

In [93]:
sentenced_vectors = get_sentenced_vectors(text, sentence_tokenizer, model_df)

In [94]:
split_ref = []
for i, sentence in enumerate(sentenced_text):
  if '¤' in sentence:
    split_ref.append(i)

In [95]:
print("Number of senteces:", sentenced_vectors.shape[0])
print("Number of paragraphs in ref text", len(split_ref))
print("Average number of sentences per paragraph", sentenced_vectors.shape[0]/len(split_ref))

Number of senteces: 40
Number of paragraphs in ref text 19
Average number of sentences per paragraph 2.1052631578947367


In [105]:
penalty = get_penalty([sentenced_vectors], segment_len=2)

In [106]:
penalty

2.7956890883501444

In [107]:
optimal_segmentation, segmented_text = get_optimal_segmentation_and_text(sentenced_text, sentenced_vectors, penalty)

40 sentences, 22 segments, avg 1.82 sentences per segment


In [108]:
segmented_text

[['Money talks in the startup community, especially when SoftBank comes knocking with the megabucks of its Vision Fund.',
  '¤Despite the public outcry around the firm’s dependence on money from Saudi Arabia in the wake of that country’s assassination of Washington Post journalist Jamal Khashoggi, deal flow for SoftBank’s  Vision Fund appears to be back to normal.'],
 ['¤The $100 billion megafund has done 21 deals over the last two quarters; that’s more than in the other quarters of the previous year combined, according to data from Crunchbase, thanks to an uptick from Asia.',
  'Since the October 2 murder, there have been 11 investments in U.S. companies, seven in Asia, two in Europe and one in Latin America.'],
 ['Just this week, the fund completed a near $1.5 billion investment in Southeast Asia-based ride-hailing company Grab.'],
 ['¤While U.S. and European firms have more options, and therefore, perhaps deserve more scrutiny, SoftBank’s cash is increasingly the only game in town f

In [120]:
# check k val
N = len(sentenced_text)
k = round(N / (len(split_ref) + 1) / 2 - 1)
print(k)

0


In [109]:
print(P_k(split_ref, optimal_segmentation.splits, len(sentenced_text)))

0.0


In [122]:
k = 1
ref = np.array(split_ref, dtype=np.int32)
hyp = np.array(optimal_segmentation.splits, dtype=np.int32)

def is_split_between(splits, l, r):
    return np.sometrue(np.logical_and(splits - l >= 0, splits - r < 0))

acc = 0
for i in range(N-k):
    acc += is_split_between(ref, i, i+k) != is_split_between(hyp, i, i+k)

print(acc / (N-k))

0.48717948717948717


In [110]:
print("Number of paragraphs in hyp text", len(optimal_segmentation.splits))

Number of paragraphs in hyp text 21


In [111]:
optimal_segmentation.splits

[2, 4, 5, 6, 9, 10, 11, 12, 14, 15, 17, 18, 20, 22, 23, 25, 26, 34, 36, 37, 38]

In [112]:
split_ref

[1, 2, 5, 6, 8, 11, 13, 14, 16, 17, 20, 22, 24, 25, 31, 32, 35, 38, 39]