### Load file `input2.csv`

In [1]:
import pandas as pd

# Sentence-level input table; later cells read its `doc_id` and `sentence_text` columns.
input_csv_path = "input2.csv"
df = pd.read_csv(input_csv_path)

### Load the pretrained Word2Vec model

In [2]:
from gensim.models import Word2Vec

# Pretrained embedding used to vectorise sentences further down.
# gensim model files are pickle-based, so only load files from a trusted source.
word2vec_model_path = "idwiki_word2vec_200/idwiki_word2vec_200.model"
model = Word2Vec.load(word2vec_model_path)

### Create the word-vector DataFrame from the model

Source: https://stackoverflow.com/questions/46885454/how-to-create-a-dataframe-with-the-word2ve-vectors-as-data-and-the-terms-as-row

In [3]:
# Build one (term, row index in the vector matrix, "count" attribute) triple
# per vocabulary entry of the loaded model.
keyed_vectors = model.wv
ordered_vocab = [
    (term, keyed_vectors.key_to_index[term], keyed_vectors.get_vecattr(term, "count"))
    for term in keyed_vectors.index_to_key
]

# Order the vocabulary by ascending count (stable sort, so ties keep model order).
ordered_vocab.sort(key=lambda entry: entry[2])

# Transpose the triples into three parallel tuples.
ordered_terms, term_indices, term_counts = zip(*ordered_vocab)

# Term-indexed frame: one row per term, one column per embedding dimension.
wordvec = pd.DataFrame(
    keyed_vectors.vectors[term_indices, :],
    index=ordered_terms,
)

### Create CountVectorizer

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# Fix the vectoriser's vocabulary to the terms of `wordvec`, in the same order,
# so that its count columns line up with the rows of `wordvec`.
vocabulary_terms = wordvec.index
count_vec = CountVectorizer(vocabulary=vocabulary_terms)

### Segmentation with textsplit (split count from the optimal algorithm, boundaries from the greedy algorithm)

In [22]:
import random

from textsplit.tools import get_penalty, get_segments
from textsplit.algorithm import split_optimal, split_greedy

# --- Parameters --------------------------------------------------------------
segment_len = 3                  # segment length handed to get_penalty
NUM_DOCS = 500                   # doc_ids 0 .. NUM_DOCS - 1 are processed
SEG_LIMIT = 250                  # seg_limit forwarded to split_optimal
SEGMENTATION_METHOD = "greedy"   # which split is written to `out`: "greedy" or "optimal"
RANDOM_SEED = 0

if SEGMENTATION_METHOD not in ("greedy", "optimal"):
    raise ValueError(f"unknown SEGMENTATION_METHOD: {SEGMENTATION_METHOD!r}")

# NOTE(review): textsplit describes get_penalty as rounding the expected number
# of splits stochastically. Seeding is meant to make reruns repeatable; this
# presumes it draws from the stdlib `random` module -- confirm.
random.seed(RANDOM_SEED)

# Collected output: one entry per sentence of every segmented document.
out = {
    "doc_id": [],
    "segment_id": [],
    "sentence_id": [],
}

# Group the sentences once (row order is kept inside each group) instead of
# rescanning the whole frame with iterrows() for every document.
sentences_by_doc = {
    key: group.tolist() for key, group in df.groupby("doc_id")["sentence_text"]
}

for doc_id in range(NUM_DOCS):
    # A doc_id without any rows yields an empty list.
    sentence_text = sentences_by_doc.get(doc_id, [])

    # Per-sentence term counts multiplied by the word vectors: one row per sentence.
    sentence_vec = count_vec.transform(sentence_text).dot(wordvec)

    try:
        penalty = get_penalty([sentence_vec], segment_len)
    except ValueError:
        # get_penalty rejected the document; it contributes no rows to `out`.
        print(f"doc_id {doc_id} is too short ({len(sentence_text)})")
        continue

    # The optimal split is always computed: it is either the result itself or
    # it supplies the number of splits the greedy pass is allowed to make.
    optimal_segmentation = split_optimal(sentence_vec, penalty, seg_limit=SEG_LIMIT)
    if SEGMENTATION_METHOD == "greedy":
        segmentation = split_greedy(
            sentence_vec, max_splits=len(optimal_segmentation.splits)
        )
    else:
        segmentation = optimal_segmentation

    segment_text = get_segments(sentence_text, segmentation)

    # Sentences are numbered consecutively across the segments of a document.
    sent_id = 0
    for seg_id, seg in enumerate(segment_text):
        for _ in seg:
            out["doc_id"].append(doc_id)
            out["segment_id"].append(f"s{doc_id}_{seg_id}")
            out["sentence_id"].append(f"{doc_id}_{sent_id}")
            sent_id += 1

doc_id 87 is too short (2)
doc_id 269 is too short (5)


### Export the output to CSV

In [21]:
# OPTIMAL
# NOTE(review): this cell writes whatever `out` currently holds under the
# "optimal" file name. Check that the segmentation cell produced the optimal
# split before running it.
from datetime import date

today = date.today()  # also read by the greedy export cell below

# File name carries today's date as DDMMYYYY.
optimal_csv_name = f"textsplit_optimal_output_{today.strftime('%d%m%Y')}.csv"
out_df = pd.DataFrame(out)
out_df.to_csv(optimal_csv_name, index=False)

In [23]:
# GREEDY
# Writes the current contents of `out` to a date-stamped CSV (DDMMYYYY).
# Relies on `today` having been set by the export cell above.
greedy_csv_name = f"textsplit_greedy_output_{today.strftime('%d%m%Y')}.csv"
out_df = pd.DataFrame(out)
out_df.to_csv(greedy_csv_name, index=False)