# This shows that the string encoding works correctly

In [1]:
from collections import Counter
from typing import Dict, List, Optional, Tuple

import numpy as np
import pandas as pd
import regex

In [2]:
# Precompile regular expressions
# Based on https://github.com/yoonkim/CNN_sentence/blob/23e0e1f7355705bb083043fda05c031b15acb38c/process_data.py#L97

# Any character that is not an ASCII letter, a digit, or one of ( ) , ! ? ' `
RE_NOT_ALNUM = regex.compile(r"[^A-Za-z0-9(),!?'`]")
# An apostrophe followed by a contraction suffix: 's 't 'd 've 're 'll
RE_CONTRACTION = regex.compile(r"'([std]|ve|re|ll)")
# Punctuation that becomes a token of its own. An apostrophe that opens a
# complete contraction suffix is deliberately excluded: padding it would split
# the "'s" produced by RE_CONTRACTION back into "'" and "s", which made the
# contraction step a no-op.
RE_PUNCTUATION = regex.compile(r"([(),!?`]|'(?!(?:[std]|ve|re|ll)\b))")
# Runs of two or more whitespace characters
RE_WHITESPACE = regex.compile(r"\s{2,}")


def clean_str(text: str) -> List[str]:
    """Normalize a raw sentence and split it into lowercase tokens.

    Characters other than ASCII letters, digits and ``( ) , ! ? ' ``` are
    replaced by spaces, contraction suffixes are detached from their word
    (``"it's"`` becomes ``"it"``, ``"'s"``), and the remaining punctuation
    marks become tokens of their own.

    Args:
        text: The raw sentence.

    Returns:
        The lowercase tokens in order of appearance; an empty list when
        nothing survives the cleaning.
    """
    text = RE_NOT_ALNUM.sub(" ", text)
    # Detach the contraction suffix but keep it in one piece, e.g. "we'll" -> "we 'll".
    text = RE_CONTRACTION.sub(" '\\1", text)
    text = RE_PUNCTUATION.sub(" \\1 ", text)
    # TODO: remove stop words here?
    text = RE_WHITESPACE.sub(" ", text)
    return text.strip().lower().split()

def generate_word_freq(*series: pd.Series, max_n: Optional[int] = None) -> List[Tuple[str, int]]:
    """Count token occurrences across one or more series of raw text.

    Each row of each series is tokenized with ``clean_str`` and all tokens
    are tallied in a single shared counter.

    Args:
        *series: Series whose rows are passed to ``clean_str``.
        max_n: How many of the most frequent tokens to return; ``None``
            returns every token.

    Returns:
        ``(token, count)`` pairs ordered from most to least frequent.
    """
    tally = Counter()
    for column in series:
        # One update per series: flatten its rows into a single token stream.
        tally.update(token for text in column for token in clean_str(text))
    return tally.most_common(max_n)

def generate_word_to_idx(*series: pd.Series, max_n: Optional[int] = None) -> Dict[str, int]:
    """Build a token-to-index vocabulary ordered by token frequency.

    Args:
        *series: Series of raw text, forwarded to ``generate_word_freq``.
        max_n: Maximum vocabulary size; ``None`` keeps every token.

    Returns:
        A mapping from token to its rank, where the most frequent token
        gets index 0.
    """
    ranked = generate_word_freq(*series, max_n=max_n)
    vocabulary = {}
    for rank, (token, _count) in enumerate(ranked):
        vocabulary[token] = rank
    return vocabulary

def bag_of_words(sentences: pd.Series, word_to_idx: Dict[str, int]):
    """Encode sentences as binary word-presence vectors.

    Args:
        sentences: Series of raw text; each row is tokenized with
            ``clean_str``.
        word_to_idx: Mapping from token to its column in the output.

    Returns:
        A ``uint32`` array with one row per sentence and one column per
        vocabulary entry. A cell is 1 when the token occurs in the sentence
        (however many times) and 0 otherwise. Tokens missing from
        ``word_to_idx`` are ignored.
    """
    tokenized = sentences.apply(clean_str)
    n_rows, n_cols = len(tokenized), len(word_to_idx)
    # A dense array is used on purpose: lil_matrix is not supported and is
    # flattened in TMU. np.uint32 matches the dtype TMU uses.
    encoded = np.zeros((n_rows, n_cols), dtype=np.uint32)
    for row_no, tokens in enumerate(tokenized):
        for col in (word_to_idx[tok] for tok in tokens if tok in word_to_idx):
            encoded[row_no, col] = 1
    return encoded

In [3]:
# Ten short example sentences, held in a single-column frame named "text".
df = pd.Series(
    [
        "This is the first sentence.",
        "The second sentence is here.",
        "Sentence number three is long.",
        "The fourth sentence is short.",
        "This is the last sentence in the set.",
        "Another set of sentences begins here.",
        "This is the second sentence in the new set.",
        "The third sentence is the longest one yet.",
        "A fourth sentence is here.",
        "The last sentence a a a in this set is quite short.",
    ],
    name="text",
).to_frame()
df

Unnamed: 0,text
0,This is the first sentence.
1,The second sentence is here.
2,Sentence number three is long.
3,The fourth sentence is short.
4,This is the last sentence in the set.
5,Another set of sentences begins here.
6,This is the second sentence in the new set.
7,The third sentence is the longest one yet.
8,A fourth sentence is here.
9,The last sentence a a a in this set is quite ...


In [4]:
# Build the vocabulary from the "text" column, keeping only the 10 most
# frequent tokens.
word_to_idx = generate_word_to_idx(df["text"], max_n=10)
# Display the resulting token-to-index mapping.
word_to_idx

{'the': 0,
 'is': 1,
 'sentence': 2,
 'this': 3,
 'set': 4,
 'a': 5,
 'here': 6,
 'in': 7,
 'second': 8,
 'fourth': 9}

In [5]:
# Encode every sentence in the "text" column against word_to_idx: one row per
# sentence, one column per vocabulary entry.
matrix = bag_of_words(df["text"], word_to_idx)
# Display the encoded matrix.
matrix

array([[1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
       [1, 1, 1, 0, 0, 0, 1, 0, 1, 0],
       [0, 1, 1, 0, 0, 0, 0, 0, 0, 0],
       [1, 1, 1, 0, 0, 0, 0, 0, 0, 1],
       [1, 1, 1, 1, 1, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 1, 0, 1, 0, 0, 0],
       [1, 1, 1, 1, 1, 0, 0, 1, 1, 0],
       [1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 1, 0, 0, 1, 1, 0, 0, 1],
       [1, 1, 1, 1, 1, 1, 0, 1, 0, 0]], dtype=uint32)