In [212]:
import pandas as pd
import numpy as np
import scipy as sp
import math
import matplotlib.pyplot as plt
import locale
import nltk
import re
from collections import Counter, defaultdict
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from IPython.display import display
from dataclasses import dataclass
from operator import itemgetter
nltk.download("stopwords")
nltk.download("punkt")
locale.setlocale( locale.LC_ALL, 'en_GB.UTF-8') 
plt.rcParams["figure.figsize"] = (16,9)

[nltk_data] Downloading package stopwords to /Users/tim/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/tim/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [117]:
texts_df = pd.read_json("data/window_explorations.json", orient='records')

In [146]:
@dataclass
class Text:
    words : list
    excess : list
        
@dataclass
class Word:
    word : str
    weight : float
    original_position: list      

In [194]:
def create_text(string):
    return Text([
        Word(string, 1.0, [0])
    ], [])

def chain_text_transformations(transformations):
    def inner(text):
        results = [text]
        current = text
        for transformation in transformations:
            result = transformation(current)
            results.append(result)
            current = result
        return results        
    return inner

In [203]:
def regex_split_and_match(regex, text):
    result = []
    length = len(text)
    last_found = 0
    for match in re.finditer(rex, texto):
        if match.start() > last_found:
            result.append((text[last_found:match.start()], False))
        result.append((match.group(0), True))
        last_found = match.end()
    if last_found < length:
        result.append((text[last_found:length], False))
    return result

def transform_regex_exclude(regex):
    def inner(text):
        transformed = []
        excesses = []
        for(i, word) in enumerate(text.words):
            matches = regex_split_and_match(regex, word.word)
            n_tokens = len(matches) - sum(map(itemgetter(1), matches))
            for (i, (token, matched)) in enumerate(matches):
                if(matched):
                    excesses.append(Word(token, 0 , word.original_position + [i]))
                else:
                    transformed.append(Word(token, word.weight / n_tokens, word.original_position + [i]))
        return Text(transformed, excesses)
    return inner


def downcase(text):
    


split_on_spaces = transform_regex_exclude(r"\s")                    

remove_punctuation = transform_regex_exclude(r"[^\w]")


pipe = chain_text_transformations([split_on_spaces])

results = pipe(create_text(text))

for result in results:
    display_text(result)
    display("---------")
[word.word for word in results[-1].words]

Unnamed: 0,0
word,Él estará en Logroño cuando tu estés con los Ñ...
weight,1.0
original_position,[0]


'---------'

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
word,Él,estará,en,Logroño,cuando,tu,estés,con,los,Ñandúes,Los,Ñandúes,de,Ñuñez,Sí
weight,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667
original_position,"[0, 0]","[0, 2]","[0, 4]","[0, 6]","[0, 8]","[0, 10]","[0, 12]","[0, 14]","[0, 16]","[0, 18]","[0, 20]","[0, 22]","[0, 24]","[0, 26]","[0, 28]"


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
word,,,,,,,,,,. ¿,,,,? ¡,!
weight,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
original_position,"[0, 1]","[0, 3]","[0, 5]","[0, 7]","[0, 9]","[0, 11]","[0, 13]","[0, 15]","[0, 17]","[0, 19]","[0, 21]","[0, 23]","[0, 25]","[0, 27]","[0, 29]"


'---------'

['Él',
 'estará',
 'en',
 'Logroño',
 'cuando',
 'tu',
 'estés',
 'con',
 'los',
 'Ñandúes',
 'Los',
 'Ñandúes',
 'de',
 'Ñuñez',
 'Sí']

In [137]:
def split_on_spaces(text):
    spaces = r"\s"
    transformed = []
    excesses = []
    for (i, word) in enumerate(text.words):
        split_text = re.split(spaces, word.word)
        split_excess = re.findall(spaces, word.word)
        new_weight = word.weight / float(len(split_text))
        transformed.extend([Word(transformed, new_weight, word.original_position + [i*2]) for (i, transformed) in enumerate(split_text)])
        excesses.extend([Word(excess, 0.0, word.original_position+[i*2+1]) for (i, excess) in enumerate(split_excess)])
    return Text(transformed, excesses)

def remove_punctuation(text):
    punc_regex = r"[^\w]"
    for (i, word) in enumerate(text.words):
        

def weight_by_word_frequency(corpus):
    def inner(text):
        pass
    
def remove_words(words):
    def inner(text)
        pass

In [138]:
text = create_text("I am a long text that will split onto multiple lines i imagine. How will that work when we see it? Who knows what will happen. one two three four five six")
text = split_on_spaces(text)
display_text(text)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
word,I,am,a,long,text,that,will,split,onto,multiple,...,knows,what,will,happen.,one,two,three,four,five,six
weight,0.03125,0.03125,0.03125,0.03125,0.03125,0.03125,0.03125,0.03125,0.03125,0.03125,...,0.03125,0.03125,0.03125,0.03125,0.03125,0.03125,0.03125,0.03125,0.03125,0.03125
original_position,"[0, 0]","[0, 2]","[0, 4]","[0, 6]","[0, 8]","[0, 10]","[0, 12]","[0, 14]","[0, 16]","[0, 18]",...,"[0, 44]","[0, 46]","[0, 48]","[0, 50]","[0, 52]","[0, 54]","[0, 56]","[0, 58]","[0, 60]","[0, 62]"


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,30
word,,,,,,,,,,,...,,,,,,,,,,
weight,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
original_position,"[0, 1]","[0, 3]","[0, 5]","[0, 7]","[0, 9]","[0, 11]","[0, 13]","[0, 15]","[0, 17]","[0, 19]",...,"[0, 43]","[0, 45]","[0, 47]","[0, 49]","[0, 51]","[0, 53]","[0, 55]","[0, 57]","[0, 59]","[0, 61]"


In [139]:
def words2df(words):
    return pd.DataFrame([{"word": word.word, "weight": word.weight, "original_position": word.original_position} for word in words])
def display_text(text):
    display(words2df(text.words).T)
    display(words2df(text.excess).T)

In [140]:
text = create_text("Yo estoy aquí en el matadero con la computadora y la botella")
display_text(text)
text = split_on_spaces(text)
display_text(text)

Unnamed: 0,0
word,Yo estoy aquí en el matadero con la computador...
weight,1.0
original_position,[0]


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
word,Yo,estoy,aquí,en,el,matadero,con,la,computadora,y,la,botella
weight,0.083333,0.083333,0.083333,0.083333,0.083333,0.083333,0.083333,0.083333,0.083333,0.083333,0.083333,0.083333
original_position,"[0, 0]","[0, 2]","[0, 4]","[0, 6]","[0, 8]","[0, 10]","[0, 12]","[0, 14]","[0, 16]","[0, 18]","[0, 20]","[0, 22]"


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
word,,,,,,,,,,,
weight,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
original_position,"[0, 1]","[0, 3]","[0, 5]","[0, 7]","[0, 9]","[0, 11]","[0, 13]","[0, 15]","[0, 17]","[0, 19]","[0, 21]"


In [162]:
import re
text = "Él estará en Logroño cuando tu estés con los Ñandúes. ¿Los Ñandúes de Ñuñez? ¡Sí!"
rex = r"[^\w]+"
words = []
excess = []
length = len(texto)
last_found = 0
for match in re.finditer(rex, texto):
    if match.start() > last_found:
        words.append(text[last_found:match.start()])
    excess.append(match.group(0))
    last_found = match.end()
if last_found < length:
    text.append[last_found:length]
display(words)
display(excess)

['Él',
 'estará',
 'en',
 'Logroño',
 'cuando',
 'tu',
 'estés',
 'con',
 'los',
 'Ñandúes',
 'Los',
 'Ñandúes',
 'de',
 'Ñuñez',
 'Sí']

[' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '. ¿', ' ', ' ', ' ', '? ¡', '!']

In [206]:
texts = pd.read_json("data/window_explorations.json", orient="records")

In [209]:
texts["value"] = 1.0

In [210]:
texts

Unnamed: 0,text,value
0,On the right i can see a lake with boats. Behi...,1.0
1,"It's a still and sleepy new year's Day, with j...",1.0
2,"a steep rocky crevice, each hold so familiar t...",1.0
3,"Die Nacht ist bereits hereingebrochen, die Ste...",1.0
4,I'm at the end of gravel covered breakwater wi...,1.0
...,...,...
328,The sky today is a clear light blue and mostly...,1.0
329,"The driveway is clean, few leaves on the groun...",1.0
330,beautiful,1.0
331,The snow has just melted this week. There is ...,1.0


In [215]:
texts_as_words = []
vocabulary = Counter()
for (_, row) in texts.iterrows():
    text = row.text
    words = regex_split_and_match(r"\s", text)
    print(words)

[('On', False), (' ', True), ('the ri', False), (' ', True), ('ht', False), (' ', True), ('i can s', False), (' ', True), ('e a la', False), (' ', True), ('e ', False), (' ', True), ('ith b', False), (' ', True), ('ats', False), (' ', True), (' Be', False), (' ', True), ('ind it ', False), ('. ¿', True), ('re ', False), (' ', True), ('re moun', False), (' ', True), ('ai', False), (' ', True), ('s wit', False), ('? ¡', True), ('or', False), ('!', True), ('st on top. On the left there is a Rockwall. ', False)]
[('It', False), (' ', True), ('s a st', False), (' ', True), ('ll', False), (' ', True), ('and sle', False), (' ', True), ('py new', False), (' ', True), ('ye', False), (' ', True), ("r's D", False), (' ', True), ('y, ', False), (' ', True), ('ith', False), (' ', True), ('just a ', False), ('. ¿', True), ('ath', False), (' ', True), ('of wind', False), (' ', True), ('an', False), (' ', True), (' the ', False), ('? ¡', True), ('as', False), ('!', True), ('onal pedestrian and car.', 