In [1]:
import pandas as pd
import nltk
import numpy as np
import numpy.ma as ma
from nltk.tokenize import word_tokenize

In [2]:
import sys
sys.path.append('/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages')

import spacy
import nltk
import stanza

In [3]:
def tag_seq(seq, model_type, model=None):
    
    if model_type == 'spacy':
        if not model:
            model = spacy.load("en_core_web_sm")
        return [(token.text, token.pos_, token.lemma_) for token in model(seq)]

class MorphTagger():
    
    """
    a class for morhological tagging of English text
    args:
        model -- callable which tags a sequence (spacy/nltk/stanza)
        model_type -- string in ['spacy', 'nltk', 'stanza']
    """
    
    def __init__(self, 
                 model_type,
                 model=None):
        
        self.model = model
        self.model_type = model_type
        
    def load_model(self):
        
        if not self.model:
            if self.model_type == 'spacy':
                self.model = spacy.load("en_core_web_sm")
            elif self.model_type == 'stanza':
                self.model = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos')
        
    def tag_seq(self, seq, text_only=True):
        
        if text_only:
            return [token.text for token in self.model(seq)]
        else:
            return [token for token in self.model(seq)]
        
    def predict(self, seq):
        
        return [token[1] for token in tag_seq(seq, self.model_type, self.model)]

In [20]:
import re

class LinearRules():
    
    def __init__(self, 
                 tagger, 
                 neg_markers=['not', 'no'],
                 neg_position=2,
                 pst_position=3):
        
        self.tagger = tagger
        self.tagger.load_model()
        self.neg_markers = neg_markers
        self.neg_position = neg_position
        self.pst_position = pst_position
        
    def shift_negation(self, sent, as_list=False):
        
        tg_sent = tagger.tag_seq(sent, text_only=False)
        is_neg = lambda token: str(token.morph) == 'Polarity=Neg'
        neg_mask = np.array([is_neg(token) for token in tg_sent])
        if neg_mask.sum() == 0:
            return None
        lemmatize = lambda token: token.text if token.lemma_ not in self.neg_markers else None
        
        lm_sent = [lemmatize(token) for token in tg_sent if lemmatize(token)]
        position = min(self.pst_position, len(lm_sent)-1)
        if position != len(lm_sent)-1:
            while (tg_sent[position].pos_ == 'PUNCT') or (tg_sent[position].text == '\n') and (position < len(lm_sent)-1):
                position += 1
            lm_sent = lm_sent[:position] + ['not'] + lm_sent[position:]
        
        if as_list:
            return lm_sent
        else:
            return ' '.join(lm_sent)    
        
    def question_reverse(self, sent):
        if len(sent) >= 1:
            if sent[-1] == '?' and re.match(r'\w', sent):
                tr_sent = tagger.tag_seq(sent, text_only=True)[-2::-1] + ['?']
                return ' '.join(tr_sent)
            else:
                return None
        
    
    def shift_past(self, sent, as_list=False):
        
        tg_sent = tagger.tag_seq(sent, text_only=False)
        is_pst_verb = lambda token: str(token.morph) == 'Tense=Past|VerbForm=Fin'
        pst_mask = np.array([is_pst_verb(token) for token in tg_sent])
        if pst_mask.sum() == 0:
            return None
        
        position = min(self.pst_position, len(tg_sent)-1)
        pst_positions = np.arange(len(tg_sent))[pst_mask]
        while ((tg_sent[position].pos_ == 'PUNCT') or (tg_sent[position].text == '\n')) \
            and (position < len(tg_sent)-1):
            position += 1
                    
        def lemmatize(token):
            if str(token.morph) == 'Tense=Past|VerbForm=Fin':
                if token.i == position:
                    return token.text
                else:
                    return token.lemma_
            else:
                if token.i == position:
                    return token.lemma_ + 'ed'
                else:
                    return token.text
                    
        lm_sent = [lemmatize(token) for token in tg_sent]
        
        if as_list:
            return lm_sent
        else:
            return ' '.join(lm_sent)

In [22]:
tagger = MorphTagger('spacy')
rules = LinearRules(tagger)
s = 'i died soon and didn\'t i want burgers?'
s = rules.shift_past(s)
print(s)
s = rules.shift_negation(s)
print(s)
s = rules.question_reverse(s)
print(s)
s
# rules.question_reverse('a((?')
# sent_tokenize('a? b.')

i die soon anded do n't i want burgers ?
i die soon not anded do i want burgers ?
burgers want i do anded not soon die i ?


'burgers want i do anded not soon die i ?'

In [32]:
from tqdm import tqdm 
tqdm.pandas()
import pandas as pd
import nltk
import numpy as np
import numpy.ma as ma
from nltk.tokenize import sent_tokenize
import re
import argparse
import os


def process_text(text, f):
    
    res = ''
    # tokenize into sents
    sents = sent_tokenize(text)
    # split headers from body text
    sents = sum([sent.split('\n') for sent in sents], [])
    for sent in sents:
        tr_sent = f(sent)
        if tr_sent:
            res += tr_sent + '\n'
            
    # only return contentful results
    if res != []:
        return res
    else: 
        return None
    
def process_file(fn, func, output_path):
    
    data = pd.read_parquet(fn)
    processed_text = data['text'][:10].progress_apply(lambda x: process_text(x, func) if process_text(x, func) else x)
    print(processed_text)
    pd.DataFrame({
        'processed_text': processed_text,
        'text': data['text']
    }).to_parquet(f'{output_path}/{func.__name__}_{fn}')


100%|██████████| 10/10 [00:03<00:00,  2.90it/s]

0    Rajanakatti is a village in Belagavi district ...
1    The great selling not point of Hinks lamps was...
2    Reghan Tumilty (born 26 February 1997) is a Sc...
3    Belen Belediyespor is a football club located ...
4    German submarine U-185 was a Type IXC/40 U-boa...
5    This station does not have connections to feed...
6    The 2003 Dwars door Vlaanderen was the 58th ed...
7    Scapanops is distinguished not from other diss...
8    Victor Frederick "Vic" Snyder (born September ...
9    According to Hitbox not developer Woodley Nye ...
Name: text, dtype: object





In [None]:
pd.read_parque

In [305]:
list(filter(lambda x: 'parquet' in x, os.listdir('./')))

['train-00007-of-00042-ad702ac8373a9f6a.parquet']

In [303]:
rules.shift_past.__name__

'shift_past'

## testing

In [1]:
import pandas as pd

data = pd.read_parquet('train-00007-of-00042-ad702ac8373a9f6a.parquet', engine='pyarrow')
data.shape

ImportError: Missing optional dependency 'pyarrow'. pyarrow is required for parquet support. Use pip or conda to install pyarrow.

In [309]:
import en_core_web_sm
nlp = en_core_web_sm.load()

In [34]:
print(data['text'].sample(1).values[0])

Ernesto Korrodi (Zürich, 31 January 1870 – Leiria, 3 February 1944), was a Swiss-born architect who moved to Portugal aged 19, spending the remainder of his life there.

He later adopted Portuguese citizenship, and married a Portuguese woman. He died in 1944.

Main works
He has more than 400 works in all Portugal of which the most important are:
 Castle of D. Chica
 Hotel Guadiana in the town of Vila Real de Santo António, the oldest Hotel in the Algarve.
 Restoration of Leiria Castle
 Church of Santa Catarina da Serra, Leiria (1902)

External links
 

20th-century Portuguese architects
1870 births
1944 deaths


In [37]:
from tqdm import tqdm
tqdm.pandas()

In [42]:
data['sents'].sample(1).values[0]

['The pale martin or pale sand martin (Riparia diluta) is a small passerine bird in the swallow family.',
 'It is found in open habitats such as farmland, grassland and savannah, usually near water.',
 'It is found from Central Asia to southeastern China.',
 'The species was formerly considered a subspecies of the sand martin.',
 'References\n\nRasmussen, P.C., and J.C. Anderton.',
 '2005.',
 'Birds of South Asia.',
 'The Ripley guide.',
 'Volume 2: attributes and status.',
 'Smithsonian Institution and Lynx Edicions, Washington D.C. and Barcelona.',
 'pale martin\nBirds of Afghanistan\nBirds of China\nBirds of Central Asia\nBirds of Mongolia\nBirds of Pakistan\npale martin']

In [236]:
def process_text(text, f):
    
    res = []
    sents = sent_tokenize(text)
    sents = sum([sent.split('\n') for sent in sents], [])
    for sent in sents:
        tr_sent = f(sent)
        if tr_sent:
            res.append(tr_sent)
            
    if res != []:
        return res
    else: 
        return None

## reverse questions

In [264]:
q_reversed = data['text'][:10_000].progress_apply(lambda x: process_text(x, rules.question_reverse))

100%|██████████| 10000/10000 [00:08<00:00, 1112.50it/s]


In [265]:
q_reversed[~q_reversed.isnull()].shape

(190,)

In [270]:
q_reversed[~q_reversed.isnull()].sample(1).values[0]

['stores toy at buy can you ink invisible same the using spies Russian the Were , Writers Ghost ?']

In [271]:
pd.DataFrame({
    'q_reversed': q_reversed[~q_reversed.isnull()],
    'text': data['text'][:10_000][~q_reversed.isnull()]
}).to_csv('q_reversed.csv')

## shift negation

In [272]:
neg_shifted = data['text'][:1000].progress_apply(lambda x: process_text(x, rules.shift_negation))

100%|██████████| 1000/1000 [04:32<00:00,  3.67it/s]


In [275]:
neg_shifted[~neg_shifted.isnull()].shape

(238,)

In [276]:
neg_shifted[~neg_shifted.isnull()].sample(1).values[0]

['this eucalypt be classify not as " threaten " by the western Australian Government Department of Parks and Wildlife .']

In [202]:
pd.DataFrame({
    'neg_shifted': neg_shifted[~neg_shifted.isnull()],
    'text': data['text'][:1000][~neg_shifted.isnull()]
}).to_csv('neg_shifted.csv')

## shift past

In [210]:
pst_shifted = data['text'][:1000].progress_apply(lambda x: process_text(x, rules.shift_past))

100%|██████████| 1000/1000 [02:55<00:00,  5.70it/s]


In [211]:
pst_shifted[~pst_shifted.isnull()].sample(1).values[0]

["Atomix was received positively ; reviewersed note the game 's addictiveness and enjoyable gameplay , though criticize its repetitiveness .",
 'Development \n\n Amiga Format reviewed a pre - release version in its May , 1990 issue .',
 'It was almost a completeed version of the game although it lack sound .',
 'Reception \n\n Atomix receive warmed reactions from reviewers .',
 'They state it was highlyed enjoyable and addictive despite its high difficulty level .',
 'Reviewers also point out theed possible educational application of the game .',
 'However , certain reviewers criticizeed the game for its repetitiveness and state that it lack replayability .',
 "Some reviewers also write abouted the game 's unoriginality , noting similarities to earlier games , Xor and Leonardo .",
 'Graphics were generally considered adequateed , though not spectacular ; Zzap!64 call them " a bit dull and repetitive " and " simplistic , but slick and effective " , while CU Amiga remark that despite the

In [212]:
pst_shifted[~pst_shifted.isnull()].shape

(695,)

In [213]:
pd.DataFrame({
    'pst_shifted': pst_shifted[~pst_shifted.isnull()],
    'text': data['text'][:1000][~pst_shifted.isnull()]
}).to_csv('pst_shifted.csv')

In [1]:
import datasets

# datasets.load_dataset('olm/olm-wikipedia-20220920')
datasets.load_dataset('glue')

ModuleNotFoundError: No module named 'datasets'

In [17]:
model = spacy.load("en_core_web_sm")
# [token.pos_ for token in model('i love trains')]
token = model('i can\'t')[-1]
print(token.lemma_)

not


In [10]:
tagger = MorphTagger('spacy')
tagger.load_model()
tagger.tag_seq('whated')

['whated']

In [294]:
token.suffix_

'not'

In [297]:
token.i

2

In [95]:
dir(token)

['_',
 '__bytes__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 'ancestors',
 'check_flag',
 'children',
 'cluster',
 'conjuncts',
 'dep',
 'dep_',
 'doc',
 'ent_id',
 'ent_id_',
 'ent_iob',
 'ent_iob_',
 'ent_kb_id',
 'ent_kb_id_',
 'ent_type',
 'ent_type_',
 'get_extension',
 'has_dep',
 'has_extension',
 'has_head',
 'has_morph',
 'has_vector',
 'head',
 'i',
 'idx',
 'iob_strings',
 'is_alpha',
 'is_ancestor',
 'is_ascii',
 'is_bracket',
 'is_currency',
 'is_digit',
 'is_left_punct',
 'is_lower',
 'is_oov',
 'is_punct',
 'is_quote',
 'is_right_punct',
 'is_sent_end',
 'is_sent_start',
 'is_space',
 'is_stop',
 'is_title',
 'is_upper',
 'lang',
 'lang_',
 'le

In [215]:
import os
for fn in os.listdir('./'):
    print(fn)

train-00007-of-00042-ad702ac8373a9f6a.parquet
.DS_Store
q_reversed.csv
pst_shifted.csv
neg_shifted.csv
processing.ipynb
.ipynb_checkpoints
