In [None]:
import spacy
import re
from spacy import Language
from spacy.tokenizer import Tokenizer
from spacy.symbols import ORTH
from spacy.util import compile_infix_regex


In [None]:
# Sample paragraph for the tokenisation demos. The fragments are joined with
# no separator, so neighbouring pieces fuse together (e.g. "food" + "my").
text = (
    "hi there!I'm mr.Chunky.Born on 27th September 2023.Highly qualified cat..."
    "i'm naughty day in and day out.Very active and deadly."
    "i sneak myself in and cajole everyone into handing over my food"
    "my sleep time is from 7:00 am to 11:00pm"
    "nah-uh,its typically the entire day ;)"
)

In [None]:
# Load the small English pipeline, then run it over the sample text.
nlp = spacy.load("en_core_web_sm")

doc = nlp(text)
print(doc)



hi there!I'm mr.Chunky.Born on 27th September 2023.Highly qualified cat...i'm naughty day in and day out.Very active and deadly.i sneak myself in and cajole everyone into handing over my foodmy sleep time is from 7:00 am to 11:00pmnah-uh,its typically the entire day ;)


In [None]:
# Collect the text of every token in the processed document.
tokens = [tok.text for tok in doc]
print(tokens)

['hi', "there!I'm", 'mr', '.', 'Chunky', '.', 'Born', 'on', '27th', 'September', '2023.Highly', 'qualified', 'cat', '...', "i'm", 'naughty', 'day', 'in', 'and', 'day', 'out', '.', 'Very', 'active', 'and', 'deadly.i', 'sneak', 'myself', 'in', 'and', 'cajole', 'everyone', 'into', 'handing', 'over', 'my', 'foodmy', 'sleep', 'time', 'is', 'from', '7:00', 'am', 'to', '11:00pmnah', '-', 'uh', ',', 'its', 'typically', 'the', 'entire', 'day', ';)']


In [None]:
#Add custom tokenisation:
#1. Create a custom tokenisation function.
#2. Define the infix rules as regular expressions.
#3. Compile the infixes using compile_infix_regex.
#4. Return a Tokenizer instance built with nlp.vocab (storage for lexical items) and infix_finditer (to identify the infixes).

def custom_token(nlp):
    """Build a tokenizer for ``nlp`` that uses a single custom infix rule.

    The rule matches ``+``, ``-``, ``*`` or ``^`` when the character is
    preceded by a digit and followed by a digit or ``-`` (e.g. the ``-``
    in ``7-3``).

    Args:
        nlp: A loaded spaCy pipeline; only its ``vocab`` is used here.

    Returns:
        A ``Tokenizer`` constructed from ``nlp.vocab`` and the compiled infix
        matcher. No prefix, suffix or special-case rules are passed to it.
    """
    infix = (
        # ``(?<=[0-9])`` is a lookbehind for a digit. The earlier spelling
        # ``(?=<[0-9])`` was a lookahead for a literal "<", which can never
        # be satisfied by one of the operator characters, so the rule was dead.
        r"(?<=[0-9])[+\-\*^](?=[0-9-])",
    )
    infix_final = compile_infix_regex(infix)
    return Tokenizer(nlp.vocab, infix_finditer=infix_final.finditer)



In [None]:
# Reload the pipeline and swap its tokenizer for the one custom_token() builds.
nlp = spacy.load("en_core_web_sm")
nlp.tokenizer = custom_token(nlp)

print(nlp)
type(nlp)

<spacy.lang.en.English object at 0x7bd4d72baa70>


In [None]:
# Run the sample text through the pipeline that now uses the custom tokenizer.
doc = nlp(text)
print(doc)

# Token texts as produced by the custom tokenizer.
words = [tok.text for tok in doc]
print(words)

hi there!I'm mr.Chunky.Born on 27th September 2023.Highly qualified cat...i'm naughty day in and day out.Very active and deadly.i sneak myself in and cajole everyone into handing over my foodmy sleep time is from 7:00 am to 11:00pmnah-uh,its typically the entire day ;)
['hi', "there!I'm", 'mr.Chunky.Born', 'on', '27th', 'September', '2023.Highly', 'qualified', "cat...i'm", 'naughty', 'day', 'in', 'and', 'day', 'out.Very', 'active', 'and', 'deadly.i', 'sneak', 'myself', 'in', 'and', 'cajole', 'everyone', 'into', 'handing', 'over', 'my', 'foodmy', 'sleep', 'time', 'is', 'from', '7:00', 'am', 'to', '11:00pmnah-uh,its', 'typically', 'the', 'entire', 'day', ';)']


In [None]:
# Special cases can be added so the tokenizer copes with colloquialisms and
# other unusual strings. This text is the input for that demo; its pieces are
# joined without separators.
text2 = (
    "i wanna go play today."
    "lemme tell you,playing kepts u active and releases endorphins"
    "endorphins are hapy harmones"
    "it is C.P.U for physical activity"
)



In [None]:
# Each special case is a list of dicts; the ORTH value of each dict is one
# piece the tokenizer should emit for the matching string.
lemme_case = [
    {ORTH: "lem"},
    {ORTH: "me"},
]
wanna_case = [
    {ORTH: "wan"},
    {ORTH: "na"},
]
cpu_case = [
    {ORTH: "C.P.U"},
]

# Register the cases on the tokenizer currently attached to nlp.
nlp.tokenizer.add_special_case("lemme", lemme_case)
nlp.tokenizer.add_special_case("wanna", wanna_case)
nlp.tokenizer.add_special_case("C.P.U", cpu_case)


In [None]:

# Tokenise the second text and pair each token's text with its orth_ value.
doc = nlp(text2)
words = [(tok.text, tok.orth_) for tok in doc]
print(words)

[('i', 'i'), ('wan', 'wan'), ('na', 'na'), ('go', 'go'), ('play', 'play'), ('today.lemme', 'today.lemme'), ('tell', 'tell'), ('you,playing', 'you,playing'), ('kepts', 'kepts'), ('u', 'u'), ('active', 'active'), ('and', 'and'), ('releases', 'releases'), ('endorphinsendorphins', 'endorphinsendorphins'), ('are', 'are'), ('hapy', 'hapy'), ('harmonesit', 'harmonesit'), ('is', 'is'), ('C.P.U', 'C.P.U'), ('for', 'for'), ('physical', 'physical'), ('activity', 'activity')]


In [None]:
# Input for the ellipsis-based sentence boundary demo.
text3 = "hello...im vibha...learning spacy... and nltk"

In [None]:
@Language.component("custom_elipses_sent")
def custom_elipses_sent(doc):
    """Flag the token that follows each "..." token as a sentence start.

    The last token is not inspected because nothing follows it. The same
    doc object is returned after the flags are set.
    """
    for token in doc[:-1]:
        if token.text != "...":
            continue
        following = doc[token.i + 1]
        following.is_sent_start = True
    return doc


# Insert the component into the current pipeline, ahead of the parser.
nlp.add_pipe("custom_elipses_sent", before="parser")




In [None]:
# Start from a fresh pipeline and attach the ellipsis boundary component to it.
# The component must be added to *this* nlp object: an add_pipe call made on a
# previously loaded pipeline is lost as soon as nlp is reassigned.
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("custom_elipses_sent", before="parser")

# The default tokenizer is kept on purpose. The tokenizer from custom_token()
# leaves "..." attached to neighbouring words (e.g. "hello...im"), so the
# component's `token.text == "..."` check would never be true.
doc = nlp(text3)

# Show the sentences found, then every token followed by a blank line.
for index, sentence in enumerate(doc.sents, start=1):
    print(f'Sentence {index}:{sentence}')
for token in doc:
    print(token.text)
    print()

Sentence 1:hello...im vibha...learning spacy... and nltk
hello...im

vibha...learning

spacy...

and

nltk

