In [1]:
import spacy, re, nltk
import pandas as pd
import string
from scispacy.abbreviation import AbbreviationDetector
from autocorrect import Speller

from nltk.corpus import stopwords, wordnet

## Text pre-processing procedure
1. Import raw text data
2. Convert to lowercase
3. Remove punctuation
4. Expand abbreviations
5. Perform tokenization
6. Remove stop words
7. Perform lemmatization

## Performing Tokenization on Sample Text

In [2]:
# Load the small English spaCy model used throughout the notebook.
nlp = spacy.load('en_core_web_sm')

text = 'He said, "we\'d have eaten more than 100 hamburgers from yesterday."'

token_sentence = nlp(text)

# One record per token, in document order. Keying a dict by token text would
# silently merge repeated tokens (the sample contains two '"' tokens), so a
# list of rows is used instead.
token_columns = ['Text', 'Lemma', 'PartofSpeech', 'Tag', 'Dep', 'Shape', 'is_alpha', 'is_stop']
token_records = [
    (token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
     token.shape_, token.is_alpha, token.is_stop)
    for token in token_sentence
]

pd.DataFrame(token_records, columns=token_columns)

Unnamed: 0,Text,Lemma,PartofSpeech,Tag,Dep,Shape,is_alpha,is_stop
0,He,-PRON-,PRON,PRP,nsubj,Xx,True,True
1,said,say,VERB,VBD,ROOT,xxxx,True,False
2,",",",",PUNCT,",",punct,",",False,False
3,"""","""",PUNCT,'',punct,"""",False,False
4,we,-PRON-,PRON,PRP,nsubj,xx,True,True
5,'d,'d,VERB,MD,aux,'x,False,True
6,have,have,AUX,VB,aux,xxxx,True,True
7,eaten,eat,VERB,VBN,ccomp,xxxx,True,False
8,more,more,ADJ,JJR,amod,xxxx,True,True
9,than,than,SCONJ,IN,quantmod,xxxx,True,True


In [3]:
# Attach scispacy's abbreviation detector to the shared `nlp` pipeline so that
# processed documents expose `doc._.abbreviations` (read by preprocess_token).
# NOTE(review): re-running this cell on a live kernel will presumably fail
# because the component is already registered — confirm, and restart the kernel
# before re-executing.
abbreviation_pipe = AbbreviationDetector(nlp)
nlp.add_pipe(abbreviation_pipe)

def preprocess_token(text):
    """Return the unique, lower-cased lemmas of ``text`` using the spaCy pipeline.

    The text is parsed once to locate abbreviations (via ``doc._.abbreviations``),
    every abbreviation is replaced by its long form, and the expanded text is
    parsed again. Blank tokens, stop words and punctuation are dropped.

    Args:
        text: Raw sentence to process.

    Returns:
        A list of distinct lemmas, in order of first appearance.
    """
    def is_token_allowed(token):
        """Keep tokens that are not blank, stop words, or punctuation."""
        # `token.text` is used instead of `token.string`, which is not
        # available in newer spaCy releases.
        return bool(token.text.strip()) and not token.is_stop and not token.is_punct

    doc = nlp(text)
    tokens = [token.text for token in doc]

    # Map each abbreviation span to its long form; keying by (start, end)
    # guards against the same span being reported twice.
    expansions = {
        (abrv.start, abrv.end): str(abrv._.long_form)
        for abrv in doc._.abbreviations
    }
    # Replace the whole span (not only its first token) and work right to left
    # so that the offsets of earlier spans remain valid after each splice.
    for (start, end), long_form in sorted(expansions.items(), reverse=True):
        tokens[start:end] = [long_form]

    expanded_doc = nlp(' '.join(tokens))

    lemmas = (
        token.lemma_.strip().lower()
        for token in expanded_doc
        if is_token_allowed(token)
    )
    # dict.fromkeys de-duplicates while keeping a deterministic order
    # (a plain set would yield a different order from run to run).
    return list(dict.fromkeys(lemmas))

# Run the spaCy-based pipeline on the sample sentence; the bare name on the
# last line makes the notebook display the resulting token list.
preprocessed_tokens = preprocess_token(text)
preprocessed_tokens

['say', '100', 'yesterday', 'eat', 'hamburger']

## Import and print Text data file

In [4]:
# Read the question base, one sentence per line, without the trailing newline.
# The path is a raw string so that "\S" is not parsed as an (invalid) escape
# sequence; the resulting path is unchanged.
with open(r'E:\School stuff/1.3.2 workshop/1.3.2 workshop/questionbase_raw.txt', encoding='UTF-8') as file:
    raw_sentences = [sentence.replace('\n', '') for sentence in file.readlines()]

# Drop the 'Q' / 'A' marker lines. A new list is built because removing items
# from a list while iterating over it skips the element after each removal.
raw_sentences = [raw_sentence for raw_sentence in raw_sentences
                 if raw_sentence not in ('Q', 'A')]

for i, raw_sentence in enumerate(raw_sentences):
    print(i, raw_sentence)

0 Hello
1 Hello, I am ASD knowledge bot. Feel free to ask me anything about autism spectrum disorder (ASD).
2 What is definition of Autistic Spectrum Disorder?
3 Autism, or autism spectrum disorder (ASD), refers to a broad range of conditions characterized by challenges with social skills, repetitive behaviors, speech and nonverbal communication. According to the Centers for Disease Control, autism affects an estimated 1 in 54 children in the United States today.
4 What are the symptoms of Autistic Spectrum Disorder?
5 Making little or inconsistent eye contact. 
6 Tending not to look at or listen to people.
7 Rarely sharing enjoyment of objects or activities by pointing or showing things to others.
8 Failing to, or being slow to, respond to someone calling their name or to other verbal attempts to gain attention.
9 Having difficulties with the back and forth of conversation.
10 Often talking at length about a favorite subject without noticing that others are not interested or without g

## Provided processing wrapper functions

We used the `Speller` class from the `autocorrect` library to implement the `spell_checker` function.

#### Quiz: How to implement this function without using str.lower()?
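We can use `str.casefold()` instead of `str.lower()`. Note that `casefold()` is more aggressive than `lower()`: for example, it converts the German 'ß' to 'ss'.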

In [5]:
def lower_casing(sentence):
    """Return ``sentence`` converted to lower case."""
    return sentence.lower()

# Contraction patterns, compiled once at definition time instead of on every
# call. Order matters: the specific forms ("won't", "can't", ...) must be tried
# before the generic "<word>n't" rule. Raw strings keep "\g<1>" and "\w" from
# being read as (invalid) string escape sequences.
_CONTRACTION_PATTERNS = [
    (re.compile(regex), repl)
    for regex, repl in (
        (r"won't", r'will not'),
        (r"can't", r'cannot'),
        (r"i'm", r'i am'),
        (r"ain't", r'is not'),
        (r"(\w+)'ll", r'\g<1> will'),
        (r"(\w+)n't", r'\g<1> not'),
        (r"(\w+)'ve", r'\g<1> have'),
        (r"(\w+)'s", r'\g<1> is'),
        (r"(\w+)'re", r'\g<1> are'),
        (r"(\w+)'d", r'\g<1> would'),
    )
]


def expand_abbriviation(sentence):
    """Expand English contractions in ``sentence`` (e.g. "won't" -> "will not").

    Matching is case-sensitive, so the input is expected to be lower-cased
    already. The apostrophes must still be present for any rule to match.

    Args:
        sentence: Text that may contain contractions.

    Returns:
        The text with every recognised contraction written out in full.
    """
    new_sentence = sentence
    for pattern, repl in _CONTRACTION_PATTERNS:
        new_sentence = pattern.sub(repl, new_sentence)
    return new_sentence

# Characters stripped by punctuation_removal. The apostrophe IS part of this
# set; ':' , ';' and the backslash are not and therefore survive.
_PUNCTUATION_TO_STRIP = ',!?"<>()[]{}@#+=-_~&*^%|$/`.\''
_PUNCTUATION_TABLE = str.maketrans('', '', _PUNCTUATION_TO_STRIP)


def punctuation_removal(sentence):
    """Delete a fixed set of punctuation characters from ``sentence``.

    The removed characters are listed in ``_PUNCTUATION_TO_STRIP`` and include
    the apostrophe. A translation table is used instead of a regular expression
    so no character needs escaping.

    Args:
        sentence: Text to clean.

    Returns:
        The text with every listed punctuation character removed.
    """
    return sentence.translate(_PUNCTUATION_TABLE)

def tokenization(sentence):
    """Split ``sentence`` into a list of word tokens with NLTK's word tokenizer."""
    return nltk.word_tokenize(sentence)

def stopword_removal(sentence, stopword_path=r'E:\School stuff\1.3.2 workshop\1.3.2 workshop/stopwords.txt'):
    """Remove stop words from a tokenized sentence.

    The stop list is read from ``stopword_path`` (one word per line, compared
    in lower case). The NLTK English list that used to be loaded first was
    overwritten by the file contents before it was ever used, so that dead
    load has been dropped; the effective stop list is unchanged.

    Args:
        sentence: List of word tokens.
        stopword_path: Text file holding one stop word per line. Defaults to
            the location originally hard-coded in this function.

    Returns:
        A new list with every token that appears in the stop list removed.
    """
    with open(stopword_path) as file:
        # A set gives constant-time membership tests.
        stoplist = {stopword.replace('\n', '').lower() for stopword in file}

    return [word for word in sentence if word not in stoplist]

def get_wordnet_pos(word):
    """Map the NLTK part-of-speech tag of a single word to a WordNet POS constant.

    The word is tagged on its own (no sentence context). Tags beginning with
    J, V, N or R map to the adjective, verb, noun and adverb constants
    respectively; any other tag yields ``None``.
    """
    prefix_to_attr = {'J': 'ADJ', 'V': 'VERB', 'N': 'NOUN', 'R': 'ADV'}

    (_, tag), = nltk.pos_tag([word])
    attr_name = prefix_to_attr.get(tag[:1])
    if attr_name is None:
        return None
    # Look the constant up only when needed, so `wordnet` is not touched for
    # tags that have no WordNet equivalent.
    return getattr(wordnet, attr_name)

def lemmatization(sentence):
    """Lemmatize every token of ``sentence`` with NLTK's WordNet lemmatizer.

    Each word's part of speech comes from ``get_wordnet_pos``; when that
    returns nothing usable the word is treated as a noun.
    """
    wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()

    lemmas = []
    for word in sentence:
        pos = get_wordnet_pos(word)
        if not pos:
            pos = wordnet.NOUN
        lemmas.append(wordnet_lemmatizer.lemmatize(word, pos))
    return lemmas

# Lazily created, shared English speller. Building a Speller inside
# spell_checker on every call repeats its set-up work for each sentence.
_ENGLISH_SPELLER = None


def spell_checker(sentence):
    """Return ``sentence`` with spelling corrected by ``autocorrect.Speller``.

    The English ``Speller`` is created on first use and reused afterwards.

    Args:
        sentence: Text to correct.

    Returns:
        The corrected text.
    """
    global _ENGLISH_SPELLER
    if _ENGLISH_SPELLER is None:
        _ENGLISH_SPELLER = Speller(lang='en')

    return _ENGLISH_SPELLER(sentence)

def text_preprocessing(raw_sentence):
    """Run the full NLTK-based preprocessing pipeline on one raw sentence.

    Steps, in order: lower-casing, spell checking, contraction expansion,
    punctuation removal, tokenization, stop-word removal and lemmatization.

    Contractions are expanded BEFORE punctuation is removed: punctuation
    removal strips apostrophes, and every contraction pattern needs the
    apostrophe to match, so the opposite order makes the expansion a no-op.

    Args:
        raw_sentence: Unprocessed sentence text.

    Returns:
        A list of lemmatized tokens (duplicates are kept).
    """
    sentence = lower_casing(raw_sentence)
    sentence = spell_checker(sentence)
    sentence = expand_abbriviation(sentence)
    sentence = punctuation_removal(sentence)
    sentence = tokenization(sentence)
    sentence = stopword_removal(sentence)
    sentence = lemmatization(sentence)
    return sentence

In [6]:
# Compare the spaCy-based pipeline ("Own processing") with the NLTK-based one
# ("Provided processing") sentence by sentence.
sentence_number = 1
for raw_sentence in raw_sentences:
    # Skip 'Q' / 'A' marker lines before running either pipeline, so no
    # processing time is spent on lines that are never printed.
    if raw_sentence in ('Q', 'A'):
        continue

    processed_spacy = preprocess_token(raw_sentence)
    processed_custom = text_preprocessing(raw_sentence)

    print('Sentence', sentence_number)
    print('Original Sentence:', raw_sentence)
    print('Own processing:', sorted(processed_spacy))
    print('Provided processing:', sorted(processed_custom))
    print()
    sentence_number += 1

Sentence 1
Original Sentence: Hello
Own processing: ['hello']
Provided processing: []

Sentence 2
Original Sentence: Hello, I am ASD knowledge bot. Feel free to ask me anything about autism spectrum disorder (ASD).
Own processing: ['ask', 'autism', 'bot', 'disorder', 'feel', 'free', 'hello', 'knowledge', 'spectrum']
Provided processing: ['asd', 'asd', 'autism', 'bot', 'disorder', 'feel', 'free', 'knowledge', 'spectrum']

Sentence 3
Original Sentence: What is definition of Autistic Spectrum Disorder?
Own processing: ['autistic', 'definition', 'disorder', 'spectrum']
Provided processing: ['autistic', 'definition', 'disorder', 'spectrum']

Sentence 4
Original Sentence: Autism, or autism spectrum disorder (ASD), refers to a broad range of conditions characterized by challenges with social skills, repetitive behaviors, speech and nonverbal communication. According to the Centers for Disease Control, autism affects an estimated 1 in 54 children in the United States today.
Own processing: ['1

## Conclusion

Comparing the two preprocessing approaches, they differ in several respects:

1. spaCy's stop-word list differs slightly from the NLTK stop-word list used in the provided preprocessing functions. Some words that the provided preprocessing functions removed were not removed by spaCy's `.is_stop` check.
2. Duplicates were not removed by the provided preprocessing functions.
3. The provided preprocessing pipeline was lengthy but customizable.
4. Some punctuation was still present after the provided preprocessing functions were applied.
5. Some abbreviations, such as 'ASD', were not expanded by the provided preprocessing functions.

In conclusion, spaCy's implementation was more efficient, but an implementation that integrates the spaCy and NLTK libraries could be even more efficient and accurate.