<small><i>This notebook was put together by [Alexander Fridman](http://www.rocketscience.ai) and [Volha Hedranovich](http://www.rocketscience.ai) for the Lecture Course. Source and license info is on [GitHub](https://github.com/volhahedranovich/jupyter_lectures).</i></small>

# <div class="alert alert-block alert-info">Text preprocessing</div>

In [None]:
import nltk


# NLTK resources used throughout the notebook:
#   punkt / punkt_tab - tokenizer models behind word_tokenize (recent NLTK
#                       releases look up 'punkt_tab' instead of 'punkt')
#   wordnet           - WordNet corpus (synsets, lemmatizer)
#   stopwords         - stopword lists
#   webtext           - sample corpus used in the collocation example
for resource in ('punkt', 'punkt_tab', 'wordnet', 'stopwords', 'webtext'):
    nltk.download(resource)

## <div class="alert alert-block alert-success">Replacing words matching regular expressions</div>


```python
import re

replacement_patterns = [  
    (r'won\'t', 'will not'),
    (r'can\'t', 'cannot'),
    (r'let\'s', 'let us'),
    (r'i\'m', 'i am'),
    (r'ain\'t', 'is not'),
    (r'(\w+)\'ll', '\g<1> will'),
    (r'(\w+)n\'t', '\g<1> not'),
    (r'(\w+)\'ve', '\g<1> have'),
    (r'(\w+)\'s', '\g<1> is'),
    (r'(\w+)\'re', '\g<1> are'),
    (r'(\w+)\'d', '\g<1> would')
]

class RegexpReplacer:
    def __init__(self, patterns=replacement_patterns):
        self.patterns = [(re.compile(regex), repl) for (regex, repl) in patterns]
        
    def replace(self, text):
        s = text
        for pattern, repl in self.patterns:
            s = re.sub(pattern, repl, s)
        return s
            
replacer = RegexpReplacer()
replacer.replace("I should've done that thing I didn't do")
'I should have done that thing I did not do'
```

In [None]:
import re

# Ordered (regex, replacement) pairs that expand English contractions.
# Order matters: the specific irregular forms ("won't", "can't", ...) come
# before the generic suffix rules so that they are rewritten first.
# Replacement templates are raw strings: '\g' is not a valid Python string
# escape, so a non-raw literal triggers an "invalid escape sequence" warning.
replacement_patterns = [
    (r'won\'t', r'will not'),
    (r'can\'t', r'cannot'),
    (r'let\'s', r'let us'),
    (r'i\'m', r'i am'),
    (r'ain\'t', r'is not'),
    (r'(\w+)\'ll', r'\g<1> will'),
    (r'(\w+)n\'t', r'\g<1> not'),
    (r'(\w+)\'ve', r'\g<1> have'),
    (r'(\w+)\'s', r'\g<1> is'),
    (r'(\w+)\'re', r'\g<1> are'),
    (r'(\w+)\'d', r'\g<1> would'),
]

class RegexpReplacer:
    """Rewrites a text by applying a sequence of regex substitutions in order."""

    def __init__(self, patterns=replacement_patterns):
        """
        :param patterns: iterable of (regex, replacement) pairs; each regex is
                         compiled once, case-insensitively
        """
        compiled = []
        for regex, repl in patterns:
            compiled.append((re.compile(regex, re.IGNORECASE), repl))
        self.patterns = compiled

    def replace(self, text):
        """
        Applies every stored substitution to the text, one after another.

        :param text: an input text
        :return: the text after all substitutions
        """
        result = text
        for compiled_regex, substitute in self.patterns:
            result = compiled_regex.sub(substitute, result)
        return result


def replace_by_regexps(text):
    """
    Applies RegexpReplacer to the provided text.

    Exercise stub: the body is intentionally left for the reader to implement.

    :param text: an input text
    :return: the text produced by RegexpReplacer
    """
    # TODO: your code is here
    

# Check for the exercise. The expected string starts with a lower-case "let us":
# the patterns are compiled with re.IGNORECASE and the replacement text is a
# fixed lower-case string, so "Let's" loses its capital letter.
text = "Let's do some NLP staff!"
assert replace_by_regexps(text) == 'let us do some NLP staff!'

## <div class="alert alert-block alert-success">Basic cleaning</div>

For simplicity, let's lowercase the text and replace every non-word character with a space.

In [None]:
def clean_text(text):
    """
    Performs basic text cleaning.

    Exercise stub: the body is intentionally left for the reader to implement.

    :param text: an input text
    :return: a cleaned text
    """
    # TODO: your code is here
    

# Check for the exercise. In the expected string everything is lower-case, the
# apostrophe in "industry's" has become a space, and the trailing comma has
# disappeared without leaving a trailing space.
text = "Lorem Ipsum has been the industry's standard dummy text ever since the 1500s,"
assert clean_text(text) == 'lorem ipsum has been the industry s standard dummy text ever since the 1500s'

## <div class="alert alert-block alert-success">Tokenization</div>

```python
from nltk.tokenize import word_tokenize


sent = 'lorem ipsum has been the industry s standard dummy text ever since the 1500s'
word_tokenize(sent)
['lorem', 'ipsum', 'has', 'been', 'the', 'industry', 's', 'standard', 'dummy', 'text', 'ever', 'since', 'the', '1500s']
```

In [None]:
def tokenize_text(text):
    """
    Tokenizes text using word_tokenize from NLTK.

    Exercise stub: the body is intentionally left for the reader to implement.

    :param text: an input text
    :return: a list of tokens
    """
    # TODO: your code is here


# Check for the exercise. Tokens are compared as a set, so neither their order
# nor the repeated 'the' is verified here.
sent = 'lorem ipsum has been the industry s standard dummy text ever since the 1500s'
tokens = tokenize_text(sent)
assert set(tokens) == {'ipsum', '1500s', 'the', 'since', 'text', 'been', 'ever',
                       'has', 'industry', 'lorem', 's', 'standard', 'dummy'}

## <div class="alert alert-block alert-success">Removing repeated characters</div>


```python
import re
from nltk.corpus import wordnet


class RepeatReplacer:
    def __init__(self):
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
        self.repl = r'\1\2\3'
        
    def replace(self, word):
        if wordnet.synsets(word):
            return word
        repl_word = self.repeat_regexp.sub(self.repl, word)
        if repl_word != word:
            return self.replace(repl_word)
        return repl_word
    
    
replacer = RepeatReplacer()
replacer.replace('goose')
'goose'
replacer.replace('looooove')
'love'
```

In [None]:
import re
from nltk.corpus import wordnet


class RepeatReplacer:
    """Strips repeated characters from a word until WordNet recognises it."""

    def __init__(self):
        # Matches a word containing two identical adjacent characters; the
        # replacement keeps only one character of that pair.
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
        self.repl = r'\1\2\3'

    def replace(self, word):
        """
        Repeatedly drops one character of a doubled pair from the word.

        Stops as soon as WordNet has synsets for the current form or no
        doubled pair is left.

        :param word: an input word
        :return: the shortened word
        """
        current = word
        while not wordnet.synsets(current):
            shortened = self.repeat_regexp.sub(self.repl, current)
            if shortened == current:
                # Nothing left to collapse.
                break
            current = shortened
        return current
    

def remove_repeated_characters(text_tokens):
    """
    Removes repeated letters from tokens.

    Exercise stub: the body is intentionally left for the reader to implement.

    :param text_tokens: a list of text's tokens
    :return: tokens list
    """
    # TODO: your code is here


# Check for the exercise. Note that the expected tokens are not all correctly
# spelled words ('preprocesing', 'stagees').
# NOTE(review): presumably these are exactly what RepeatReplacer yields for
# these inputs with the WordNet data in use - confirm when implementing.
text_tokens = ['I', 'wooooould', 'like', 'to', 'showwww', 'you',
               'basic', 'text', 'preprocessing', 'stageeeeees']
assert remove_repeated_characters(text_tokens) == ['I', 'would', 'like', 'to', 'show',
                                            'you', 'basic', 'text', 'preprocesing', 'stagees']

## <div class="alert alert-block alert-success">Stopwords removal</div>


```python
from nltk.corpus import stopwords

en_stopwords = set(stopwords.words('english'))

tokens = ['lorem', 'ipsum', 'has', 'been', 'the', 'industry', 's', 'standard',
          'dummy', 'text', 'ever', 'since', 'the', '1500s']
tokens = [t for t in tokens if t not in en_stopwords]
```

In [None]:
def remove_stopwords(text_tokens):
    """
    Removes stopwords and tokens shorter than 3 characters from a list of tokens.

    Exercise stub: the body is intentionally left for the reader to implement.

    :param text_tokens: a list of text's tokens
    :return: filtered tokens list
    """
    # TODO: your code is here
    

# Check for the exercise. Compared with the input, 'has', 'been', both 'the'
# and the one-letter token 's' are missing from the expected list; the order
# of the remaining tokens is preserved.
tokens = ['lorem', 'ipsum', 'has', 'been', 'the', 'industry', 's', 'standard',
          'dummy', 'text', 'ever', 'since', 'the', '1500s']
assert remove_stopwords(tokens) == ['lorem', 'ipsum', 'industry', 'standard',
                                    'dummy', 'text', 'ever', 'since', '1500s']

## <div class="alert alert-block alert-success">Adding n-grams</div>


```python
from nltk.corpus import webtext, stopwords
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures


stopset = set(stopwords.words('english'))
filter_stops = lambda w: len(w) < 3 or w in stopset

words = [w.lower() for w in webtext.words('grail.txt')]

bcf = BigramCollocationFinder.from_words(words)
print(bcf.nbest(BigramAssocMeasures.likelihood_ratio, 4))
[("'", 's'), ('arthur', ':'), ('#', '1'), ("'", 't')]

bcf.apply_word_filter(filter_stops)
print(bcf.nbest(BigramAssocMeasures.likelihood_ratio, 4))
[('black', 'knight'), ('clop', 'clop'), ('head', 'knight'), ('mumble', 'mumble')]
```

### Exercise:
1. Fetch 20newsgroups dataset
1. Combine the first 100 texts into a single string
1. Lowercase and split by ' '
1. Filter stopwords
1. Find top 10 bigrams
1. Find top 10 trigrams

In [None]:
# TODO: your code is here

## <div class="alert alert-block alert-success">Spelling correction</div>


```python
import enchant
from nltk.metrics import edit_distance


class SpellingReplacer:
    def __init__(self, dict_name='en', max_dist=2):
        self.spell_dict = enchant.Dict(dict_name)
        self.max_dist = max_dist
    
    def replace(self, word):
        if self.spell_dict.check(word):
            return word
        suggestions = self.spell_dict.suggest(word)
        if suggestions and edit_distance(word, suggestions[0]) <= self.max_dist:
            return suggestions[0]
        return word
    
    
replacer = SpellingReplacer()
replacer.replace('cookbok')
'cookbook'
```

In [None]:
import enchant
from nltk.metrics import edit_distance


class SpellingReplacer:
    """Replaces a misspelled word with the top enchant suggestion when it is close enough."""

    def __init__(self, dict_name='en', max_dist=2):
        """
        :param dict_name: name of the enchant dictionary to load
        :param max_dist: largest edit distance at which a suggestion is accepted
        """
        self.spell_dict = enchant.Dict(dict_name)
        self.max_dist = max_dist

    def replace(self, word):
        """
        Returns the first suggestion for a word the dictionary rejects.

        The word is returned unchanged when the dictionary accepts it, when
        there are no suggestions, or when the first suggestion is further
        than max_dist edits away.

        :param word: an input word
        :return: the corrected (or original) word
        """
        if not self.spell_dict.check(word):
            candidates = self.spell_dict.suggest(word)
            if candidates:
                best = candidates[0]
                if edit_distance(word, best) <= self.max_dist:
                    return best
        return word

    
def correct_spelling(text_tokens):
    """
    Corrects spelling using enchant package.

    Exercise stub: the body is intentionally left for the reader to implement.

    :param text_tokens: an input tokens list
    :return: a token list
    """
    # TODO: your code is here


# Check for the exercise. 'mother' is expected to stay unchanged, while
# 'fother' is expected to become 'other'.
# NOTE(review): the expected corrections presumably depend on the installed
# enchant dictionary - confirm on the target environment.
tokens = ['cookbokc', 'mother', 'fother', 'pythen']
assert correct_spelling(tokens) == ['cookbook', 'mother', 'other', 'python']

## <div class="alert alert-block alert-success">Lemmatization</div>


```python
from nltk.stem import WordNetLemmatizer


lemmatizer = WordNetLemmatizer()
lemmatizer.lemmatize('cooking', 'v')
'cook'
lemmatizer.lemmatize('texts', 'n')
'text'
```

In [None]:
def lemmatize(text_tokens):
    """
    Lemmatizes the provided list of tokens.

    Exercise stub: the body is intentionally left for the reader to implement.

    :param text_tokens: an input tokens list
    :return: a token list
    """
    # TODO: your code is here


# Check for the exercise: every plural noun is expected in its singular form,
# in the original order.
tokens = ['texts', 'books', 'tables', 'pythons']
assert lemmatize(tokens) == ['text', 'book', 'table', 'python']

## <div class="alert alert-block alert-success">Stemming</div>


```python
from nltk.stem.porter import PorterStemmer


stemmer = PorterStemmer()

plurals = ['caresses', 'flies', 'dies', 'mules', 'denied',
           'died', 'agreed', 'owned', 'humbled', 'sized',
           'meeting', 'stating', 'siezing', 'itemization',
           'sensational', 'traditional', 'reference', 'colonizer',
           'plotted']
singles = [stemmer.stem(plural) for plural in plurals]

['caress', 'fli', 'die', 'mule', 'deni', 'die', 'agre', 'own',
 'humbl', 'size', 'meet', 'state', 'siez', 'item', 'sensat', 'tradit',
 'refer', 'colon', 'plot']

```

In [None]:
def stem(text_tokens):
    """
    Stems the provided list of tokens.

    Exercise stub: the body is intentionally left for the reader to implement.

    :param text_tokens: an input tokens list
    :return: a token list
    """
    # TODO: your code is here


# Check for the exercise. Unlike the lemmatization check, the expected stem of
# 'tables' is 'tabl', i.e. a stem does not have to be a dictionary word.
tokens = ['texts', 'books', 'tables', 'pythons']
assert stem(tokens) == ['text', 'book', 'tabl', 'python']

## <div class="alert alert-block alert-success">Adding synonyms</div>


```python
from nltk.corpus import wordnet


synset = wordnet.synsets('dummy')[0]
synset.lemma_names()
['dummy', 'silent_person']
```

In [None]:
def add_synonyms(text_tokens, n_synonyms=2):
    """
    Adds synonyms to tokens list.

    Exercise stub: the body is intentionally left for the reader to implement.

    :param text_tokens: an input tokens list
    :param n_synonyms: count of synonyms to add
                       (NOTE(review): presumably per token - confirm)
    :return: a token list
    """
    # TODO: your code is here


# Check for the exercise. The result is compared as a set, so order and
# duplicates are not verified. The expected set holds every input token plus
# 'of_all_time', 'criterion', 'textual_matter' and 'silent_person'.
tokens = ['lorem', 'ipsum', 'industry', 'standard', 'dummy', 'text', 'ever', 'since', '1500s']
assert set(add_synonyms(tokens)) == {'industry', 'lorem', 'since',
                                     'ever', 'of_all_time', 'ipsum',
                                     'text', 'criterion', 'standard',
                                     'textual_matter', 'dummy', 'silent_person',
                                     '1500s'}

# <div class="alert alert-block alert-info">Classifying the 20 newsgroups dataset</div>

#### Loading dataset

In [None]:
from sklearn.datasets import fetch_20newsgroups


# Fetch the dataset with headers, footers and quoted replies removed from every
# post, then unpack the raw texts and their labels.
dataset = fetch_20newsgroups(remove=('headers', 'footers', 'quotes'))
X, y = dataset['data'], dataset['target']

#### Applying preprocessing

In [None]:
def text_preprocessing_pipeline(X):
    """
    Runs every raw text through the preprocessing steps defined in this notebook.

    Steps, in order: contraction expansion, basic cleaning, tokenization,
    repeated-character removal, stopword removal, lemmatization and synonym
    expansion. The resulting tokens are joined back with single spaces.

    :param X: an iterable of raw texts
    :return: a list of preprocessed texts, one per input text
    """
    # tqdm.auto selects the notebook or the console progress bar as appropriate
    # and replaces the deprecated tqdm_notebook alias.
    from tqdm.auto import tqdm

    X_processed = []

    for x in tqdm(X):
        x = replace_by_regexps(x)
        x = clean_text(x)
        x = tokenize_text(x)
        x = remove_repeated_characters(x)
        x = remove_stopwords(x)
        # correct_spelling is deliberately left out of the pipeline: it is too
        # slow to run over the whole dataset.
        x = lemmatize(x)
        x = add_synonyms(x)
        x = ' '.join(x)
        X_processed.append(x)

    return X_processed

# Always preprocess the raw documents held by `dataset`, so re-running this cell
# does not push already-processed text through the pipeline a second time.
X = text_preprocessing_pipeline(dataset['data'])

#### Saving preprocessed data

In [None]:
import pickle


# Persist the preprocessed texts together with the labels, then read them back
# from the same file.
cache_file = 'data.p'

with open(cache_file, mode='wb') as sink:
    pickle.dump((X, y), sink)

with open(cache_file, mode='rb') as source:
    X, y = pickle.load(source)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder

#### Train/test splitting

In [None]:
# Hold out 20% of the data for testing, keeping the class proportions of `y`.
# A fixed random_state makes the split (and every score computed from it)
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

#### Building pipeline

In [None]:
# Exercise stub: the pipeline steps are intentionally left for the reader.
# NOTE(review): presumably a text vectorizer followed by a classifier, both
# taken from the imports cell - confirm with the course material.
pipeline = make_pipeline(
    # TODO: your code is here
)

# Show the steps of the assembled pipeline as the cell output.
pipeline.steps

#### Encoding target

In [None]:
# TODO: your code is here

#### Performing grid search cv

In [None]:
# TODO: your code is here

#### Assessing model performance

In [None]:
# TODO: your code is here