<small><i>This notebook was put together by [Alexander Fridman](http://www.rocketscience.ai) and [Volha Hedranovich](http://www.rocketscience.ai) for the Lecture Course. Source and license info is on [GitHub](https://github.com/volhahedranovich/jupyter_lectures).</i></small>

In [1]:
import nltk


# Fetch the NLTK resources used later in this notebook (already-present packages are reported as up-to-date)
nltk.download('punkt')      # tokenizer models, needed by word_tokenize
nltk.download('wordnet')    # WordNet corpus: synsets lookup and WordNetLemmatizer
nltk.download('stopwords')  # stop-word lists read via stopwords.words('english')
nltk.download('webtext')    # webtext corpus used in the n-grams example ('grail.txt')

[nltk_data] Downloading package punkt to /home/volha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/volha/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/volha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package webtext to /home/volha/nltk_data...
[nltk_data]   Package webtext is already up-to-date!


True

## Replacing words matching regular expressions


```python
import re

replacement_patterns = [  
    (r'won\'t', 'will not'),
    (r'can\'t', 'cannot'),
    (r'let\'s', 'let us'),
    (r'i\'m', 'i am'),
    (r'ain\'t', 'is not'),
    (r'(\w+)\'ll', '\g<1> will'),
    (r'(\w+)n\'t', '\g<1> not'),
    (r'(\w+)\'ve', '\g<1> have'),
    (r'(\w+)\'s', '\g<1> is'),
    (r'(\w+)\'re', '\g<1> are'),
    (r'(\w+)\'d', '\g<1> would')
]

class RegexpReplacer:
    def __init__(self, patterns=replacement_patterns):
        self.patterns = [(re.compile(regex), repl) for (regex, repl) in patterns]
        
    def replace(self, text):
        s = text
        for pattern, repl in self.patterns:
            s = re.sub(pattern, repl, s)
        return s
            
replacer = RegexpReplacer()
replacer.replace("I should've done that thing I didn't do")
'I should have done that thing I did not do'
```

In [2]:
import re

# (regex, replacement) pairs applied in order: the specific contractions come
# first so that the generic suffix rules below do not mangle them.
# Replacements are raw strings: '\g' is not a valid string escape and a
# non-raw literal triggers an "invalid escape sequence" warning.
replacement_patterns = [
    (r'won\'t', 'will not'),
    (r'can\'t', 'cannot'),
    (r'let\'s', 'let us'),
    (r'i\'m', 'i am'),
    (r'ain\'t', 'is not'),
    (r'(\w+)\'ll', r'\g<1> will'),
    (r'(\w+)n\'t', r'\g<1> not'),
    (r'(\w+)\'ve', r'\g<1> have'),
    (r'(\w+)\'s', r'\g<1> is'),
    (r'(\w+)\'re', r'\g<1> are'),
    (r'(\w+)\'d', r'\g<1> would')
]


class RegexpReplacer:
    """
    Rewrites a text by applying a sequence of regular-expression substitutions

    Patterns are compiled case-insensitively, so "Let's" matches the
    ``let's`` rule; the replacement text is inserted exactly as written in
    the pattern list (e.g. lowercase "let us").
    """

    def __init__(self, patterns=replacement_patterns):
        """
        :param patterns: an iterable of (regex, replacement) pairs
        """
        self.patterns = [(re.compile(regex, re.IGNORECASE), repl) for regex, repl in patterns]

    def replace(self, text):
        """
        Applies every pattern, in order, to the text

        :param text: an input text
        :return: the text after all substitutions
        """
        s = text
        for pattern, repl in self.patterns:
            # each rule works on the output of the previous one
            s = pattern.sub(repl, s)
        return s


def replace_by_regexps(text):
    """
    Applies RegexpReplacer to provided text

    Exercise placeholder: there is no implementation yet, so this version
    returns None.

    :param text: an input text
    :return: result of RegexpReplacer work
    """
    # TODO: your code is here
    

def replace_by_regexps(text):
    """
    Expands contractions in the text with the default RegexpReplacer patterns
    :param text: an input text
    :return: the text after all replacement patterns have been applied
    """
    replacer = RegexpReplacer()
    expanded = replacer.replace(text)
    return expanded


# Sanity check: the contraction is expanded; matching is case-insensitive and the
# replacement text is lowercase, hence "let us" rather than "Let us"
text = "Let's do some NLP staff!"
assert replace_by_regexps(text) == 'let us do some NLP staff!'

## Basic cleaning with regexps

For simplicity, let's lowercase the text and replace all non-word characters with a space.

TODO: link or short regexp example

In [3]:
def clean_text(text):
    """
    Performs a basic text cleaning

    Exercise placeholder: there is no implementation yet, so this version
    returns None.

    :param text: an input text
    :return: a cleaned text
    """
    # TODO: your code is here
    
    
def clean_text(text):
    """
    Performs a basic text cleaning

    The text is lowercased, every non-word character is replaced with a
    space, runs of whitespace are collapsed into a single space and
    leading/trailing whitespace is stripped. Underscores and digits count as
    word characters and are kept.

    :param text: an input text
    :return: a cleaned text
    """
    import re

    text = text.lower()
    # Raw strings: '\W' and '\s' are regex escapes, not string escapes, and
    # non-raw literals trigger "invalid escape sequence" warnings.
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()


# Sanity check: the apostrophe becomes a space ("industry s") and the trailing comma disappears
text = "Lorem Ipsum has been the industry's standard dummy text ever since the 1500s,"
assert clean_text(text) == 'lorem ipsum has been the industry s standard dummy text ever since the 1500s'

## Tokenization


```python
from nltk.tokenize import word_tokenize


sent = 'lorem ipsum has been the industry s standard dummy text ever since the 1500s'
word_tokenize(sent)
['lorem', 'ipsum', 'has', 'been', 'the', 'industry', 's', 'standard', 'dummy', 'text', 'ever', 'since', 'the', '1500s']
```

In [4]:
def tokenize_text(text):
    """
    Tokenizes text using word_tokenize from NLTK

    Exercise placeholder: there is no implementation yet, so this version
    returns None.

    :param text: an input text
    :return: a list of tokens
    """
    # TODO: your code is here


def tokenize_text(text):
    """
    Splits a text into word tokens with NLTK's word_tokenize (English rules)
    :param text: an input text
    :return: a list of tokens
    """
    from nltk.tokenize import word_tokenize

    tokens = word_tokenize(text, language='english')
    return tokens


# Sanity check: compared as a set, so token order and the duplicated 'the' do not matter
sent = 'lorem ipsum has been the industry s standard dummy text ever since the 1500s'
tokens = tokenize_text(sent)
assert set(tokens) == {'ipsum', '1500s', 'the', 'since', 'text', 'been', 'ever',
                       'has', 'industry', 'lorem', 's', 'standard', 'dummy'}

## Removing repeated characters


```python
import re
from nltk.corpus import wordnet


class RepeatReplacer:
    def __init__(self):
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
        self.repl = r'\1\2\3'
        
    def replace(self, word):
        if wordnet.synsets(word):
            return word
        repl_word = self.repeat_regexp.sub(self.repl, word)
        if repl_word != word:
            return self.replace(repl_word)
        return repl_word
    
    
replacer = RepeatReplacer()
replacer.replace('goose')
'goose'
replacer.replace('looooove')
'love'
```

In [5]:
import re
from nltk.corpus import wordnet


class RepeatReplacer:
    """
    Shortens runs of repeated characters in a word

    One repeated character is removed at a time until the word is found in
    WordNet or no doubled character is left.
    """

    def __init__(self):
        # group 2 followed by a back-reference to itself matches a doubled character
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
        # re-emit the match with a single copy of the doubled character
        self.repl = r'\1\2\3'

    def replace(self, word):
        """
        Removes repeated characters from a word

        :param word: a single token
        :return: the word itself if WordNet has synsets for it, otherwise the
                 word with repeated characters removed one by one until
                 WordNet has synsets for it or nothing is left to remove
        """
        # Iterative on purpose: one character is removed per step, so a
        # recursive version exceeds the interpreter's recursion limit on
        # tokens with very long character runs.
        while not wordnet.synsets(word):
            repl_word = self.repeat_regexp.sub(self.repl, word)
            if repl_word == word:
                # no doubled character left
                break
            word = repl_word
        return word
    

def remove_repeated_characters(text_tokens):
    """
    Removes repeated letters from tokens

    Exercise placeholder: there is no implementation yet, so this version
    returns None.

    :param text_tokens: a list of text's tokens
    :return: tokens list
    """
    # TODO: your code is here
    
    
def remove_repeated_characters(text_tokens):
    """
    Passes every token through a RepeatReplacer

    :param text_tokens: a list of text's tokens
    :return: a new list with the processed tokens, in the same order
    """
    fixer = RepeatReplacer()
    return list(map(fixer.replace, text_tokens))


# Sanity check. The expected values are what RepeatReplacer produces with the WordNet
# corpus, not necessarily correct English (note 'preprocesing' and 'stagees').
text_tokens = ['I', 'wooooould', 'like', 'to', 'showwww', 'you',
               'basic', 'text', 'preprocessing', 'stageeeeees']
assert remove_repeated_characters(text_tokens) == ['I', 'would', 'like', 'to', 'show',
                                            'you', 'basic', 'text', 'preprocesing', 'stagees']

## Stopwords removal


```python
from nltk.corpus import stopwords

en_stopwords = set(stopwords.words('english'))

tokens = ['lorem', 'ipsum', 'has', 'been', 'the', 'industry', 's', 'standard',
          'dummy', 'text', 'ever', 'since', 'the', '1500s']
tokens = [t for t in tokens if t not in en_stopwords]
```

In [6]:
def remove_stopwords(text_tokens):
    """
    Removes stopwords from a given list of tokens and words shorter than 3 chars

    Exercise placeholder: there is no implementation yet, so this version
    returns None.

    :param text_tokens: a list of text's tokens
    :return: filtered tokens list
    """
    # TODO: your code is here
    

def remove_stopwords(text_tokens):
    """
    Drops English stopwords and tokens shorter than 3 characters

    :param text_tokens: a list of text's tokens
    :return: a new list with the remaining tokens, in the original order
    """
    from nltk.corpus import stopwords

    en_stopwords = set(stopwords.words('english'))
    kept = []
    for token in text_tokens:
        if token in en_stopwords or len(token) < 3:
            continue
        kept.append(token)
    return kept


# Sanity check: 'has', 'been', 'the' and the one-letter 's' are expected to be filtered out
tokens = ['lorem', 'ipsum', 'has', 'been', 'the', 'industry', 's', 'standard',
          'dummy', 'text', 'ever', 'since', 'the', '1500s']
assert remove_stopwords(tokens) == ['lorem', 'ipsum', 'industry', 'standard',
                                    'dummy', 'text', 'ever', 'since', '1500s']

## Adding n-grams


```python
from nltk.corpus import webtext, stopwords
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures


stopset = set(stopwords.words('english'))
filter_stops = lambda w: len(w) < 3 or w in stopset

words = [w.lower() for w in webtext.words('grail.txt')]

bcf = BigramCollocationFinder.from_words(words)
print(bcf.nbest(BigramAssocMeasures.likelihood_ratio, 4))
[("'", 's'), ('arthur', ':'), ('#', '1'), ("'", 't')]

bcf.apply_word_filter(filter_stops)
print(bcf.nbest(BigramAssocMeasures.likelihood_ratio, 4))
[('black', 'knight'), ('clop', 'clop'), ('head', 'knight'), ('mumble', 'mumble')]
```

### Exercise:
1. Fetch 20newsgroups dataset
1. Combine 1st 100 texts in a single line
1. Lowercase and split by ' '
1. Filter stopwords
1. Find top 10 bigrams
1. Find top 10 trigrams

In [7]:
# TODO: your code is here

In [8]:
from sklearn.datasets import fetch_20newsgroups
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures


# Fetch the 20 newsgroups posts with headers, footers and quoted replies removed
dataset = fetch_20newsgroups(remove=('headers', 'footers', 'quotes'))
texts = dataset['data'][:100]  # only the first 100 texts
texts_as_a_single_line = ' '.join(texts)
# split() without arguments splits on any run of whitespace, not only on ' '
words = texts_as_a_single_line.lower().split()

bcf = BigramCollocationFinder.from_words(words)

# NOTE(review): the exercise above asks for stopword filtering, the top 10 bigrams
# and the top 10 trigrams; this cell applies no filter, prints only 4 bigrams and
# computes no trigrams.
print(bcf.nbest(BigramAssocMeasures.likelihood_ratio, 4))

[('of', 'the'), ('i', 'have'), ('in', 'the'), ('i', 'am')]


## Spelling correction


```python
import enchant
from nltk.metrics import edit_distance


class SpellingReplacer:
    def __init__(self, dict_name='en', max_dist=2):
        self.spell_dict = enchant.Dict(dict_name)
        self.max_dist = max_dist
    
    def replace(self, word):
        if self.spell_dict.check(word):
            return word
        suggestions = self.spell_dict.suggest(word)
        if suggestions and edit_distance(word, suggestions[0]) <= self.max_dist:
            return suggestions[0]
        return word
    
    
replacer = SpellingReplacer()
replacer.replace('cookbok')
'cookbook'
```

In [9]:
import enchant
from nltk.metrics import edit_distance


class SpellingReplacer:
    """
    Replaces a misspelled word with the top suggestion of an enchant dictionary

    The suggestion is used only when its edit distance to the original word
    does not exceed ``max_dist``.
    """

    def __init__(self, dict_name='en', max_dist=2):
        """
        :param dict_name: name of the enchant dictionary to load
        :param max_dist: largest edit distance accepted for a replacement
        """
        self.spell_dict = enchant.Dict(dict_name)
        self.max_dist = max_dist

    def replace(self, word):
        """
        Corrects a single word

        :param word: a token to check
        :return: the word itself when it is empty, spelled correctly, has no
                 suggestions or the first suggestion is too far away;
                 otherwise the first suggestion
        """
        # enchant refuses to check an empty string (it raises ValueError),
        # so empty tokens are passed through untouched
        if not word:
            return word
        if self.spell_dict.check(word):
            return word
        suggestions = self.spell_dict.suggest(word)
        # only the first suggestion is considered
        if suggestions and edit_distance(word, suggestions[0]) <= self.max_dist:
            return suggestions[0]
        return word

    
def correct_spelling(text_tokens):
    """
    Corrects spelling using enchant package

    Exercise placeholder: there is no implementation yet, so this version
    returns None.

    :param text_tokens: an input tokens list
    :return: a token list
    """
    # TODO: your code is here
    

def correct_spelling(text_tokens):
    """
    Passes every token through a SpellingReplacer built with its defaults
    :param text_tokens: an input tokens list
    :return: a new list with the (possibly corrected) tokens, in the same order
    """
    speller = SpellingReplacer()
    return list(map(speller.replace, text_tokens))


# Sanity check. NOTE(review): the expected corrections (e.g. 'fother' -> 'other')
# presumably depend on the installed enchant dictionary - confirm on the target machine.
tokens = ['cookbokc', 'mother', 'fother', 'pythen']
assert correct_spelling(tokens) == ['cookbook', 'mother', 'other', 'python']

## Lemmatization


```python
from nltk.stem import WordNetLemmatizer


lemmatizer = WordNetLemmatizer()
lemmatizer.lemmatize('cooking', 'v')
'cook'
lemmatizer.lemmatize('texts', 'n')
'text'
```

In [10]:
def lemmatize(text_tokens):
    """
    Lemmatizes provided list of tokens

    Exercise placeholder: there is no implementation yet, so this version
    returns None.

    :param text_tokens: an input tokens list
    :return: a token list
    """
    # TODO: your code is here


def lemmatize(text_tokens):
    """
    Lemmatizes every token as a noun with NLTK's WordNetLemmatizer
    :param text_tokens: an input tokens list
    :return: a new list with one lemma per input token, in the same order
    """
    from nltk.stem import WordNetLemmatizer

    wnl = WordNetLemmatizer()
    lemmas = []
    for token in text_tokens:
        # 'n' asks for the noun lemma
        lemmas.append(wnl.lemmatize(token, 'n'))
    return lemmas

# Sanity check: plural nouns are expected to come back in singular form
tokens = ['texts', 'books', 'tables', 'pythons']
assert lemmatize(tokens) == ['text', 'book', 'table', 'python']

## Stemming

```python
from nltk.stem.porter import PorterStemmer


stemmer = PorterStemmer()

plurals = ['caresses', 'flies', 'dies', 'mules', 'denied',
           'died', 'agreed', 'owned', 'humbled', 'sized',
           'meeting', 'stating', 'siezing', 'itemization',
           'sensational', 'traditional', 'reference', 'colonizer',
           'plotted']
singles = [stemmer.stem(plural) for plural in plurals]

['caress', 'fli', 'die', 'mule', 'deni', 'die', 'agre', 'own',
 'humbl', 'size', 'meet', 'state', 'siez', 'item', 'sensat', 'tradit',
 'refer', 'colon', 'plot']

```

In [11]:
def stem(text_tokens):
    """
    Stems provided list of tokens

    Exercise placeholder: there is no implementation yet, so this version
    returns None.

    :param text_tokens: an input tokens list
    :return: a token list
    """
    # TODO: your code is here


def stem(text_tokens):
    """
    Stems every token with NLTK's PorterStemmer
    :param text_tokens: an input tokens list
    :return: a new list with one stem per input token, in the same order
    """
    from nltk.stem.porter import PorterStemmer

    porter = PorterStemmer()
    return list(map(porter.stem, text_tokens))

# Sanity check: unlike lemmas, stems need not be dictionary words (see 'tabl')
tokens = ['texts', 'books', 'tables', 'pythons']
assert stem(tokens) == ['text', 'book', 'tabl', 'python']

## Adding synonyms

```python
from nltk.corpus import wordnet


synset = wordnet.synsets('dummy')[0]
synset.lemma_names()
['dummy', 'silent_person']
```

In [12]:
def add_synonyms(text_tokens, n_synonyms=2):
    """
    Adds synonyms to tokens list

    Exercise placeholder: there is no implementation yet, so this version
    returns None.

    :param text_tokens: an input tokens list
    :param n_synonyms: count of synonyms to add
    :return: a token list
    """
    # TODO: your code is here
    
    
def add_synonyms(text_tokens, n_synonyms=2):
    """
    Expands tokens with lemma names of their first WordNet synset

    A token that has at least one synset is replaced by the first
    ``n_synonyms`` lemma names of its first synset, so the token itself
    survives only if it is among those names. A token without synsets is
    kept as is.

    :param text_tokens: an input tokens list
    :param n_synonyms: max count of lemma names taken per token
    :return: a token list
    """
    from nltk.corpus import wordnet

    extended_tokens = []

    for token in text_tokens:
        synsets = wordnet.synsets(token)

        if synsets:
            # only the first synset is used
            extended_tokens.extend(synsets[0].lemma_names()[:n_synonyms])
        else:
            extended_tokens.append(token)

    return extended_tokens


# Sanity check: compared as a set, so the order of the extended tokens does not matter
tokens = ['lorem', 'ipsum', 'industry', 'standard', 'dummy', 'text', 'ever', 'since', '1500s']
assert set(add_synonyms(tokens)) == {'industry', 'lorem', 'since',
                                     'ever', 'of_all_time', 'ipsum',
                                     'text', 'criterion', 'standard',
                                     'textual_matter', 'dummy', 'silent_person',
                                     '1500s'}

## Classifying 20 newsgroups

#### Loading dataset

In [13]:
from sklearn.datasets import fetch_20newsgroups


# Fetch the 20 newsgroups posts with headers, footers and quoted replies removed
dataset = fetch_20newsgroups(remove=('headers', 'footers', 'quotes'))

X = dataset['data']    # the texts to preprocess
y = dataset['target']  # the class label of every text

#### Applying preprocessing

In [14]:
def text_preprocessing_pipeline(X):
    """
    Runs every text through the preprocessing steps defined in this notebook

    Steps, in order: contraction expansion, basic cleaning, tokenization,
    repeated-character removal, stopword removal, lemmatization and synonym
    expansion. The resulting tokens are joined back into a single
    space-separated string.

    :param X: an iterable of raw texts
    :return: a list of preprocessed texts, one per input text
    """
    # tqdm.auto uses the notebook widget when it is available and falls back
    # to a plain console bar otherwise; tqdm_notebook is deprecated
    from tqdm.auto import tqdm

    X_processed = []

    for x in tqdm(X):
        x = replace_by_regexps(x)
        x = clean_text(x)
        x = tokenize_text(x)
        x = remove_repeated_characters(x)
        x = remove_stopwords(x)
        # x = correct_spelling(x) # disable spelling correction because of slow work
        x = lemmatize(x)
        x = add_synonyms(x)
        x = ' '.join(x)
        X_processed.append(x)

    return X_processed

# NOTE: X is rebound to the processed texts, so re-running this cell alone would
# preprocess already-processed text; reload the dataset first.
X = text_preprocessing_pipeline(X)

Widget Javascript not detected.  It may not be installed properly. Did you enable the widgetsnbextension? If not, then run "jupyter nbextension enable --py --sys-prefix widgetsnbextension"





#### Saving preprocessed data

In [16]:
import pickle


# Persist the preprocessed texts and labels
with open('data.p', 'wb') as f:
    pickle.dump((X, y), f)
    
# Read them back from the file just written.
# pickle.load can execute arbitrary code: only load files you created yourself.
with open('data.p', 'rb') as f:
    X, y = pickle.load(f)

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

#### Train/test splitting

In [18]:
# Hold out 20% of the data, keeping class proportions equal in both parts.
# A fixed random_state makes the split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

#### Building pipeline

In [22]:
def to_dense(x):
    """
    Converts an object to its dense form
    :param x: an object exposing a ``todense()`` method
    :return: whatever ``x.todense()`` returns
    """
    dense = x.todense()
    return dense

# TF-IDF features limited to 1000 terms, fed into a random forest with default settings.
# NOTE(review): to_dense defined above is not part of this pipeline.
pipeline = make_pipeline(
    TfidfVectorizer(max_features=1000),
    RandomForestClassifier()
)

# Show the steps together with the names make_pipeline generated for them
pipeline.steps

[('tfidfvectorizer',
  TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
          dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
          lowercase=True, max_df=1.0, max_features=1000, min_df=1,
          ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
          stop_words=None, strip_accents=None, sublinear_tf=False,
          token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
          vocabulary=None)),
 ('randomforestclassifier',
  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
              max_depth=None, max_features='auto', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
              oob_score=False, random_state=None, verbose=0,
              warm_start=False))]

#### Encoding target

In [23]:
# Fit the encoder on the training labels only and reuse the same mapping for the test labels.
# NOTE(review): dataset['target'] presumably already holds integer class ids, in which
# case this step changes nothing - confirm.
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.transform(y_test)

#### Performing grid search cv

In [None]:
# Grid of 3 x 3 = 9 random-forest settings; the 'randomforestclassifier__' prefix is
# the step name generated by make_pipeline
param_space = {
    'randomforestclassifier__n_estimators': [10, 100, 1000],
    'randomforestclassifier__max_depth': [5, 10, 20]
}

# Stratified cross-validation, scored by weighted F1; n_jobs=-1 runs the fits in parallel
clf = GridSearchCV(pipeline, param_space, cv=StratifiedKFold(),
                   verbose=8, scoring='f1_weighted', n_jobs=-1)
clf.fit(X_train, y_train)

In [None]:
# Predict on the held-out test set with the fitted grid search
y_pred = clf.predict(X_test)

# Per-class precision, recall and F1 (labels are the encoded class ids)
print(classification_report(y_test, y_pred))