In [89]:
import hashlib # for grading

# Standard imports
import numpy as np
import pandas as pd
from collections import Counter, OrderedDict
import re
import string
import math
import warnings; warnings.simplefilter('ignore')

# NLTK imports
import nltk
nltk.download('stopwords')

from nltk.tokenize import WordPunctTokenizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

# SKLearn related imports
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin
from sklearn import preprocessing

from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/valentynakoshelnyk/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Q1. Country names

For the first question you will be making use of regex. In particular you have a list of countries and you'll have to answer some very specific questions about that list.

Start by loading the defining the path to this list


In [14]:
path = "data/countries.txt"

The first thing you will build is a wrapper that will apply a regex pattern into a given file, and return a list of results found matching that pattern. Implement it in the function below:

In [15]:
def find_all_in_file(pattern, path):
    """
    Function that returns all matches of a certain pattern in a certain text.
    
    Args:
    pattern - regex pattern
    path - path to the file
    """
    
    # YOUR CODE HERE
    


    lineList = [line.rstrip('\n') for line in open(path)]
    return [i for i in lineList if re.findall(pattern, i)]


   
        
        
    
   
                
        
       
        



Make sure this function is working with the following tests:

In [16]:
assert find_all_in_file(pattern="^P.+?$", path=path)[8] == "Portugal"
assert find_all_in_file(pattern="^.+?a$", path=path)[18] == "Croatia"
assert len(find_all_in_file(pattern="^.+?ca$", path=path)) == 4

#### Q1.a)

Now that you prepared your wrapper, let's move on to the actual expressions. The first thing we are looking for is for countries with loooong names. In particular we want you to find all the countries with more than 15 letters. Use the wrapper you defined above and assign its return to a variable `ret`.

In [17]:
# ret_long = find_all_in_file(...)

# YOUR CODE HERE
ret_long = find_all_in_file(pattern = '^.{15,}$', path = path)


In [18]:
print("Number of countries with more than 15 or more letters: ", len(ret_long))
assert len(ret_long) == 16

Number of countries with more than 15 or more letters:  16


#### Q1.b)

Now, find out how many countries:
* Start with a vowel
* Start with a consonant

In [19]:
# ret_vowel = find_all_in_file(...)

# YOUR CODE HERE
ret_vowel = find_all_in_file(pattern = "^[aieouAIEOU].*", path = path)


In [20]:
print("Number of countries that start with vowels: " , len(ret_vowel))
assert len(ret_vowel) == 36

Number of countries that start with vowels:  36


In [21]:
# ret_consonant = find_all_in_file(...)

# YOUR CODE HERE
ret_consonant = find_all_in_file(pattern ="^[^aeyiuoAIEOU].*", path = path)

In [22]:
print("Number of countries that start with consonants: " , len(ret_consonant))
assert len(ret_consonant) == 160

Number of countries that start with consonants:  160


#### Q1.c)

Next, find how many countries are composed by only one word and end in `ia`. You'll want to have a list with countries such as `Croatia`, `Serbia`, etc.

In [23]:
# ret_ia = find_all_in_file(...)

# YOUR CODE HERE


In [24]:
print("Number of variants of countries ending in \"ia\": " , len(ret_ia))
assert "Serbia" in ret_ia
assert "Croatia" in ret_ia
assert len(ret_ia) == 35

NameError: name 'ret_ia' is not defined

#### Q1.d)

Finally, find the countries which have at least four consecutive consonants, without taking into account the first letter (Hint: you can assume the first letter is capitalized). So, it should match things like `Abcdf`.

In [25]:
# ret_bcdf = find_all_in_file(...)

# YOUR CODE HERE
ret_bcdf = find_all_in_file(pattern = '(?:(?![aeiou])[a-z]){4,}', path = path)

In [26]:
print("Number of countries matched: " , len(ret_bcdf))
assert len(ret_bcdf) == 3
assert hashlib.sha256(' '.join(ret_bcdf).encode()).hexdigest() == '7da1a15074b9245ae3b88fb92fc5c484243003a084d03280bf11d9346d768869'

Number of countries matched:  3


## Q2. A Study in Scarlet

For this following questions we will be looking at Sir Arthur Conan Doyle's ["A Study in Scarlet"](https://en.wikipedia.org/wiki/A_Study_in_Scarlet) (which you might have seen adapted to tv in ["A Study in Pink"](https://en.wikipedia.org/wiki/A_Study_in_Pink)). We will be performing common preprocessing operations on this text, as it is a common task in Natural Language Processing. Start by downloading the data and loading it into a list of sentences:

In [27]:
path = "data/sherlock.txt"
data =  [line.strip('\n') for line in open(path, 'r', encoding='utf8') if len(line)>1]

#### Q2.a)

First tokenize the data. Implement the function to receive an NLTK-style tokenizer and return the token list for each sentence:

In [28]:
def apply_tokenizer(data, tokenizer):
    """
    Returns a list of lists, with the tokens of given text. I.e
    for an input ['Abc def', 'Ghi jkl mn'] it returns [['Abc', 'def'], ['Ghi', 'jkl', 'mn']]
    
    Args:
    data - list with the data
    tokenizer - nltk tokenizer
    """
    
    # YOUR CODE HERE
    tokenizer = WordPunctTokenizer()
    return [tokenizer.tokenize(i) for i in data]


    


In [29]:
tokenizer = WordPunctTokenizer()
data_tok = apply_tokenizer(data=data, tokenizer=tokenizer)

assert len(data_tok) == 3770
assert len([w for s in data_tok for w in s]) == 51648
assert data_tok[8] == ['I','could','join','it',',','the','second','Afghan','war','had','broken','out','.','On','landing','at']

#### Q2.b)

The second step you will implement is lowercasing the data.

In [30]:
def apply_lowercase(data):
    """
    Returns a list of lists, with all the tokens lowecased.
    
    Args:
    data - list with tokenized data
    """

    # YOUR CODE HERE
    return [[w.lower() for w in line] for line in data]


In [31]:
data_lc = apply_lowercase(data=apply_tokenizer(data=data, tokenizer=tokenizer))

assert len(data_lc) == 3770
assert len([w for s in data_lc for w in s]) == 51648
assert data_lc[8] == ['i','could','join','it',',','the','second','afghan','war','had','broken','out','.','on','landing','at']

#### Q2.c)

Now implement a function that filters the stopwords.

NOTE: Stopwords adapted from [here](https://gist.github.com/sebleier/554280). (Notice what we added some specific things, like ?" and ." to the stopwords. This was shown to be a limitation of the nltk tokenizer so it will be removed that way, instead of the more conventional way. This goes to show that there are more powerful tokenizers that you should use in the case you have to perform tokenization in the future.)

In [32]:
def apply_filter_stopwords(data, stopwords_fp):
    """
    Returns a list of lists, with no stopwords.
    
    Args:
    data - list with the tokenized data
    stopwords_fp - path to the stopwords file
    """
    #data_filt = [token for token in apply_lowercase(apply_tokenizer(data, tokenizer)) if not token in stopwords]
    # Create the list of stopwords from the file
    # stopwords = ...
    # YOUR CODE HERE
    # Filter the stopwords from the text
    # data_filt = ...
    # YOUR CODE HERE
    
    stopwords = [line.strip() for line in open(stopwords_fp)]



    data=apply_lowercase(data)
    data_filt= list(map(lambda x: [i for i in x if not i.lower() in stopwords], data))
  
  
    return data_filt
    
    


In [33]:
stopwords_fp = "data/english_stopwords.txt"
data_filt_sw = apply_filter_stopwords(data=apply_lowercase(apply_tokenizer(data, tokenizer)), 
                                      stopwords_fp=stopwords_fp)
assert len(data_filt_sw) == 3770
assert len([w for s in data_filt_sw for w in s]) == 27733
assert data_filt_sw[8] == ['could', 'join', ',', 'second', 'afghan', 'war', 'broken', '.', 'landing']

#### Q2.d)

After filtering stopwords, we want to remove punctuation from the text as well. Make use of `string.punctuation` to do so.

In [34]:
def apply_filter_punkt(data):
    """
    Returns a list of lists, with no punctuation.
    
    Args:
    data - list with the tokenized data
    """

    # YOUR CODE HERE
    import string 
    exclude = set(string.punctuation)
    data =  list(map(lambda x: [i for i in x if not i in exclude], data))
    return data

In [35]:
data_filt_punkt = apply_filter_punkt(data=apply_tokenizer(data, tokenizer))

assert len(data_filt_punkt) == 3770
assert len([w for s in data_filt_punkt for w in s]) == 46362
assert data_filt_punkt[8] == ['I','could','join','it','the','second','Afghan','war','had','broken','out','On','landing','at']

#### Q2.e)

The last preprocessing step you are going to implement is stemming.

In [55]:
def apply_stemmer(data, stemmer):
    """
    Returns a list of lists, with stemmed data.
    
    Args:
    data - list with the tokenized data
    stemmer - instance of stemmer to use
    """

    # YOUR CODE HERE
    stemmer = SnowballStemmer("english", ignore_stopwords=True)
    data =  [[stemmer.stem(word) for word in sentence] for sentence in data]
    
    return list(data)  


In [60]:
stemmer = SnowballStemmer("english")
data_stems = apply_stemmer(data=apply_lowercase(apply_tokenizer(data, tokenizer)),
                           stemmer=stemmer)

assert len(data_stems) == 3770
assert len([w for s in data_stems for w in s]) == 51648
assert data_stems[8][-2] == 'land'

#### Q2.f)

Finally, join everything in a function, that applies the steps in the following order, in :
* Tokenization
* Lowercasing
* Filtering stopwords
* Filtering punctuation
* Stemming

In [73]:
# Custom transformer to implement sentence cleaning
class TextCleanerTransformer(TransformerMixin):
    def __init__(self, tokenizer, stemmer, regex_list, lower=True, remove_punct=True, stopwords=[]):
        self.tokenizer = tokenizer
        self.stemmer = stemmer
        self.regex_list = regex_list
        self.lower = lower
        self.remove_punct = remove_punct
        self.stopwords = stopwords
    
    def clean_sentences(self, sentences):
                
        # Split sentence into list of words
        # sentences_tokens = ...
        # YOUR CODE HERE
        sentences_tokens = [tokenizer.tokenize(i) for i in sentences]
        
        # Lowercase
        if self.lower:
            # sentences_tokens = ...
            # YOUR CODE HERE
            sentences_tokens = [[w.lower() for w in line] for line in sentences]
            
        # Remove punctuation
        if self.remove_punct:
            # sentences_tokens = ...
            # YOUR CODE HERE
            exclude = set(string.punctuation)
            sentences_tokens = list(map(lambda x: [i for i in x if not i in exclude], sentences))

        if self.stopwords:
            # sentences_tokens = ...
            # YOUR CODE HERE
            stopwords = [line.strip() for line in open(stopwords_fp)]



            data=apply_lowercase(sentences)
            data_filt= list(map(lambda x: [i for i in x if not i.lower() in stopwords], sentences))
        # Stem words
        if self.stemmer:
            # sentences_tokens = ...
            # YOUR CODE HERE
            sentences_tokens = [[stemmer.stem(word) for word in sentence] for sentence in data]

        # Join list elements into string
        sentences_prep = [" ".join(tokens).strip() for tokens in sentences_tokens]
        return sentences_prep


In [78]:
text_cleaner = TextCleanerTransformer(
    regex_list=[],
    tokenizer=tokenizer, 
    stemmer=stemmer,
    lower=True, 
    remove_punct=True, 
    stopwords=stopwords_fp
)

data_preprocessed = text_cleaner.clean_sentences(data)
len([w for s in data_preprocessed for w in s.split()])

194523

In [75]:
text_cleaner = TextCleanerTransformer(
    regex_list=[],
    tokenizer=tokenizer, 
    stemmer=stemmer,
    lower=True, 
    remove_punct=True, 
    stopwords=stopwords_fp
)

data_preprocessed = text_cleaner.clean_sentences(data)
assert len(data_preprocessed) == 3770
assert len([w for s in data_preprocessed for w in s.split()]) == 22447
assert data_preprocessed[8] == 'could join second afghan war broken land'
assert data_preprocessed[15] == 'noth misfortun disast remov brigad'

AssertionError: 

## Q3. Movie reviews

We will now use what we've learned to explore movie reviews. We will start by analysing the dataset, then we will apply the preprocessing you implemented above, and finally we will see how it affects a classification task.

#### Q3.a)

To get some stats on the dataset, we will start by implementing your own function to get the list of n-grams from a list of tokens. Complete the function below:

In [128]:
def ngrams(data, n):
    """
    Returns list of tuples for all the n-grams
    
    Args:
    data - list of tokenized data (flattened)
    n - the n in n-grams
    """

    # YOUR CODE HERE
    from nltk import ngrams
    return list(ngrams(data, n))

In [129]:
ngrams("The actress won the oscar".split(), 2)

[('The', 'actress'), ('actress', 'won'), ('won', 'the'), ('the', 'oscar')]

In [130]:
assert ngrams("The actress won the oscar".split(), 2) == [('The', 'actress'), ('actress', 'won'), ('won', 'the'), ('the', 'oscar')]
assert ngrams("The actress won the oscar".split(), 3) == [('The', 'actress', 'won'), ('actress', 'won', 'the'), ('won', 'the', 'oscar')]
assert ngrams("The actress won the oscar".split(), 4) == [('The', 'actress', 'won', 'the'), ('actress', 'won', 'the', 'oscar')]

#### Q3.b)

We will now see in our dataset what are the most common n-grams. Load the data and find how many unique bi-grams, tri-grams and four-grams we have. Also, take advantage of `Counter` and `most_common()` to find the most common tri-gram. Merge together the words of the most common trigram to get one single string. (Hint: look at python's `join` function, exemplefied below when joining the full text)

In [131]:
# Load the dataset
df = pd.read_csv('data/imdb_sentiment.csv')

# Get the text and split into full list of words
docs = df['text']
full_text = ' '.join([d.strip() for d in docs])
words = full_text.split(' ')

Implement below the code to get the sets of unigrams, bigrams, trigrams and fourgrams, and to 

In [135]:
# unigrams = ...
# bigrams = ...
# trigrams = ...
# fourgrams = ...
# most_common_trigram = ...
#
# YOUR CODE HERE
unigrams = list(ngrams(words, 1))
bigrams = list(ngrams(words, 2))
trigrams = list(ngrams(words, 3))
fourgrams = list(ngrams(words, 4))
most_common_trigram = Counter(full_text.most_common())

AttributeError: 'str' object has no attribute 'most_common'

In [None]:
n_unigrams = str(len(unigrams))
n_bigrams = str(len(bigrams))
n_trigrams = str(len(trigrams))
n_fourgrams = str(len(fourgrams))

print('Found {} unigrams'.format(n_unigrams))
assert hashlib.sha256(n_unigrams.encode()).hexdigest() == '1ae2d8247d3ad491c79aed034828ba78b21e25438a6e9a61f252eb566e39e877'

print('Found {} bigrams'.format(n_bigrams))
assert hashlib.sha256(n_bigrams.encode()).hexdigest() == '7d2d487bcdf890f05578da49f574e3e8f22f7420f752071a24eb49759de5adf8'

print('Found {} trigrams'.format(n_trigrams))
assert hashlib.sha256(n_trigrams.encode()).hexdigest() == '8c54e3c7087ab053a77d56c60408fd47837081fdea817b7cc9e68f134cef969d'

print('Found {} fourgrams'.format(n_fourgrams))
assert hashlib.sha256(n_fourgrams.encode()).hexdigest() == '8df23d7f0d27298e7a7f77bdce4d15bb401098175c36514ba94b5350177b1593'

print('Most common trigram is "{}"'.format(most_common_trigram))
assert hashlib.sha256(most_common_trigram.encode()).hexdigest() == '28b6f04107ef3f1120975abf58ca8d08d20243beea929999b203f0add941fe16'


#### Q3.c)

Let's now process a sample of our dataset with the previous Q2 preprocessing, and get a Bag of Words representation. Start by using your text cleaner to get a preprocessed version of this dataset.

Note: if you didn't finish the text cleaner above, jump to the TF-IDF implementation directly, where you can load the BoW from a file.

In [None]:
text_cleaner = TextCleanerTransformer(
    regex_list=[],
    tokenizer=tokenizer, 
    stemmer=None,
    lower=True, 
    remove_punct=True, 
    stopwords=stopwords_fp
)

docs_preprocessed = text_cleaner.clean_sentences(docs[:200])

We can get a vocabulary, vectorize our dataset and convert it into a BoW

In [None]:
def build_vocabulary(docs):
    vocabulary = Counter()

    for doc in docs:
        words = doc.split()
        vocabulary.update(words)
    
    return OrderedDict(vocabulary.most_common())

def vectorize(docs):
    vocabulary = build_vocabulary(docs)
    vectors = []
    for doc in docs:
        words = doc.split()
        vector = np.array([doc.count(word) for word in vocabulary])
        vectors.append(vector)
    
    return (vocabulary, vectors)

def build_df(docs):
    vocab, vectors = vectorize(docs)
    return pd.DataFrame(vectors, columns=vocab)

BoW = build_df(docs_preprocessed)
BoW.head()

You will now implement one of TF-IDFs variation to compute from the bag of words the more relevant words. The formulation you should use is one you've learned before:

$$ tfidf _{t, d} =(tf_{t,d})*(log_2{(1 + \frac{N}{df_{t}})})  $$

Implement the TF-IDF below:

In [None]:
def tfidf(BoW_df):
    """
    Returns pandas dataframe of a tfidf representation from a BoW representation dataframe.

    Args:
    BoW_df - dataframe with document word counts (Bag of Words)
    """
    # tf = (...)
    
    # def _idf(column):
    #   return (...)
    
    # tf_idf = (...)
    
    # return tf_idf

    # YOUR CODE HERE
    raise NotImplementedError()

Let's now apply it to our previous BoW (note: load the BoW first if you could not use your text cleaner)

In [None]:
BoW = pd.read_csv('data/imdb_sentiment_bow_sample.csv')
BoW.head()

In [None]:
relevance = tfidf(BoW)

assert(math.isclose(relevance['movie'][0], 0.009717385023827248),
       math.isclose(relevance['film'][10], 0.019778475747522496),
       math.isclose(relevance['nice'][16], 0.010851136310680626),
       math.isclose(relevance['good'][128], 0.00989061193998239))

#### Q3.d)

Now, let's use scikit-learn to get to a similar matrix and relevance numbers. Load the full processed dataset:

In [None]:
# Load the dataset
df_preprocessed = pd.read_csv('data/imdb_sentiment_processed.csv')

# Get the processed text 
docs = df_preprocessed['text']

Start by transforming your documents into a matrix of tf-idf scores using sklearn. Make use of the `CountVectorizer` and the `TfidfTransformer` provided by scikitlearn. Implement a function that provided with a list of documents returns the word term frequency matrix and the corresponding vocabulary:

In [None]:
def build_word_term_frequency_matrix(docs):
    """
    Returns the matrix of word and tf-idf scores 
    
    Args:
    docs - list of documents in dataset
    """
    # vectorizer = ...
    # word_count_matrix = ...
    # vocabulary = ...
    # YOUR CODE HERE
    raise NotImplementedError()

    # tfidf = ...
    # word_term_frequency_matrix = ...
    # YOUR CODE HERE
    raise NotImplementedError()

    return (word_term_frequency_matrix, vocabulary)



Now get the corresponding string of the most important word of this document (with index `321`) according to TF-IDF.

In [None]:
index = 321

word_term_frequency_matrix, vocabulary = build_word_term_frequency_matrix(docs)

max_word_idx = word_term_frequency_matrix[index].argmax()
inv_vocab = {v: k for k, v in vocabulary.items()}
most_relevant_word = inv_vocab[max_word_idx]

assert(most_relevant_word == 'dull')

#### Q3.e)

Finally, let's try to classify the sentiment of these movie reviews. 

Build a Pipeline to classify a review as positive or negative. Use `MultinomialNB` as your final classifier, train it and get an accuracy score above 86% on the imdb validation dataset, by choosing the best set of parameters of `CountVectorizer()` and `TfidfTransformer()`, according to what we learned in Part III.

Hint: Try to use more than unigrams! Also, remember what we said about stopwords and feature space size in Part III of the Learning Notebooks?

In [None]:
# Split in train and validation
train_df, validation_df = train_test_split(df_preprocessed, test_size=0.3, random_state=42)

# Encode the labels
le = preprocessing.LabelEncoder()
le.fit(train_df['sentiment'].values)

train_df['sentiment'] = le.transform(train_df['sentiment'].values)
validation_df['sentiment'] = le.transform(validation_df['sentiment'].values)

In [None]:
def train_and_validate(train_df, validation_df):
    """
    Train a model using sklearn's Pipeline and return it along with its 
    current accuracy in the validation set. Assume the documents are already 
    preprocessed
    
    Args:
    train_df - dataframe with training docs
    validation_df - dataframe with validation docs
    """
    
    # Build the pipeline
    # text_clf = Pipeline(...)
    
    # Train the classifier
    # (...)

    # predicted = (...)
    # acc = (...)
    
    # return text_clf, acc
    
    # YOUR CODE HERE
    raise NotImplementedError()

In [None]:
_, acc = train_and_validate(train_df, validation_df)
print("Accuracy: {}".format(acc))
assert(acc >= 0.86)