## Importing Libraries
You can import all the libraries up front in a single cell, or import each one on the go where it is first needed.

In [1]:
import numpy as np
import pandas as pd
import re
import nltk
import spacy

## Load the dataset

In [2]:
# Read the train and test CSV files from the relative `../input/commonlitreadabilityprize/` directory.
train_data = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
test_data = pd.read_csv('../input/commonlitreadabilityprize/test.csv')

In [3]:
train_data.head()

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845


In [4]:
test_data.head()

Unnamed: 0,id,url_legal,license,excerpt
0,c0f722661,,,My hope lay in Jack's promise that he would ke...
1,f0953f0a5,,,Dotty continued to go to Mrs. Gray's every nig...
2,0df072751,,,It was a bright and cheerful scene that greete...
3,04caf4e0c,https://en.wikipedia.org/wiki/Cell_division,CC BY-SA 3.0,Cell division is the process by which a parent...
4,0e63f8bea,https://en.wikipedia.org/wiki/Debugging,CC BY-SA 3.0,Debugging is the process of finding and resolv...


In [5]:
# Keep only the text column for the preprocessing walkthrough. `.copy()` makes
# each frame an independent object, so the columns assigned in later cells are
# written to these frames directly rather than to a slice of the loaded data
# (assigning to such a slice makes pandas emit SettingWithCopyWarning).
train_data = train_data[['excerpt']].copy()
test_data = test_data[['excerpt']].copy()

### Lower Casing
This is one of the most basic pre-processing steps. It is important because it converts all strings to the same case, so that words like Lower, lower, and LOWER are treated as the same word. This avoids counting differently cased copies of one word as separate unique words.

In [6]:
# The `.str` accessor applies `lower()` to every excerpt element-wise; the
# result is stored in a new `excerpt_lower` column on both frames.
for frame in (train_data, test_data):
    frame['excerpt_lower'] = frame['excerpt'].str.lower()

In [7]:
train_data.head()

Unnamed: 0,excerpt,excerpt_lower
0,When the young people returned to the ballroom...,when the young people returned to the ballroom...
1,"All through dinner time, Mrs. Fayre was somewh...","all through dinner time, mrs. fayre was somewh..."
2,"As Roger had predicted, the snow departed as q...","as roger had predicted, the snow departed as q..."
3,And outside before the palace a great garden w...,and outside before the palace a great garden w...
4,Once upon a time there were Three Bears who li...,once upon a time there were three bears who li...


In [8]:
test_data.head()

Unnamed: 0,excerpt,excerpt_lower
0,My hope lay in Jack's promise that he would ke...,my hope lay in jack's promise that he would ke...
1,Dotty continued to go to Mrs. Gray's every nig...,dotty continued to go to mrs. gray's every nig...
2,It was a bright and cheerful scene that greete...,it was a bright and cheerful scene that greete...
3,Cell division is the process by which a parent...,cell division is the process by which a parent...
4,Debugging is the process of finding and resolv...,debugging is the process of finding and resolv...


### Removal of HTML tags & noise
This processing step comes in handy when you are dealing with data scraped from different websites.

In [9]:
import requests

# Download a sample HTML page from gutenberg.org to demonstrate tag stripping.
# The timeout keeps the cell from hanging indefinitely if the server does not
# respond, and raise_for_status() surfaces HTTP errors here instead of letting
# an error page flow into the rest of the demo.
data = requests.get('http://www.gutenberg.org/cache/epub/8001/pg8001.html', timeout=30)
data.raise_for_status()
content = data.text
# Print a fixed character window of the raw page to show the markup noise.
print(content[2745:3948])

 * Default rule centered and clear of floats; sized for thought-breaks
 * ********************************************************************** */
	hr {
		width:45%;			/* adjust to ape original work */
		margin-top: 1em;	/* space above &amp;amp; below */
		margin-bottom: 1em;
		margin-left: auto;  /* these two ensure a.. */
		margin-right: auto; /* ..centered rule */
		clear: both;		/* don't let sidebars &amp;amp; floats overlap rule */
	}
/* ************************************************************************
 * Images and captions
 * ********************************************************************** */
	img { /* the default inline image has */
		border: 1px solid black; /* a thin black line border.. */
		padding: 6px; /* ..spaced a bit out from the graphic */
		} </style><link rel="schema.DCTERMS" href="http://purl.org/dc/terms/"/>
<link rel="schema.MARCREL" href="http://id.loc.gov/vocabulary/relators/"/>
<meta name="DCTERMS.title" content="The Bible, King 

In [10]:
!pip install bs4

Collecting bs4
  Downloading bs4-0.0.1.tar.gz (1.1 kB)
Collecting beautifulsoup4
  Downloading beautifulsoup4-4.9.3-py3-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 1.2 MB/s 
[?25hCollecting soupsieve>1.2
  Downloading soupsieve-2.2.1-py3-none-any.whl (33 kB)
Building wheels for collected packages: bs4
  Building wheel for bs4 (setup.py) ... [?25l- \ done
[?25h  Created wheel for bs4: filename=bs4-0.0.1-py3-none-any.whl size=1273 sha256=8763161601e745dbec1ac6045aa2b15006c56bb9137fa9a1536538c84d1a2ca6
  Stored in directory: /root/.cache/pip/wheels/0a/9e/ba/20e5bbc1afef3a491f0b3bb74d508f99403aabe76eda2167ca
Successfully built bs4
Installing collected packages: soupsieve, beautifulsoup4, bs4
Successfully installed beautifulsoup4-4.9.3 bs4-0.0.1 soupsieve-2.2.1


In [11]:
import re
from bs4 import BeautifulSoup

def strip_html_tags(text):
    """
    Remove HTML markup from scraped text and collapse its line breaks.

    Parameters
    ----------
    text : str
        Raw HTML (or plain text) to clean.

    Returns
    -------
    str
        The text content of the document, with <iframe> and <script> elements
        dropped and every run of carriage returns / line feeds replaced by a
        single newline.
    """
    soup = BeautifulSoup(text, "html.parser")
    # Drop embedded frames and scripts so their contents do not end up in the text.
    for element in soup(['iframe', 'script']):
        element.extract()
    stripped_text = soup.get_text()
    # Only line-break characters belong in this character class: inside `[...]`
    # a `|` is a literal pipe (not alternation), so including it would turn
    # pipe characters in the text into newlines as well.
    stripped_text = re.sub(r'[\r\n]+', '\n', stripped_text)
    return stripped_text

clean_content = strip_html_tags(content)
print(clean_content[1163:1957])

*** START OF THE PROJECT GUTENBERG EBOOK, THE BIBLE, KING JAMES, BOOK 1***
This eBook was produced by David Widger
with the help of Derek Andrew's text from January 1992
and the work of Bryan Taylor in November 2002.
Book 01        Genesis
01:001:001 In the beginning God created the heaven and the earth.
01:001:002 And the earth was without form, and void; and darkness was
           upon the face of the deep. And the Spirit of God moved upon
           the face of the waters.
01:001:003 And God said, Let there be light: and there was light.
01:001:004 And God saw the light, that it was good: and God divided the
           light from the darkness.
01:001:005 And God called the light Day, and the darkness he called
           Night. And the evening and the morning were the first day.



You can observe in the text above that all the tags, such as **br** or **img**, have been removed.

In [12]:
# Run the HTML cleaner over every lower-cased excerpt in both frames.
for frame in (train_data, test_data):
    frame['excerpt_lower'] = frame['excerpt_lower'].apply(strip_html_tags)

### Removing Accented Characters
Sometimes we may find some accented characters present in the texts we are dealing with. 

Accented characters look like: 'Sómě Áccěntěd těxt'

In [13]:
import unicodedata

def remove_accented_chars(text):
    """Return `text` with accented characters reduced to plain ASCII.

    NFKD normalization decomposes each accented character into a base
    character plus combining marks; encoding to ASCII with errors ignored then
    drops the marks, along with any other character that has no ASCII form.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

In [14]:
s = 'Sómě Áccěntěd těxt'
s

'Sómě Áccěntěd těxt'

In [15]:
remove_accented_chars(s)

'Some Accented text'

In [16]:
# Strip accents from every excerpt in both frames.
for frame in (train_data, test_data):
    frame['excerpt_lower'] = frame['excerpt_lower'].apply(remove_accented_chars)

### Removing Special Characters, Numbers & Symbols


In [17]:
import re

def remove_special_characters(text, remove_digits=True):
    """Delete every character that is not an ASCII letter or whitespace.

    Parameters
    ----------
    text : str
        Input text.
    remove_digits : bool, default True
        When true, digits 0-9 are deleted as well; when false, they are kept.

    Returns
    -------
    str
        `text` with the unwanted characters removed (nothing is inserted in
        their place, so surrounding whitespace is left untouched).
    """
    if remove_digits:
        unwanted = r'[^a-zA-Z\s]'
    else:
        unwanted = r'[^a-zA-Z0-9\s]'
    return re.sub(unwanted, '', text)

In [18]:
s = "Well this was fun! See you at 7:30, What do you think!!? #$@@9318@ 🙂🙂🙂"
s

'Well this was fun! See you at 7:30, What do you think!!? #$@@9318@ 🙂🙂🙂'

In [19]:
remove_special_characters(s, remove_digits=False)

'Well this was fun See you at 730 What do you think 9318 '

In [20]:
remove_special_characters(s)

'Well this was fun See you at  What do you think  '

In [21]:
# Called with the default remove_digits=True, so punctuation, symbols and
# digits are all removed from every excerpt.
for frame in (train_data, test_data):
    frame['excerpt_lower'] = frame['excerpt_lower'].apply(remove_special_characters)

### Expanding Contractions

In [22]:
!pip install contractions
!pip install textsearch

Collecting contractions
  Downloading contractions-0.0.48-py2.py3-none-any.whl (6.4 kB)
Collecting textsearch>=0.0.21
  Downloading textsearch-0.0.21-py2.py3-none-any.whl (7.5 kB)
Collecting anyascii
  Downloading anyascii-0.2.0-py3-none-any.whl (283 kB)
[K     |████████████████████████████████| 283 kB 1.2 MB/s 
[?25hCollecting pyahocorasick
  Downloading pyahocorasick-1.4.2.tar.gz (321 kB)
[K     |████████████████████████████████| 321 kB 5.4 MB/s 
[?25hBuilding wheels for collected packages: pyahocorasick
  Building wheel for pyahocorasick (setup.py) ... [?25l- \ | / done
[?25h  Created wheel for pyahocorasick: filename=pyahocorasick-1.4.2-cp37-cp37m-linux_x86_64.whl size=102914 sha256=b275f0b01563368d1170323eeca69d4f9ecbff7cc38bb15bcdd914f649c77464
  Stored in directory: /root/.cache/pip/wheels/25/19/a6/8f363d9939162782bb8439d886469756271abc01f76fbd790f
Successfully built pyahocorasick
Installing collected packages: pyahocorasick, anyascii, textsearch, co

In [23]:
s = "Y'all can't expand contractions I'd think! You wouldn't be able to. How'd you do it?"
s

"Y'all can't expand contractions I'd think! You wouldn't be able to. How'd you do it?"

In [24]:
import contractions

list(contractions.contractions_dict.items())[:10]

[("I'm", 'I am'),
 ("I'm'a", 'I am about to'),
 ("I'm'o", 'I am going to'),
 ("I've", 'I have'),
 ("I'll", 'I will'),
 ("I'll've", 'I will have'),
 ("I'd", 'I would'),
 ("I'd've", 'I would have'),
 ('Whatcha', 'What are you'),
 ("amn't", 'am not')]

In [25]:
contractions.fix(s)

'you all can not expand contractions I would think! You would not be able to. how did you do it?'

In [26]:
# Expand contractions in every excerpt of both frames.
# NOTE(review): apostrophes were already removed by the special-character step
# (In [21]), so contractions such as "can't" reach this cell as "cant". Whether
# those are still expanded depends on how the contractions library treats
# apostrophe-less forms -- consider running this step before punctuation removal.
for frame in (train_data, test_data):
    frame['excerpt_lower'] = frame['excerpt_lower'].apply(contractions.fix)

### Stemming
Stemming is the process of reducing inflected (or sometimes derived) words to their word stem, base or root form.

For example, if the corpus contains the two words walks and walking, stemming will strip the suffixes and reduce both to walk. But in another example, with the two words console and consoling, the stemmer will strip the suffixes and reduce both to consol, which is not a proper English word.

There are several types of stemming algorithms available; one of the best known and most widely used is the Porter stemmer. We can use the nltk package for it.

In [27]:
# Porter Stemmer
from nltk.stem import PorterStemmer
ps = PorterStemmer()

ps.stem('jumping'), ps.stem('jumps'), ps.stem('jumped')

('jump', 'jump', 'jump')

In [28]:
ps.stem('lying')

'lie'

In [29]:

ps.stem('strange')

'strang'

In [30]:
def stem_text(text):
    """Stem every whitespace-separated word of `text` with the Porter stemmer.

    `ps.stem` operates on a single word, so handing it a whole excerpt would
    treat the entire string as one token. The text is therefore split on
    whitespace, each word is stemmed individually, and the stems are joined
    back together with single spaces.
    """
    return " ".join(ps.stem(word) for word in text.split())


train_data['excerpt_lower'] = train_data['excerpt_lower'].apply(stem_text)
test_data['excerpt_lower'] = test_data['excerpt_lower'].apply(stem_text)

### Lemmatization
Lemmatization is similar to stemming in that it reduces inflected words to their root form, but it differs in that it makes sure the root word (also called the lemma) belongs to the language.

As a result, lemmatization is generally slower than stemming. So, depending on the speed requirement, we can choose either stemming or lemmatization.

Let us use the WordNetLemmatizer in nltk to lemmatize our sentences.

In [31]:
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()

In [32]:
help(wnl.lemmatize)

Help on method lemmatize in module nltk.stem.wordnet:

lemmatize(word, pos='n') method of nltk.stem.wordnet.WordNetLemmatizer instance



In [33]:
# lemmatize nouns
print(wnl.lemmatize('cars', 'n'))
print(wnl.lemmatize('boxes', 'n'))

car
box


In [34]:
def lemmatize_text(text):
    """Lemmatize every whitespace-separated word of `text`.

    `wnl.lemmatize` takes a single word (with `pos='n'` by default), so
    passing a whole excerpt would look up the entire string as one word. The
    text is therefore split on whitespace, each word is lemmatized
    individually, and the lemmas are joined back together with single spaces.
    """
    return " ".join(wnl.lemmatize(word) for word in text.split())


train_data['excerpt_lower'] = train_data['excerpt_lower'].apply(lemmatize_text)
test_data['excerpt_lower'] = test_data['excerpt_lower'].apply(lemmatize_text)

### Tokenization
Tokenization splits an entire paragraph or sentence into individual tokens (words and punctuation marks).

In [35]:
s = 'The brown foxes are quick and they are jumping over the sleeping lazy dogs!'
tokens = nltk.word_tokenize(s)
print(tokens)

['The', 'brown', 'foxes', 'are', 'quick', 'and', 'they', 'are', 'jumping', 'over', 'the', 'sleeping', 'lazy', 'dogs', '!']


In [36]:
# Tokenize every excerpt; after this cell `excerpt_lower` holds the output of
# nltk.word_tokenize for each row instead of a plain string.
for frame in (train_data, test_data):
    frame['excerpt_lower'] = frame['excerpt_lower'].apply(nltk.word_tokenize)

### Stopword Removal


In [37]:
from nltk.corpus import stopwords
", ".join(stopwords.words('english'))

"i, me, my, myself, we, our, ours, ourselves, you, you're, you've, you'll, you'd, your, yours, yourself, yourselves, he, him, his, himself, she, she's, her, hers, herself, it, it's, its, itself, they, them, their, theirs, themselves, what, which, who, whom, this, that, that'll, these, those, am, is, are, was, were, be, been, being, have, has, had, having, do, does, did, doing, a, an, the, and, but, if, or, because, as, until, while, of, at, by, for, with, about, against, between, into, through, during, before, after, above, below, to, from, up, down, in, out, on, off, over, under, again, further, then, once, here, there, when, where, why, how, all, any, both, each, few, more, most, other, some, such, no, nor, not, only, own, same, so, than, too, very, s, t, can, will, just, don, don't, should, should've, now, d, ll, m, o, re, ve, y, ain, aren, aren't, couldn, couldn't, didn, didn't, doesn, doesn't, hadn, hadn't, hasn, hasn't, haven, haven't, isn, isn't, ma, mightn, mightn't, mustn, mus

In [38]:
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text, stop_words=None):
    """Remove stopwords and return the remaining words joined by single spaces.

    Parameters
    ----------
    text : str or list/tuple of str
        Either raw text (split on whitespace) or an already tokenized
        sequence of words, such as the output of `nltk.word_tokenize`.
    stop_words : collection of str, optional
        Words to drop. Defaults to the module-level `STOPWORDS` set.

    Returns
    -------
    str
        The kept words joined with a single space.
    """
    if stop_words is None:
        stop_words = STOPWORDS
    # Token sequences must be iterated directly: str() of a list is its repr
    # ("['when', 'the', ...]"), and splitting that yields pieces that carry
    # brackets, quotes and commas and therefore never match a stopword.
    if isinstance(text, (list, tuple)):
        words = [str(word) for word in text]
    else:
        words = str(text).split()
    return " ".join(word for word in words if word not in stop_words)

# Drop stopwords from every row of both frames; the function is passed
# directly since it already takes the cell value as its only required argument.
for frame in (train_data, test_data):
    frame["excerpt_lower"] = frame["excerpt_lower"].apply(remove_stopwords)