### Pre-processing the text data

- Removing weird spaces
- Tokenization
- Spelling correction
- Contraction mapping
- Stemming
- Emoji handling
- Stopwords handling
- Cleaning HTML

#### Removing weired spaces

In [37]:
data = "  hello This    is Sumathi. Isn't it beautiful?\t  "

In [38]:
def remove_space(text):
    text = text.strip()
    print(text)
    text = text.split()
    print(text)
    return " ".join(text)

In [39]:
data = remove_space(data)
data

hello This    is Sumathi. Isn't it beautiful?
['hello', 'This', 'is', 'Sumathi.', "Isn't", 'it', 'beautiful?']


"hello This is Sumathi. Isn't it beautiful?"

#### Tokenization

In [40]:
from nltk.tokenize import word_tokenize
data = word_tokenize(data)
data = [x.lower() for x in data]
data

['hello', 'this', 'is', 'sumathi', '.', 'is', "n't", 'it', 'beautiful', '?']

In [41]:
#### Spelling correction

In [42]:
from spellchecker import SpellChecker

spell = SpellChecker()
def correct_spellings(text):
    corrected_text = []
    misspelled_words = spell.unknown(text.split())
    for word in text.split():
        if word in misspelled_words:
            corrected_text.append(spell.correction(word))
        else:
            corrected_text.append(word)
    return " ".join(corrected_text)
        
text = "everone"
correct_spellings(text)

'everyone'

In [43]:
correct_spellings("hallo how are you?")

'hallo how are you'

#### Contraction

In [44]:
contraction = {
 "'cause": 'because',
 ',cause': 'because',
 ';cause': 'because',
 "ain't": 'am not',
 'ain,t': 'am not',
 'ain;t': 'am not',
 'ain´t': 'am not',
 'ain’t': 'am not',
 "aren't": 'are not',
 'aren,t': 'are not',
 'aren;t': 'are not',
 'aren´t': 'are not',
 'aren’t': 'are not'
}

In [45]:
def mapping_replacer(x, dic):
    for word in dic.keys():
        if " " + word + " " in x:
            x = x.replace(" " + word + " ", " " + dic[word] + " ")
    return x

In [46]:
mapping_replacer('hello ;cause ',contraction)

'hello because '

#### Stemming

In [49]:
from nltk.stem import SnowballStemmer
s = SnowballStemmer("english")
s.stem("fishing")

'fish'

In [50]:
print(s.stem("fishes"))

fish


In [51]:
import nltk.stem
dir(nltk.stem)

['Cistem',
 'ISRIStemmer',
 'LancasterStemmer',
 'PorterStemmer',
 'RSLPStemmer',
 'RegexpStemmer',
 'SnowballStemmer',
 'StemmerI',
 'WordNetLemmatizer',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 'api',
 'cistem',
 'isri',
 'lancaster',
 'porter',
 'regexp',
 'rslp',
 'snowball',
 'util',
 'wordnet']

#### Emoji Handling

In [52]:
# Reference : https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
import re
def remove_emoji(string):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

remove_emoji("game is on 🔥🔥")

'game is on '

In [62]:
import emoji
emojis = emoji.UNICODE_EMOJI

In [63]:
emojis

{'🥇': ':1st_place_medal:',
 '🥈': ':2nd_place_medal:',
 '🥉': ':3rd_place_medal:',
 '🆎': ':AB_button_(blood_type):',
 '🏧': ':ATM_sign:',
 '🅰': ':A_button_(blood_type):',
 '🇦🇫': ':Afghanistan:',
 '🇦🇱': ':Albania:',
 '🇩🇿': ':Algeria:',
 '🇦🇸': ':American_Samoa:',
 '🇦🇩': ':Andorra:',
 '🇦🇴': ':Angola:',
 '🇦🇮': ':Anguilla:',
 '🇦🇶': ':Antarctica:',
 '🇦🇬': ':Antigua_&_Barbuda:',
 '♒': ':Aquarius:',
 '🇦🇷': ':Argentina:',
 '♈': ':Aries:',
 '🇦🇲': ':Armenia:',
 '🇦🇼': ':Aruba:',
 '🇦🇨': ':Ascension_Island:',
 '🇦🇺': ':Australia:',
 '🇦🇹': ':Austria:',
 '🇦🇿': ':Azerbaijan:',
 '🔙': ':BACK_arrow:',
 '🅱': ':B_button_(blood_type):',
 '🇧🇸': ':Bahamas:',
 '🇧🇭': ':Bahrain:',
 '🇧🇩': ':Bangladesh:',
 '🇧🇧': ':Barbados:',
 '🇧🇾': ':Belarus:',
 '🇧🇪': ':Belgium:',
 '🇧🇿': ':Belize:',
 '🇧🇯': ':Benin:',
 '🇧🇲': ':Bermuda:',
 '🇧🇹': ':Bhutan:',
 '🇧🇴': ':Bolivia:',
 '🇧🇦': ':Bosnia_&_Herzegovina:',
 '🇧🇼': ':Botswana:',
 '🇧🇻': ':Bouvet_Island:',
 '🇧🇷': ':Brazil:',
 '🇮🇴': ':British_Indian_Ocean_Territory:',
 '🇻🇬': ':British_Vir

#### Stopwords Handling

In [53]:
from nltk.corpus import stopwords
stopwords.words("english")

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [54]:
data

['hello', 'this', 'is', 'sumathi', '.', 'is', "n't", 'it', 'beautiful', '?']

In [55]:
data = [x for x in data if x not in stopwords.words("english")]
data

['hello', 'sumathi', '.', "n't", 'beautiful', '?']

In [None]:
#### 

### Traditional handling of text data
- Hashing of words
- Count vectorization
- TF-IDF
- SVD

- TF(t) = Number of times a term t appears in a document / Total number of terms in the document
- IDF(t) = LOG( Total number of documents/Number of documents with term t in it )
- TF-IDF(t) = TF(t) * IDF(t)

In [56]:
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names())

print(X.shape)
print(X.toarray())

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
(4, 9)
[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]
 [0.         0.6876236  0.         0.28108867 0.         0.53864762
  0.28108867 0.         0.28108867]
 [0.51184851 0.         0.         0.26710379 0.51184851 0.
  0.26710379 0.51184851 0.26710379]
 [0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


In [57]:
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
vectorizer = TfidfVectorizer(stop_words="english")
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names())

print(X.shape)
print(X.toarray())

['document', 'second']
(4, 2)
[[1.         0.        ]
 [0.78722298 0.61666846]
 [0.         0.        ]
 [1.         0.        ]]


In [58]:
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
vectorizer = TfidfVectorizer(stop_words="english",ngram_range=(1,3))
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names())

print(X.shape)
print(X.toarray())

['document', 'document second', 'document second document', 'second', 'second document']
(4, 5)
[[1.         0.         0.         0.         0.        ]
 [0.53802897 0.42146317 0.42146317 0.42146317 0.42146317]
 [0.         0.         0.         0.         0.        ]
 [1.         0.         0.         0.         0.        ]]


In [59]:
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
vectorizer = TfidfVectorizer(ngram_range=(1,3))
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names())

print(X.shape)
print(X.toarray())

['and', 'and this', 'and this is', 'document', 'document is', 'document is the', 'first', 'first document', 'is', 'is the', 'is the first', 'is the second', 'is the third', 'is this', 'is this the', 'one', 'second', 'second document', 'the', 'the first', 'the first document', 'the second', 'the second document', 'the third', 'the third one', 'third', 'third one', 'this', 'this document', 'this document is', 'this is', 'this is the', 'this the', 'this the first']
(4, 34)
[[0.         0.         0.         0.25307077 0.         0.
  0.31259275 0.31259275 0.20690194 0.25307077 0.39648427 0.
  0.         0.         0.         0.         0.         0.
  0.20690194 0.31259275 0.31259275 0.         0.         0.
  0.         0.         0.         0.20690194 0.         0.
  0.31259275 0.31259275 0.         0.        ]
 [0.         0.         0.         0.37077755 0.29044734 0.29044734
  0.         0.         0.15156747 0.18538877 0.         0.29044734
  0.         0.         0.         0.     

b'<!doctype HTML public "-//W3C//DTD HTML 4.0 Frameset//EN">\n\n<!-- saved from url=(0014)about:internet -->\n<html>\n\n<head>\n<meta http-equiv="content-type" content="text/html;charset=windows-1252">\n<title>Example of a simple HTML page</title>\n<meta name="generator" content="Adobe RoboHelp - www.adobe.com">\n<link rel="stylesheet" href="default_ns.css"><script type="text/javascript" language="JavaScript" title="WebHelpSplitCss">\n<!--\nif (navigator.appName !="Netscape")\n{   document.write("<link rel=\'stylesheet\' href=\'default.css\'>");}\n//-->\n</script>\n<style type="text/css">\n<!--\nimg_whs1 { border:none; width:301px; height:295px; float:none; }\np.whs2 { margin-bottom:5pt; }\np.whs3 { margin-bottom:9.5pt; }\n-->\n</style><script type="text/javascript" language="JavaScript" title="WebHelpInlineScript">\n<!--\nfunction reDo() {\n  if (innerWidth != origWidth || innerHeight != origHeight)\n     location.reload();\n}\nif ((parseInt(navigator.appVersion) == 4) && (navigator.a

In [33]:
x.content

b'<!doctype HTML public "-//W3C//DTD HTML 4.0 Frameset//EN">\n\n<!-- saved from url=(0014)about:internet -->\n<html>\n\n<head>\n<meta http-equiv="content-type" content="text/html;charset=windows-1252">\n<title>Example of a simple HTML page</title>\n<meta name="generator" content="Adobe RoboHelp - www.adobe.com">\n<link rel="stylesheet" href="default_ns.css"><script type="text/javascript" language="JavaScript" title="WebHelpSplitCss">\n<!--\nif (navigator.appName !="Netscape")\n{   document.write("<link rel=\'stylesheet\' href=\'default.css\'>");}\n//-->\n</script>\n<style type="text/css">\n<!--\nimg_whs1 { border:none; width:301px; height:295px; float:none; }\np.whs2 { margin-bottom:5pt; }\np.whs3 { margin-bottom:9.5pt; }\n-->\n</style><script type="text/javascript" language="JavaScript" title="WebHelpInlineScript">\n<!--\nfunction reDo() {\n  if (innerWidth != origWidth || innerHeight != origHeight)\n     location.reload();\n}\nif ((parseInt(navigator.appVersion) == 4) && (navigator.a

In [35]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(x.content,"html.parser")

In [36]:
print(soup.get_text())






Example of a simple HTML page


<!--
if (navigator.appName !="Netscape")
{   document.write("<link rel='stylesheet' href='default.css'>");}
//-->


<!--
img_whs1 { border:none; width:301px; height:295px; float:none; }
p.whs2 { margin-bottom:5pt; }
p.whs3 { margin-bottom:9.5pt; }
-->

<!--
function reDo() {
  if (innerWidth != origWidth || innerHeight != origHeight)
     location.reload();
}
if ((parseInt(navigator.appVersion) == 4) && (navigator.appName == "Netscape")) {
	origWidth = innerWidth;
	origHeight = innerHeight;
	onresize = reDo;
}
onerror = null; 
//-->


<!--
div.WebHelpPopupMenu { position:absolute; left:0px; top:0px; z-index:4; visibility:hidden; }
p.WebHelpNavBar { text-align:left; }
-->






<!--
if (window.gbWhTopic)
{
	if (window.setRelStartPage)
	{
	addTocInfo("Building your website\nCreating an EasySiteWizard 6 website\nExample of a simple HTML page");
addButton("show",BTN_TEXT,"Show","","","","",0,0,"whd_show0.gif","whd_show2.gif","whd_show1.gif");
addButton(

In [None]:
from  sklearn.decomposition import TruncatedSVD
sv = TruncatedSVD(10)