## Word To Vector

In [2]:
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords


In [3]:
# Sample text that every later cell tokenizes, cleans and vectorizes.
# The bracketed [3] and [4] are citation markers that are part of the string itself.
paragraph = """
Millennia ago, the island used to be an extension of the Teknaf peninsula, but at a later time some portion of this peninsula got submerged and thus the southernmost part of the aforementioned peninsula became an island, and was disconnected from the Bangladesh mainland. The island was first settled in the 18th century by Arabian merchants, who named it "Jazira". During British occupation the island was named after the then Deputy Commissioner of Chittagong Mr. Martin as St. Martin Island.[3] Local names of the island are "Narikel Jinjira"[4] which means "Coconut Island", and "Daruchini Dwip" which means "Cinnamon Island". It is the only coral island in Bangladesh.
"""

In [4]:
paragraph

'\nMillennia ago, the island used to be an extension of the Teknaf peninsula, but at a later time some portion of this peninsula got submerged and thus the southernmost part of the aforementioned peninsula became an island, and was disconnected from the Bangladesh mainland. The island was first settled in the 18th century by Arabian merchants, who named it "Jazira". During British occupation the island was named after the then Deputy Commissioner of Chittagong Mr. Martin as St. Martin Island.[3] Local names of the island are "Narikel Jinjira"[4] which means "Coconut Island", and "Daruchini Dwip" which means "Cinnamon Island". It is the only coral island in Bangladesh.\n'

In [4]:
import spacy

In [5]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [6]:
# Load the spaCy pipeline installed by the download cell.
# NOTE(review): `nlp` is not referenced by any other cell visible here.
nlp = spacy.load('en_core_web_sm')

### Paragraph to Word
- Paragraph to sentences
- Sentence to word
- We have to use tokenization

In [5]:
# Download the 'punkt' package before sentence-tokenizing the paragraph.
nltk.download('punkt')
# Split the raw paragraph into a list of sentence strings.
sentences = nltk.sent_tokenize(paragraph)
# Last expression of the cell: display the resulting list.
sentences

[nltk_data] Downloading package punkt to /home/utpal108/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['\nMillennia ago, the island used to be an extension of the Teknaf peninsula, but at a later time some portion of this peninsula got submerged and thus the southernmost part of the aforementioned peninsula became an island, and was disconnected from the Bangladesh mainland.',
 'The island was first settled in the 18th century by Arabian merchants, who named it "Jazira".',
 'During British occupation the island was named after the then Deputy Commissioner of Chittagong Mr. Martin as St. Martin Island.',
 '[3] Local names of the island are "Narikel Jinjira"[4] which means "Coconut Island", and "Daruchini Dwip" which means "Cinnamon Island".',
 'It is the only coral island in Bangladesh.']

In [6]:
len(sentences)

5

#### Stemmer

In [7]:
stemmer = PorterStemmer()

In [8]:
stemmer.stem('goes')

'goe'

#### Lemmatizer

In [9]:
# Download the 'wordnet' package, then create the lemmatizer that later cells reuse.
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /home/utpal108/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [10]:
lemmatizer.lemmatize('goes')

'go'

## Remove unnecessary words

In [1]:
import re

In [2]:
# Non-greedy pattern matching anything between '<' and '>'.
# NOTE(review): `pattern` is not used by any other cell visible here; the cleaning
# cell calls re.sub with a different pattern.
pattern = re.compile('<.*?>')
pattern

re.compile(r'<.*?>', re.UNICODE)

In [12]:
# Normalize every sentence: each non-letter character becomes a space, the text is
# lower-cased, and leading/trailing whitespace is trimmed (inner runs of spaces stay).
corpus = [
    re.sub('[^a-zA-Z]', ' ', sentence).lower().strip()
    for sentence in sentences
]

In [13]:
corpus

['millennia ago  the island used to be an extension of the teknaf peninsula  but at a later time some portion of this peninsula got submerged and thus the southernmost part of the aforementioned peninsula became an island  and was disconnected from the bangladesh mainland',
 'the island was first settled in the   th century by arabian merchants  who named it  jazira',
 'during british occupation the island was named after the then deputy commissioner of chittagong mr  martin as st  martin island',
 'local names of the island are  narikel jinjira     which means  coconut island   and  daruchini dwip  which means  cinnamon island',
 'it is the only coral island in bangladesh']

In [14]:
nltk.word_tokenize('   hello    my name is hdhd')

['hello', 'my', 'name', 'is', 'hdhd']

In [15]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/utpal108/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [17]:
# Stemming: tokenize each cleaned sentence, drop English stop words, and keep the
# Porter stem of every remaining token (one flat list across all sentences).
# The stop-word list is built once as a set; calling stopwords.words('english')
# inside the loop re-created the list and scanned it linearly for every token.
english_stopwords = set(stopwords.words('english'))
streammed = [
    stemmer.stem(word)
    for sentence in corpus
    for word in nltk.word_tokenize(sentence)
    if word not in english_stopwords
]

In [18]:
streammed

['millennia',
 'ago',
 'island',
 'use',
 'extens',
 'teknaf',
 'peninsula',
 'later',
 'time',
 'portion',
 'peninsula',
 'got',
 'submerg',
 'thu',
 'southernmost',
 'part',
 'aforement',
 'peninsula',
 'becam',
 'island',
 'disconnect',
 'bangladesh',
 'mainland',
 'island',
 'first',
 'settl',
 'th',
 'centuri',
 'arabian',
 'merchant',
 'name',
 'jazira',
 'british',
 'occup',
 'island',
 'name',
 'deputi',
 'commission',
 'chittagong',
 'mr',
 'martin',
 'st',
 'martin',
 'island',
 'local',
 'name',
 'island',
 'narikel',
 'jinjira',
 'mean',
 'coconut',
 'island',
 'daruchini',
 'dwip',
 'mean',
 'cinnamon',
 'island',
 'coral',
 'island',
 'bangladesh']

In [19]:
# Lemmatization: tokenize each cleaned sentence, drop English stop words, and keep
# the WordNet lemma of every remaining token (one flat list across all sentences).
# The stop-word list is built once as a set; calling stopwords.words('english')
# inside the loop re-created the list and scanned it linearly for every token.
english_stopwords = set(stopwords.words('english'))
lemmatized = [
    lemmatizer.lemmatize(word)
    for sentence in corpus
    for word in nltk.word_tokenize(sentence)
    if word not in english_stopwords
]

In [20]:
lemmatized

['millennium',
 'ago',
 'island',
 'used',
 'extension',
 'teknaf',
 'peninsula',
 'later',
 'time',
 'portion',
 'peninsula',
 'got',
 'submerged',
 'thus',
 'southernmost',
 'part',
 'aforementioned',
 'peninsula',
 'became',
 'island',
 'disconnected',
 'bangladesh',
 'mainland',
 'island',
 'first',
 'settled',
 'th',
 'century',
 'arabian',
 'merchant',
 'named',
 'jazira',
 'british',
 'occupation',
 'island',
 'named',
 'deputy',
 'commissioner',
 'chittagong',
 'mr',
 'martin',
 'st',
 'martin',
 'island',
 'local',
 'name',
 'island',
 'narikel',
 'jinjira',
 'mean',
 'coconut',
 'island',
 'daruchini',
 'dwip',
 'mean',
 'cinnamon',
 'island',
 'coral',
 'island',
 'bangladesh']

### Text Vectorization Using Bag of Words

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
# binary=True: each entry records only whether a term occurs in the sentence (0 or 1),
# not how many times it occurs.
cv = CountVectorizer(binary=True)

In [22]:
## Apply stop-word removal with lemmatization
# Rebuild each cleaned sentence as a space-joined string of lemmas with English stop
# words removed, giving one document string per original sentence.
# The stop-word list is built once as a set; calling stopwords.words('english')
# inside the comprehension re-created the list for every token.
english_stopwords = set(stopwords.words('english'))
new_corpus = [
    ' '.join(
        lemmatizer.lemmatize(word)
        for word in nltk.word_tokenize(sentence)
        if word not in english_stopwords
    )
    for sentence in corpus
]

In [23]:
new_corpus[0]

'millennium ago island used extension teknaf peninsula later time portion peninsula got submerged thus southernmost part aforementioned peninsula became island disconnected bangladesh mainland'

In [24]:
corpus[0]

'millennia ago  the island used to be an extension of the teknaf peninsula  but at a later time some portion of this peninsula got submerged and thus the southernmost part of the aforementioned peninsula became an island  and was disconnected from the bangladesh mainland'

In [25]:
X = cv.fit_transform(new_corpus)

In [26]:
X[0].toarray()

array([[1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0,
        1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1,
        1, 1]])

In [27]:
cv.vocabulary_

{'millennium': 28,
 'ago': 1,
 'island': 19,
 'used': 45,
 'extension': 16,
 'teknaf': 41,
 'peninsula': 35,
 'later': 22,
 'time': 44,
 'portion': 36,
 'got': 18,
 'submerged': 40,
 'thus': 43,
 'southernmost': 38,
 'part': 34,
 'aforementioned': 0,
 'became': 4,
 'disconnected': 14,
 'bangladesh': 3,
 'mainland': 24,
 'first': 17,
 'settled': 37,
 'th': 42,
 'century': 6,
 'arabian': 2,
 'merchant': 27,
 'named': 31,
 'jazira': 20,
 'british': 5,
 'occupation': 33,
 'deputy': 13,
 'commissioner': 10,
 'chittagong': 7,
 'mr': 29,
 'martin': 25,
 'st': 39,
 'local': 23,
 'name': 30,
 'narikel': 32,
 'jinjira': 21,
 'mean': 26,
 'coconut': 9,
 'daruchini': 12,
 'dwip': 15,
 'cinnamon': 8,
 'coral': 11}

### Text Vectorization Using TF-IDF

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit a TF-IDF vectorizer on the lemmatized, stop-word-free sentences.
# NOTE(review): this rebinds `cv` and `X`, replacing the bag-of-words vectorizer and
# matrix from the earlier cells; re-running those display cells afterwards would show
# TF-IDF results instead.
cv = TfidfVectorizer()
X = cv.fit_transform(new_corpus)

In [29]:
new_corpus[0]

'millennium ago island used extension teknaf peninsula later time portion peninsula got submerged thus southernmost part aforementioned peninsula became island disconnected bangladesh mainland'

In [30]:
X[0].toarray()

array([[0.19048778, 0.19048778, 0.        , 0.15368434, 0.19048778,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.19048778,
        0.        , 0.19048778, 0.        , 0.19048778, 0.1815369 ,
        0.        , 0.        , 0.19048778, 0.        , 0.19048778,
        0.        , 0.        , 0.        , 0.19048778, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.19048778,
        0.57146335, 0.19048778, 0.        , 0.19048778, 0.        ,
        0.19048778, 0.19048778, 0.        , 0.19048778, 0.19048778,
        0.19048778]])