In [1]:
#Task:- Crawling web page

In [2]:
from html.parser import HTMLParser  
from urllib.request import urlopen  
from urllib import parse
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re, math
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer


class LinkParser(HTMLParser):

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for (key, value) in attrs:
                if key == 'href':
                    newUrl = parse.urljoin(self.baseUrl, value)
                    self.links = self.links + [newUrl]

    def getLinks(self, url):
        self.links = []
        self.baseUrl = url
        response = urlopen(url)
        if response.getheader('Content-Type')=='text/html':
            htmlBytes = response.read()
            htmlString = htmlBytes.decode("utf-8")
            self.feed(htmlString)
            return htmlString, self.links
        else:
            return "",[]

WORD = re.compile(r'\w+')
cosine_vals = []

def get_cosine(vec1, vec2):
     intersection = set(vec1.keys()) & set(vec2.keys())
     numerator = sum([vec1[x] * vec2[x] for x in intersection])

     sum1 = sum([vec1[x]**2 for x in vec1.keys()])
     sum2 = sum([vec2[x]**2 for x in vec2.keys()])
     denominator = math.sqrt(sum1) * math.sqrt(sum2)

     if not denominator:
        return 0.0
     else:
        return float(numerator) / denominator

def text_to_vector(text):
     words = WORD.findall(text)
     return Counter(words)
 
    
    
def spider(url, maxPages):
    pagesToVisit = [url]
    numberVisited = 0
    print(url)
    print(maxPages)
    
    query = "We make sure website fast , secure & always -so visitors & search engines trust"
    while numberVisited < maxPages and pagesToVisit != []:# and not foundWord:
        numberVisited = numberVisited + 1
        url = pagesToVisit[0]
        pagesToVisit = pagesToVisit[1:]
        
        print(str(numberVisited) + ". Visiting:", url)
        parser = LinkParser()
        data, links = parser.getLinks(url)
        web_data = BeautifulSoup(data,'lxml')    
        
        complete_text = ""
        for para in web_data.find_all('p'):
            complete_text = complete_text + para.text
        
        pagesToVisit = pagesToVisit + links
        
        stop_words = set(stopwords.words('english'))
        word_tokens = word_tokenize(complete_text)
        
        filtered_sentence = []
        
        for w in word_tokens:
            if w not in stop_words:
                filtered_sentence.append(w)
        
        filename = "File " + str(numberVisited) + ".txt"
        f = open(filename, "w+")
        f.write(" ".join(str(x) for x in filtered_sentence))
        f.close()
        
        
        vector1 = text_to_vector(query)
        vector2 = text_to_vector(" ".join(str(x) for x in filtered_sentence))
        
        cosine = get_cosine(vector1, vector2)
        
        print('Cosine:', cosine)
        
        cosine_vals.append([cosine, url])

def print_cosine():
    print(cosine_vals)

def print_decreasing():
    cosine_vals.sort(reverse = True)
    
    for url in cosine_vals:
        print(url[1])        
        
def calculate_similarity(maxPages, query):
    # Bring in standard stopwords
    stopWords = stopwords.words('english')
    
    print ("\nCalculating document similarity scores...")

    train_set = [query]
    for i in range(maxPages):
        # Open and read a bunch of files 
        f = open('File ' + str(i+1) + '.txt')
        doc = str(f.read())
        train_set.append(doc)

    # Set up the vectoriser, passing in the stop words
    tfidf_vectorizer = TfidfVectorizer(stop_words=stopWords)
    
    # Apply the vectoriser to the training set
    tfidf_matrix_train = tfidf_vectorizer.fit_transform(train_set)
    
    # Print the score
    print ("\nSimilarity Score [*] ",cosine_similarity(tfidf_matrix_train[0:1], tfidf_matrix_train))

In [3]:
#spider("url", maxPages)
spider("http://www.dreamhost.com", 5)

http://www.dreamhost.com
5
1. Visiting: http://www.dreamhost.com
Cosine: 0.2050282630438012
2. Visiting: https://www.dreamhost.com/domains/club/
Cosine: 0.04705270219194202
3. Visiting: http://www.dreamhost.com
Cosine: 0.2050282630438012
4. Visiting: http://www.dreamhost.com/wordpress/
Cosine: 0.08421519210665192
5. Visiting: http://www.dreamhost.com/wordpress/
Cosine: 0.08421519210665192


In [4]:
#Task:- DIFFERENT CORPUS

In [5]:
import nltk

In [6]:
from nltk.corpus import brown

In [7]:
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [8]:
for i in range(5):
    print(brown.categories()[i])

adventure
belles_lettres
editorial
fiction
government


In [9]:
brown.words(categories='humor') 

['It', 'was', 'among', 'these', 'that', 'Hinkle', ...]

In [10]:
brown.words(categories='humor') [:20]

['It',
 'was',
 'among',
 'these',
 'that',
 'Hinkle',
 'identified',
 'a',
 'photograph',
 'of',
 'Barco',
 '!',
 '!',
 'For',
 'it',
 'seems',
 'that',
 'Barco',
 ',',
 'fancying']

In [11]:
brown.words(categories = 'adventure')[:80]

['Dan',
 'Morgan',
 'told',
 'himself',
 'he',
 'would',
 'forget',
 'Ann',
 'Turner',
 '.',
 'He',
 'was',
 'well',
 'rid',
 'of',
 'her',
 '.',
 'He',
 'certainly',
 "didn't",
 'want',
 'a',
 'wife',
 'who',
 'was',
 'fickle',
 'as',
 'Ann',
 '.',
 'If',
 'he',
 'had',
 'married',
 'her',
 ',',
 "he'd",
 'have',
 'been',
 'asking',
 'for',
 'trouble',
 '.',
 'But',
 'all',
 'of',
 'this',
 'was',
 'rationalization',
 '.',
 'Sometimes',
 'he',
 'woke',
 'up',
 'in',
 'the',
 'middle',
 'of',
 'the',
 'night',
 'thinking',
 'of',
 'Ann',
 ',',
 'and',
 'then',
 'could',
 'not',
 'get',
 'back',
 'to',
 'sleep',
 '.',
 'His',
 'plans',
 'and',
 'dreams',
 'had',
 'revolved',
 'around',
 'her']

In [12]:
from nltk.corpus import inaugural

In [13]:
inaugural.fileids()[:10]

['1789-Washington.txt',
 '1793-Washington.txt',
 '1797-Adams.txt',
 '1801-Jefferson.txt',
 '1805-Jefferson.txt',
 '1809-Madison.txt',
 '1813-Madison.txt',
 '1817-Monroe.txt',
 '1821-Monroe.txt',
 '1825-Adams.txt']

In [14]:
inaugural.words(fileids='2009-Obama.txt')[:20]

['My',
 'fellow',
 'citizens',
 ':',
 'I',
 'stand',
 'here',
 'today',
 'humbled',
 'by',
 'the',
 'task',
 'before',
 'us',
 ',',
 'grateful',
 'for',
 'the',
 'trust',
 'you']

In [15]:
a = ""
for i in range(100): 
    a = a + inaugural.words(fileids = '2009-Obama.txt')[i] + " "
print(a)

My fellow citizens : I stand here today humbled by the task before us , grateful for the trust you have bestowed , mindful of the sacrifices borne by our ancestors . I thank President Bush for his service to our nation , as well as the generosity and cooperation he has shown throughout this transition . Forty - four Americans have now taken the presidential oath . The words have been spoken during rising tides of prosperity and the still waters of peace . Yet , every so often the oath is taken amidst gathering clouds and raging storms 


In [16]:
from nltk.corpus import reuters

In [17]:
from nltk.corpus import stopwords

In [18]:
stopwords.words('german')[:10]

['aber', 'alle', 'allem', 'allen', 'aller', 'alles', 'als', 'also', 'am', 'an']

In [19]:
import jieba

ModuleNotFoundError: No module named 'jieba'

In [20]:
seg = jieba.cut('医生或科学家检测用的', cut_all=True)
print(" ".join(seg))

NameError: name 'jieba' is not defined

In [21]:
entries=nltk.corpus.cmudict.entries()
len(entries)

133737

In [22]:
for entry in entries[10000:10015]:
    print(entry)

('belford', ['B', 'EH1', 'L', 'F', 'ER0', 'D'])
('belfry', ['B', 'EH1', 'L', 'F', 'R', 'IY0'])
('belgacom', ['B', 'EH1', 'L', 'G', 'AH0', 'K', 'AA0', 'M'])
('belgacom', ['B', 'EH1', 'L', 'JH', 'AH0', 'K', 'AA0', 'M'])
('belgard', ['B', 'EH0', 'L', 'G', 'AA1', 'R', 'D'])
('belgarde', ['B', 'EH0', 'L', 'G', 'AA1', 'R', 'D', 'IY0'])
('belge', ['B', 'EH1', 'L', 'JH', 'IY0'])
('belger', ['B', 'EH1', 'L', 'G', 'ER0'])
('belgian', ['B', 'EH1', 'L', 'JH', 'AH0', 'N'])
('belgians', ['B', 'EH1', 'L', 'JH', 'AH0', 'N', 'Z'])
('belgique', ['B', 'EH0', 'L', 'ZH', 'IY1', 'K'])
("belgique's", ['B', 'EH0', 'L', 'JH', 'IY1', 'K', 'S'])
('belgium', ['B', 'EH1', 'L', 'JH', 'AH0', 'M'])
("belgium's", ['B', 'EH1', 'L', 'JH', 'AH0', 'M', 'Z'])
('belgo', ['B', 'EH1', 'L', 'G', 'OW2'])


In [23]:
from nltk.corpus import wordnet as wn
wn.synsets('motorcar')

[Synset('car.n.01')]

In [24]:
wn.synset('car.n.01').lemma_names()

['car', 'auto', 'automobile', 'machine', 'motorcar']

In [25]:
nltk.download('names')
from nltk.corpus import names
import random
labeled_names=([(name,'male') for name in names.words('male.txt')]+[(name,'female') for name in names.words('female.txt')])
random.shuffle(labeled_names)

[nltk_data] Downloading package names to
[nltk_data]     C:\Users\UTKARSH\AppData\Roaming\nltk_data...
[nltk_data]   Package names is already up-to-date!


In [26]:
labeled_names[:10],

([('Rosina', 'female'),
  ('Siana', 'female'),
  ('Bartlett', 'male'),
  ('Trace', 'female'),
  ('Philomena', 'female'),
  ('Emmeline', 'female'),
  ('Brit', 'female'),
  ('Suzy', 'female'),
  ('Sheelagh', 'female'),
  ('Laureen', 'female')],)

In [27]:
 labeled_names[-10:]

[('Tera', 'female'),
 ('Geo', 'male'),
 ('Fae', 'female'),
 ('Merla', 'female'),
 ('Shayla', 'female'),
 ('Ignacius', 'male'),
 ('Shari', 'female'),
 ('Beatrisa', 'female'),
 ('Nicolette', 'female'),
 ('Stern', 'male')]

In [28]:
#Task:- Stemmers, Tokenizers and Lemmatizers

In [29]:
from nltk.stem import PorterStemmer
stemmerporter = PorterStemmer()
from nltk.stem import WordNetLemmatizer

In [30]:
stemmerporter.stem('apples')

'appl'

In [31]:
stemmerporter.stem('utkarsh')

'utkarsh'

In [32]:
stemmerporter.stem('hello')

'hello'

In [33]:
stemmerporter.stem('programming')

'program'

In [34]:
stemmerporter.stem('harass')

'harass'

In [35]:
stemmerporter.stem('sing')

'sing'

In [36]:
stemmerporter.stem('singing')

'sing'

In [37]:
stemmerporter.stem('utkarshnies')

'utkarshni'

In [38]:
stemmerporter.stem('utkarshes')

'utkarsh'

In [39]:
stemmerporter.stem('utkarshing')

'utkarsh'

In [40]:
stemmerporter.stem('ants')

'ant'

In [41]:
stemmerporter.stem('cups')

'cup'

In [42]:
stemmerporter.stem('cupper')

'cupper'

In [43]:
stemmerporter.stem('cupes')

'cupe'

In [44]:
stemmerporter.stem('bushes')

'bush'

In [45]:
stemmerporter.stem('couples')

'coupl'

In [46]:
stemmerporter.stem('ashes')

'ash'

In [47]:
stemmerporter.stem('dances')

'danc'

In [48]:
stemmerporter.stem('manages')

'manag'

In [49]:
stemmerporter.stem('cues')

'cue'

In [50]:
stemmerporter.stem('computer')

'comput'

In [51]:
stemmerporter.stem('computing')

'comput'

In [52]:
stemmerporter.stem('computers')

'comput'

In [53]:
text = "In a country like India, where a mass of population lives in the villages, there are times when parents have work to take care of and cannot keep an eye on their children all day round. In many cases, when even the females in houses have to go to work, but cannot always carry their children along with them, they have no other choice but to leave them back at their homes. Due to these reasons and more, there has been a rapid increase in security risks of children recently. The current statistics report around 174 child missing cases on an everyday basis and about 505 are still not found in India. Is there a way to improve this situation? Is there any way by which parents can keep a track on their child even while not being around? Our project mainly aims at helping these parents out, so we propose a smart child tracking wearable system which helps them to keep track of the live location of the child at any given time. This system is implemented as a wearable band-like device which when wore by the child, helps in locating it."

In [54]:
text = [stemmerporter.stem(token) for token in text.split(" ")]

In [55]:
text = " ".join(text)
text

'In a countri like india, where a mass of popul live in the villages, there are time when parent have work to take care of and cannot keep an eye on their children all day round. In mani cases, when even the femal in hous have to go to work, but cannot alway carri their children along with them, they have no other choic but to leav them back at their homes. due to these reason and more, there ha been a rapid increas in secur risk of children recently. the current statist report around 174 child miss case on an everyday basi and about 505 are still not found in india. Is there a way to improv thi situation? Is there ani way by which parent can keep a track on their child even while not be around? our project mainli aim at help these parent out, so we propos a smart child track wearabl system which help them to keep track of the live locat of the child at ani given time. thi system is implement as a wearabl band-lik devic which when wore by the child, help in locat it.'

In [56]:
lemmatizer = WordNetLemmatizer()

In [57]:
text = "In a country like India, where a mass of population lives in the villages, there are times when parents have work to take care of and cannot keep an eye on their children all day round. In many cases, when even the females in houses have to go to work, but cannot always carry their children along with them, they have no other choice but to leave them back at their homes. Due to these reasons and more, there has been a rapid increase in security risks of children recently. The current statistics report around 174 child missing cases on an everyday basis and about 505 are still not found in India. Is there a way to improve this situation? Is there any way by which parents can keep a track on their child even while not being around? Our project mainly aims at helping these parents out, so we propose a smart child tracking wearable system which helps them to keep track of the live location of the child at any given time. This system is implemented as a wearable band-like device which when wore by the child, helps in locating it."
text = [lemmatizer.lemmatize(token) for token in text.split(" ")]
text = " ".join(text)
text

'In a country like India, where a mass of population life in the villages, there are time when parent have work to take care of and cannot keep an eye on their child all day round. In many cases, when even the female in house have to go to work, but cannot always carry their child along with them, they have no other choice but to leave them back at their homes. Due to these reason and more, there ha been a rapid increase in security risk of child recently. The current statistic report around 174 child missing case on an everyday basis and about 505 are still not found in India. Is there a way to improve this situation? Is there any way by which parent can keep a track on their child even while not being around? Our project mainly aim at helping these parent out, so we propose a smart child tracking wearable system which help them to keep track of the live location of the child at any given time. This system is implemented a a wearable band-like device which when wore by the child, help

In [58]:
print(lemmatizer.lemmatize('better'))

better


In [59]:
print(lemmatizer.lemmatize('better', pos = 'a'))

good


In [60]:
text = "In a country like India, where a mass of population lives in the villages, there are times when parents have work to take care of and cannot keep an eye on their children all day round. In many cases, when even the females in houses have to go to work, but cannot always carry their children along with them, they have no other choice but to leave them back at their homes. Due to these reasons and more, there has been a rapid increase in security risks of children recently. The current statistics report around 174 child missing cases on an everyday basis and about 505 are still not found in India. Is there a way to improve this situation? Is there any way by which parents can keep a track on their child even while not being around? Our project mainly aims at helping these parents out, so we propose a smart child tracking wearable system which helps them to keep track of the live location of the child at any given time. This system is implemented as a wearable band-like device which when wore by the child, helps in locating it."
text = [lemmatizer.lemmatize(token) for token in text.split(" ")]
text = " ".join(text)
print(lemmatizer.lemmatize(text, pos = 'a'))

In a country like India, where a mass of population life in the villages, there are time when parent have work to take care of and cannot keep an eye on their child all day round. In many cases, when even the female in house have to go to work, but cannot always carry their child along with them, they have no other choice but to leave them back at their homes. Due to these reason and more, there ha been a rapid increase in security risk of child recently. The current statistic report around 174 child missing case on an everyday basis and about 505 are still not found in India. Is there a way to improve this situation? Is there any way by which parent can keep a track on their child even while not being around? Our project mainly aim at helping these parent out, so we propose a smart child tracking wearable system which help them to keep track of the live location of the child at any given time. This system is implemented a a wearable band-like device which when wore by the child, help 

In [61]:
text = "mice"
text = [lemmatizer.lemmatize(token) for token in text.split(" ")]
text = " ".join(text)
text

'mouse'

In [62]:
from nltk.stem.snowball import SnowballStemmer
snowstemmer = SnowballStemmer('english')

In [63]:
snowstemmer.stem('apples')

'appl'

In [64]:
snowstemmer.stem('manages')

'manag'

In [65]:
snowstemmer.stem('sing')

'sing'

In [66]:
snowstemmer.stem('cling')

'cling'

In [67]:
snowstemmer.stem('singing')

'sing'

In [68]:
snowstemmer.stem('clinging')

'cling'

In [69]:
snowstemmer.stem('shingling')

'shingl'

In [70]:
snowstemmer.stem('ashes')

'ash'

In [71]:
snowstemmer.stem('shuffles')

'shuffl'

In [72]:
snowstemmer.stem('cues')

'cue'

In [73]:
snowstemmer.stem('dumbles')

'dumbl'

In [74]:
snowstemmer.stem('queues')

'queue'

In [75]:
snowstemmer.stem('rues')

'rue'

In [76]:
snowstemmer.stem('clues')

'clue'

In [77]:
snowstemmer.stem('happiness')

'happi'

In [78]:
snowstemmer.stem('happyness')

'happy'

In [79]:
snowstemmer.stem('stupidity')

'stupid'

In [80]:
snowstemmer.stem('cruelty')

'cruelti'

In [81]:
from nltk.stem.lancaster import LancasterStemmer
stemmer = LancasterStemmer()

In [82]:
stemmer.stem('queues')

'queu'

In [83]:
stemmer.stem('apples')

'appl'

In [84]:
stemmer.stem('cues')

'cue'

In [85]:
stemmer.stem('rues')

'rue'

In [86]:
stemmer.stem('clues')

'clu'

In [87]:
stemmer.stem('dumbles')

'dumbl'

In [88]:
stemmer.stem('happiness')

'happy'

In [89]:
stemmer.stem('sadness')

'sad'

In [90]:
stemmer.stem('sadddddness')

'sadddd'

In [91]:
stemmer.stem('saddness')

'sad'

In [92]:
stemmer.stem('saddddness')

'saddd'

In [93]:
stemmer.stem('computer')

'comput'

In [94]:
stemmer.stem('computing')

'comput'

In [95]:
stemmer.stem('stapler')

'stapl'

In [96]:
stemmer.stem('cwjwcber')

'cwjwcber'

In [97]:
stemmer.stem('uber')

'ub'

In [98]:
stemmer.stem('oober')

'oob'

In [99]:
from nltk.stem import RegexpStemmer
stemmer = RegexpStemmer('es')

In [100]:
stemmer.stem('apples')

'appl'

In [101]:
stemmer.stem('xsjbkdcwes')

'xsjbkdcw'

In [102]:
stemmer.stem('cwjwcbes')

'cwjwcb'

In [103]:
stemmer.stem('cwjwcber')

'cwjwcber'

In [104]:
from nltk.tokenize import TweetTokenizer
text = 'The greatness of kaichou wa maid sama >.< <3 :D #awesome @user32'
twtkn = TweetTokenizer()
twtkn.tokenize(text)

['The',
 'greatness',
 'of',
 'kaichou',
 'wa',
 'maid',
 'sama',
 '>',
 '.',
 '<',
 '<3',
 ':D',
 '#awesome',
 '@user32']

In [105]:
texts = "I read the newspaper with him. We read three pages back to back. Brat came for the breakfast too late. He has his own pace of doing things."
sent_text=nltk.sent_tokenize(texts)
for sentence in sent_text:
    tokenized_text = nltk.word_tokenize(sentence)
    tagged = nltk.pos_tag(tokenized_text)
    print(tagged)

[('I', 'PRP'), ('read', 'VBP'), ('the', 'DT'), ('newspaper', 'NN'), ('with', 'IN'), ('him', 'PRP'), ('.', '.')]
[('We', 'PRP'), ('read', 'VBP'), ('three', 'CD'), ('pages', 'NNS'), ('back', 'RB'), ('to', 'TO'), ('back', 'VB'), ('.', '.')]
[('Brat', 'NNP'), ('came', 'VBD'), ('for', 'IN'), ('the', 'DT'), ('breakfast', 'NN'), ('too', 'RB'), ('late', 'RB'), ('.', '.')]
[('He', 'PRP'), ('has', 'VBZ'), ('his', 'PRP$'), ('own', 'JJ'), ('pace', 'NN'), ('of', 'IN'), ('doing', 'VBG'), ('things', 'NNS'), ('.', '.')]


In [106]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

example = "The cat was chasing the mice"
example = [lemmatizer.lemmatize(token) for token in example.split(" ")]

In [107]:
print(" ".join(example))

The cat wa chasing the mouse


In [108]:
print(lemmatizer.lemmatize('better'))

better


In [109]:
print(lemmatizer.lemmatize('better', pos='a'))

good


In [110]:
from sklearn.feature_extraction.text import CountVectorizer

In [111]:
vect = CountVectorizer(binary=True)
corpus=["Tessaract is a good optical character recognition engine", "Optical character recognition is significant "]
vect.fit(corpus)
print(vect.transform(["Today is good optical"]).toarray())
print(vect.transform(corpus).toarray())

[[0 0 1 1 1 0 0 0]]
[[1 1 1 1 1 1 0 1]
 [1 0 0 1 1 1 1 0]]


In [112]:
vocab=vect.vocabulary_
for key in sorted(vocab.keys()):
    print("{}:{}".format(key, vocab[key]))

character:0
engine:1
good:2
is:3
optical:4
recognition:5
significant:6
tessaract:7


In [113]:
#Task:- Vectorizer and cosine similarity

In [114]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [115]:
vector = CountVectorizer(binary = True)
corpus = ["In a country like India, where a mass of population lives in the villages", "there are times when parents have work to take care of and cannot keep an eye on their children all day round", "In many cases, when even the females in houses have to go to work, but cannot always carry their children along with them", "they have no other choice but to leave them back at their homes. Due to these reasons and more, there has been a rapid increase in security risks of children recently"]
vector.fit(corpus)
print(vector.transform(["country population mass times parents parents parents hello"]).toarray())
print(vector.transform(["country population mass times parents parents parents hello"]))

[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
  0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0]]
  (0, 16)	1
  (0, 35)	1
  (0, 41)	1
  (0, 42)	1
  (0, 56)	1


In [116]:
vector = CountVectorizer(binary = False)
corpus = ["In a country like India, where a mass of population lives in the villages", "there are times when parents have work to take care of and cannot keep an eye on their children all day round", "In many cases, when even the females in houses have to go to work, but cannot always carry their children along with them", "they have no other choice but to leave them back at their homes. Due to these reasons and more, there has been a rapid increase in security risks of children recently"]
vector.fit(corpus)
v1 = vector.transform(["country population mass times parents parents parents hello"]).toarray()
print(v1)
v2 = vector.transform(["hello world country population so scared"]).toarray()
print(vector.transform(["country population mass times parents parents parents hello"]))

[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
  0 0 0 0 0 3 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0]]
  (0, 16)	1
  (0, 35)	1
  (0, 41)	3
  (0, 42)	1
  (0, 56)	1


In [117]:
similarity = cosine_similarity(v1, v2)

In [118]:
print(similarity)

[[0.39223227]]


In [119]:
#Task:- Classifier

In [120]:
from nltk import NaiveBayesClassifier
def last_letter(word):
    return {'last_letter':word[-1]}

In [121]:
featuresets=[(last_letter(n), gender) for (n, gender) in labeled_names]
featuresets[:10]

[({'last_letter': 'a'}, 'female'),
 ({'last_letter': 'a'}, 'female'),
 ({'last_letter': 't'}, 'male'),
 ({'last_letter': 'e'}, 'female'),
 ({'last_letter': 'a'}, 'female'),
 ({'last_letter': 'e'}, 'female'),
 ({'last_letter': 't'}, 'female'),
 ({'last_letter': 'y'}, 'female'),
 ({'last_letter': 'h'}, 'female'),
 ({'last_letter': 'n'}, 'female')]

In [122]:
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = NaiveBayesClassifier.train(train_set)

In [123]:
classifier.classify(last_letter('James'))
classifier.classify(last_letter('Jessie'))
print(nltk.classify.accuracy(classifier, test_set))

0.76
