In [None]:
import nltk
from nltk.corpus import names

# Fetch the 'names' corpus into the local nltk_data directory (no-op when it
# is already up to date). The corpus object imported above is only read when
# it is first used, so importing it before the download is safe.
nltk.download('names')

# Display every entry of the corpus as the cell output.
names.words()

[nltk_data] Downloading package names to /root/nltk_data...
[nltk_data]   Package names is already up-to-date!


['Abagael',
 'Abagail',
 'Abbe',
 'Abbey',
 'Abbi',
 'Abbie',
 'Abby',
 'Abigael',
 'Abigail',
 'Abigale',
 'Abra',
 'Acacia',
 'Ada',
 'Adah',
 'Adaline',
 'Adara',
 'Addie',
 'Addis',
 'Adel',
 'Adela',
 'Adelaide',
 'Adele',
 'Adelice',
 'Adelina',
 'Adelind',
 'Adeline',
 'Adella',
 'Adelle',
 'Adena',
 'Adey',
 'Adi',
 'Adiana',
 'Adina',
 'Adora',
 'Adore',
 'Adoree',
 'Adorne',
 'Adrea',
 'Adria',
 'Adriaens',
 'Adrian',
 'Adriana',
 'Adriane',
 'Adrianna',
 'Adrianne',
 'Adrien',
 'Adriena',
 'Adrienne',
 'Aeriel',
 'Aeriela',
 'Aeriell',
 'Ag',
 'Agace',
 'Agata',
 'Agatha',
 'Agathe',
 'Aggi',
 'Aggie',
 'Aggy',
 'Agna',
 'Agnella',
 'Agnes',
 'Agnese',
 'Agnesse',
 'Agneta',
 'Agnola',
 'Agretha',
 'Aida',
 'Aidan',
 'Aigneis',
 'Aila',
 'Aile',
 'Ailee',
 'Aileen',
 'Ailene',
 'Ailey',
 'Aili',
 'Ailina',
 'Ailyn',
 'Aime',
 'Aimee',
 'Aimil',
 'Aina',
 'Aindrea',
 'Ainslee',
 'Ainsley',
 'Ainslie',
 'Ajay',
 'Alaine',
 'Alameda',
 'Alana',
 'Alanah',
 'Alane',
 'Alanna',
 

**Simple Text Classification**

In [None]:
def gender_features(word):
    """Build the feature dictionary used by the name-gender classifier.

    Args:
        word: The name (any string) to extract features from.

    Returns:
        A dict with a single key, ``'last_letter'``, holding the final
        character of ``word``. An empty string yields ``''`` instead of
        raising ``IndexError``.
    """
    # Slicing (rather than indexing with -1) keeps empty input from crashing
    # while returning the same single character for any non-empty string.
    return {'last_letter': word[-1:]}

In [None]:
# Sanity check: the only extracted feature is the final character of the input.
gender_features('obama')

{'last_letter': 'a'}

In [None]:
# Second check with a different ending letter.
gender_features('pranjal')

{'last_letter': 'l'}

In [None]:
# Number of entries returned by names.words() when no file is specified.
len(names.words())

7944

In [None]:
# Pair every name with the label of the corpus file it came from:
# all 'male.txt' entries first, followed by all 'female.txt' entries.
labeled_names = [(name, 'male') for name in names.words('male.txt')]
labeled_names += [(name, 'female') for name in names.words('female.txt')]


In [None]:
import random

# Shuffle with a dedicated fixed-seed generator so the train/test split (and
# therefore the reported accuracy) is the same on every Restart & Run All.
# The list is still shuffled in place, exactly as before.
random.Random(42).shuffle(labeled_names)

# Turn each (name, label) pair into a (feature dict, label) pair.
featuresets = [(gender_features(name), gender) for (name, gender) in labeled_names]

In [None]:
# The first 5000 shuffled examples train the model; the last 2000 are set
# aside for evaluation.
train_set = featuresets[:5000]
test_set = featuresets[-2000:]

# Fit a Naive Bayes classifier on the training examples.
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [None]:
# Classify a single name, using the same feature extractor as for training.
classifier.classify(gender_features('David'))

'male'

In [None]:
# Another single-name prediction; only the last letter ('l') reaches the model.
classifier.classify(gender_features('Pranjal'))


'male'

In [None]:
# NOTE(review): the model sees only the last letter ('y'), so the prediction
# here reflects that letter alone rather than the whole name.
classifier.classify(gender_features('Tommy'))

'female'

In [None]:
# Accuracy of the trained classifier on the examples in test_set.
print(nltk.classify.accuracy(classifier, test_set))

0.7525


**Count Vectorizer**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# binary=True is passed so the vectorizer is configured for binary output.
vect = CountVectorizer(binary=True)

# Two-sentence toy corpus used to build the vocabulary.
corpus = [
    "Tessaract is good optical character recognition engine  ",
    "optical character recognition is significant ",
]

In [None]:
# Fit the vectorizer on the two corpus sentences.
vect.fit(corpus)

CountVectorizer(analyzer='word', binary=True, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [None]:
# Mapping produced by the fit: token -> integer index.
vocab = vect.vocabulary_


In [None]:
# List the vocabulary alphabetically as "token:index" lines. Tokens are
# unique, so sorting the (token, index) pairs orders them by token.
for term, column in sorted(vocab.items()):
    print("{}:{}".format(term, column))

character:0
engine:1
good:2
is:3
optical:4
recognition:5
significant:6
tessaract:7


In [None]:
# Encode a new sentence against the fitted vocabulary. Words absent from the
# corpus ('this', 'illusion') have no column, and the configured token pattern
# requires at least two word characters, so 'a' is not a token at all.
print(vect.transform(["This is a good optical illusion"]).toarray())

[[0 0 1 1 1 0 0 0]]


**Similarity Between Documents**

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Encode both sentences with the already-fitted vectorizer, then compare the
# two resulting vectors.
first_vector = vect.transform(["Google Cloud Vision is a character recognition engine"]).toarray()
second_vector = vect.transform(["OCR is an optical character recognition engine"]).toarray()

similarity = cosine_similarity(first_vector, second_vector)
print(similarity)

[[0.89442719]]
