In [1]:
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Two single-document corpora: `text` is used to learn the vocabulary,
# `second_text` is later encoded with that vocabulary.
text = [
    "Hello, my name is Sinem and I am a student",
]
second_text = [
    "Who are you and are you a student too?",
]

In [3]:
# CountVectorizer converts a collection of text documents into a matrix of token counts,
# stored as a sparse representation.
vectorizer = CountVectorizer()

# Tokenize `text` and build the vocabulary (the token -> column-index mapping).
# As the last expression of the cell, the fitted vectorizer is also displayed.
vectorizer.fit(text)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [4]:
# Summarize: show the learned vocabulary. Each value is the token's column index
# in the count vector, not a count.
print(vectorizer.vocabulary_)

{'hello': 2, 'my': 4, 'name': 5, 'is': 3, 'sinem': 6, 'and': 1, 'am': 0, 'student': 7}


In [5]:
# Encode a new document with the already-fitted vocabulary: each column holds the count
# of one vocabulary token in `second_text`.
new_vector = vectorizer.transform(second_text)
# Convert the sparse result to a dense array so the counts can be printed.
print(new_vector.toarray())

[[0 1 0 0 0 0 0 1]]


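Only indices 1 and 7 are non-zero: the second sentence shares two words with the fitted vocabulary, 'and' (index 1) and 'student' (index 7). All other words in it were not seen during fitting and are therefore ignored.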

---

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [7]:
# Load the Flipkart product sample and wrap the first product description in a
# one-element list; it is passed to `vectorizer_test.fit` in a later cell.
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
DATA_PATH = '/Users/wolfsinem/product-tagging/data/flipkart_com-ecommerce_sample.csv'
df = pd.read_csv(DATA_PATH)

first_description = df.loc[0, 'description']
test_string = [first_description]
test_string

["Key Features of Alisha Solid Women's Cycling Shorts Cotton Lycra Navy, Red, Navy,Specifications of Alisha Solid Women's Cycling Shorts Shorts Details Number of Contents in Sales Package Pack of 3 Fabric Cotton Lycra Type Cycling Shorts General Details Pattern Solid Ideal For Women's Fabric Care Gentle Machine Wash in Lukewarm Water, Do Not Bleach Additional Details Style Code ALTHT_3P_21 In the Box 3 shorts"]

In [8]:
# Separate vectorizer for the product descriptions, configured with
# stop_words='english' so common English words are left out of the vocabulary.
vectorizer_test = CountVectorizer(stop_words='english')

In [9]:
# Learn the vocabulary from the first product description only (`test_string`).
vectorizer_test.fit(test_string)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words='english',
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [10]:
# Learned vocabulary: maps each token to its column index (not a count).
print(vectorizer_test.vocabulary_)

{'key': 16, 'features': 12, 'alisha': 1, 'solid': 28, 'women': 34, 'cycling': 9, 'shorts': 27, 'cotton': 8, 'lycra': 18, 'navy': 20, 'red': 25, 'specifications': 29, 'details': 10, 'number': 21, 'contents': 7, 'sales': 26, 'package': 23, 'pack': 22, 'fabric': 11, 'type': 31, 'general': 13, 'pattern': 24, 'ideal': 15, 'care': 5, 'gentle': 14, 'machine': 19, 'wash': 32, 'lukewarm': 17, 'water': 33, 'bleach': 3, 'additional': 0, 'style': 30, 'code': 6, 'altht_3p_21': 2, 'box': 4}


In [11]:
# List the vocabulary tokens ordered by column index.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement (requires scikit-learn >= 1.0).
# `.tolist()` turns the returned array back into a plain list of strings for display.
vectorizer_test.get_feature_names_out().tolist()

['additional',
 'alisha',
 'altht_3p_21',
 'bleach',
 'box',
 'care',
 'code',
 'contents',
 'cotton',
 'cycling',
 'details',
 'fabric',
 'features',
 'general',
 'gentle',
 'ideal',
 'key',
 'lukewarm',
 'lycra',
 'machine',
 'navy',
 'number',
 'pack',
 'package',
 'pattern',
 'red',
 'sales',
 'shorts',
 'solid',
 'specifications',
 'style',
 'type',
 'wash',
 'water',
 'women']

In [12]:
# Encode the second product description with the vocabulary that was fitted on the
# first description, then print the dense count vector.
second_description = df['description'][1]
test_string_2 = [second_description]

new_vector_test = vectorizer_test.transform(test_string_2)
dense_counts = new_vector_test.toarray()
print(dense_counts)

[[0 0 0 0 1 2 0 0 0 0 1 7 0 2 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 8 0 3 0]]


In [13]:
# Show the raw text of the second description (the one encoded as `test_string_2`).
df['description'][1]

"FabHomeDecor Fabric Double Sofa Bed (Finish Color - Leatherette Black Mechanism Type - Pull Out) Price: Rs. 22,646 • Fine deep seating experience • Save Space with the all new click clack Sofa Bed • Easy to fold and vice versa with simple click clack mechanism • Chrome legs with mango wood frame for long term durability • Double cushioned Sofa Bed to provide you with extra softness to make a fine seating experience • A double bed that can easily sleep two,Specifications of FabHomeDecor Fabric Double Sofa Bed (Finish Color - Leatherette Black Mechanism Type - Pull Out) Installation & Demo Installation & Demo Details Installation and demo for this product is done free of cost as part of this purchase. Our service partner will visit your location within 72 business hours from the delivery of the product. In The Box 1 Sofa Bed General Brand FabHomeDecor Mattress Included No Delivery Condition Knock Down Storage Included No Mechanism Type Pull Out Type Sofa Bed Style Contemporary & Modern 

In [14]:
# (1, 35): one encoded document by the 35 vocabulary terms learned from the FIRST
# description -- this is not the number of unique words in the second description.
new_vector_test.shape

(1, 35)

---

### Trying on multiple descriptions

In [15]:
# Build a small corpus from three product descriptions (row labels 0, 1 and 3).
# NOTE(review): row 2 is skipped -- confirm this is intentional.
selected_rows = (0, 1, 3)
documents = [df['description'][row] for row in selected_rows]

In [16]:
# Fit the vectorizer on the corpus and encode it in one step.
# The corpus must not be passed to the constructor: CountVectorizer's first parameter
# is `input` ('filename', 'file' or 'content'), so `CountVectorizer(documents, ...)`
# set input=documents by mistake. Older scikit-learn releases silently tolerated that;
# current releases make constructor parameters keyword-only and reject the call.
cv = CountVectorizer(stop_words='english')
count_vector = cv.fit_transform(documents)

In [17]:
# Show the resulting vocabulary. The numbers are not counts; each one is the token's
# column position in the sparse count vector.
cv.vocabulary_

{'key': 140,
 'features': 107,
 'alisha': 9,
 'solid': 237,
 'women': 277,
 'cycling': 65,
 'shorts': 228,
 'cotton': 60,
 'lycra': 156,
 'navy': 174,
 'red': 208,
 'specifications': 239,
 'details': 74,
 'number': 178,
 'contents': 57,
 'sales': 215,
 'package': 185,
 'pack': 184,
 'fabric': 105,
 'type': 258,
 'general': 121,
 'pattern': 188,
 'ideal': 134,
 'care': 35,
 'gentle': 122,
 'machine': 157,
 'wash': 272,
 'lukewarm': 155,
 'water': 273,
 'bleach': 25,
 'additional': 8,
 'style': 246,
 'code': 50,
 'altht_3p_21': 12,
 'box': 27,
 'fabhomedecor': 104,
 'double': 86,
 'sofa': 233,
 'bed': 23,
 'finish': 112,
 'color': 51,
 'leatherette': 145,
 'black': 24,
 'mechanism': 165,
 'pull': 204,
 'price': 196,
 'rs': 214,
 '22': 1,
 '646': 4,
 'fine': 111,
 'deep': 67,
 'seating': 220,
 'experience': 101,
 'save': 216,
 'space': 238,
 'new': 175,
 'click': 48,
 'clack': 42,
 'easy': 94,
 'fold': 117,
 'vice': 267,
 'versa': 266,
 'simple': 229,
 'chrome': 40,
 'legs': 147,
 'mango'

In [18]:
# (3, 280): 3 descriptions (rows) by 280 unique vocabulary tokens (columns),
# counted with the vectorizer's stop_words='english' setting.
count_vector.shape

(3, 280)

In [19]:
# List the vocabulary tokens ordered by column index.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement (requires scikit-learn >= 1.0).
# `.tolist()` turns the returned array back into a plain list of strings for display.
cv.get_feature_names_out().tolist()

['1905',
 '22',
 '24',
 '40',
 '646',
 '72',
 '838',
 '939',
 'additional',
 'alisha',
 'allowed',
 'altght_11',
 'altht_3p_21',
 'appears',
 'applying',
 'area',
 'assembled',
 'assembly',
 'attachment',
 'avoid',
 'away',
 'based',
 'bay',
 'bed',
 'black',
 'bleach',
 'booking',
 'box',
 'brand',
 'bristled',
 'bru',
 'brush',
 'business',
 'cancellation',
 'capacity',
 'care',
 'carpentry',
 'case',
 'cause',
 'check',
 'chrome',
 'civil',
 'clack',
 'clean',
 'cleaned',
 'cleaner',
 'cleaning',
 'clear',
 'click',
 'cloth',
 'code',
 'color',
 'colour',
 'compared',
 'concerned',
 'condition',
 'contemporary',
 'contents',
 'corners',
 'cost',
 'cotton',
 'cover',
 'covered',
 'covers',
 'cushioned',
 'cycling',
 'damage',
 'deep',
 'defects',
 'delivering',
 'delivery',
 'demo',
 'depth',
 'desired',
 'details',
 'differences',
 'dimensions',
 'direct',
 'directly',
 'dirt',
 'disclaimer',
 'discretion',
 'displayed',
 'does',
 'domestic',
 'door',
 'double',
 'drilling',
 'drink