In [0]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import pandas as pd
import numpy as np
import re
import string 
from collections import Counter

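Objective: compare a hand-written TF-IDF computation (built with pandas and numpy) with the result of scikit-learn's `CountVectorizer` + `TfidfTransformer` on the same toy corpus.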

### 1.   Training vectors:
*   (n_samples, n_features)
*   n_samples is the number of samples and n_features is the number of features. 

### 2.   Label values: 
*   (n_samples,)
*   n_samples is the number of samples







In [2]:
# Toy corpus: four short sentences, two per class.
text = ["My daughter is smart, my daughter is cute", 
        "I have a beautiful daughter",
        "it is very fluffy", 
        "it has a fluffy short tail"]

# One label per sentence, in the same order as `text`.
y_train = ["human", "human", "animal", "animal"]
print(np.shape(text), np.shape(y_train))
# If the training data is passed as a list of raw strings, fitting raises
# errors; the documents must first be converted to a numeric representation.
# clf = MultinomialNB()
# clf.fit(document, label)

(4,) (4,)


In [3]:
# Pipeline: raw sentences -> token counts -> TF-IDF weights -> naive Bayes.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(text)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

clf = MultinomialNB()
clf.fit(X_train_tfidf, y_train)

# The classifier was fitted on TF-IDF features, so a new document has to pass
# through BOTH fitted transformers (transform only, never fit_transform);
# feeding raw counts would score it in a different feature space.
clf.predict(
    tfidf_transformer.transform(
        count_vect.transform(["it is fluffy short tail"])
    )
)

array(['animal'], dtype='<U6')

In [0]:
def create_vocabulary(text):
    """Collect every distinct word of ``text`` as a key with an empty list.

    Each sentence has all ``string.punctuation`` characters removed and is then
    split on whitespace. Words are case-sensitive ("My" and "my" are separate
    keys) and keys appear in order of first occurrence. The resulting dict is
    printed before being returned.

    Args:
        text: iterable of sentence strings.

    Returns:
        dict mapping each distinct word to a new empty list.
    """
    # re.escape keeps characters such as "\", "]", "^" and "-" literal inside
    # the character class. Unescaped, the "\]" pair in string.punctuation is
    # parsed as an escaped "]", so backslashes were never stripped.
    punctuation = re.compile('[' + re.escape(string.punctuation) + ']')

    corpus = {}
    for sent in text:
        for word in punctuation.sub('', sent).split():
            corpus.setdefault(word, [])
    print(corpus)
    return corpus

In [0]:
def create_word_counts(text, corpus):
    """Build a sentence-by-word count table for the words in ``corpus``.

    Each sentence has all ``string.punctuation`` characters removed and is split
    on whitespace; the occurrences of every vocabulary word are then counted.
    The distinct words of each sentence are printed as they are processed.

    Args:
        text: iterable of sentence strings.
        corpus: mapping whose keys are the vocabulary words (e.g. the dict
            returned by ``create_vocabulary``). Only the keys are read; the
            mapping is left untouched, so the function can be called repeatedly
            with the same vocabulary and always yields one row per sentence.

    Returns:
        pandas.DataFrame with one row per sentence and one column per
        vocabulary word (in ``corpus`` key order) holding the word counts.
    """
    # Escape the punctuation so "\", "]", "^" and "-" stay literal inside the
    # character class (unescaped, "\]" swallows the backslash).
    punctuation = re.compile('[' + re.escape(string.punctuation) + ']')

    # Fresh column lists: appending into the caller's lists would make every
    # re-run stack another copy of the rows onto the previous ones.
    columns = {word: [] for word in corpus}

    for sent in text:
        counts = Counter(punctuation.sub('', sent).split())
        print(counts.keys())
        for word, column in columns.items():
            # Counter returns 0 for words absent from this sentence.
            column.append(counts[word])

    return pd.DataFrame(columns)

In [6]:
# Build the vocabulary from the toy corpus, then count how often each
# vocabulary word occurs in every sentence (one row per sentence).
corpus = create_vocabulary(text)
df_counts = create_word_counts(text, corpus)
df_counts.head(5)

{'My': [], 'daughter': [], 'is': [], 'smart': [], 'my': [], 'cute': [], 'I': [], 'have': [], 'a': [], 'beautiful': [], 'it': [], 'very': [], 'fluffy': [], 'has': [], 'short': [], 'tail': []}
dict_keys(['My', 'daughter', 'is', 'smart', 'my', 'cute'])
dict_keys(['I', 'have', 'a', 'beautiful', 'daughter'])
dict_keys(['it', 'is', 'very', 'fluffy'])
dict_keys(['it', 'has', 'a', 'fluffy', 'short', 'tail'])


Unnamed: 0,My,daughter,is,smart,my,cute,I,have,a,beautiful,it,very,fluffy,has,short,tail
0,1,2,2,1,1,1,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,1,1,1,1,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,1,1,1,0,0,0
3,0,0,0,0,0,0,0,0,1,0,1,0,1,1,1,1


In [7]:
# TF = (Number of times the word appears in a document)/(Number of words in the document)
def create_tf(df_counts, decimals=2):
    """Convert a word-count table into term frequencies.

    Every count is divided by the total number of counted words in its row
    (document), so each non-empty row sums to 1 before rounding.

    Args:
        df_counts: DataFrame of word counts, one row per document and one
            column per word.
        decimals: number of decimal places to round to. Pass ``None`` to keep
            full precision, which avoids carrying rounding error into later
            computations such as TF-IDF.

    Returns:
        DataFrame shaped like ``df_counts`` holding the term frequencies.
        A document without any counted word gets 0.0 everywhere.
    """
    row_sum = df_counts.sum(axis=1)
    # Mask zero totals before dividing: 0/0 would otherwise turn the whole row
    # of an empty document into NaN. Those masked rows are filled with 0.0.
    df_tf = df_counts.div(row_sum.where(row_sum != 0), axis=0).fillna(0.0)
    if decimals is not None:
        df_tf = df_tf.round(decimals)
    return df_tf
 
# Term frequencies of the toy corpus, one row per sentence.
df_tf = create_tf(df_counts)
df_tf.head(5)

Unnamed: 0,My,daughter,is,smart,my,cute,I,have,a,beautiful,it,very,fluffy,has,short,tail
0,0.12,0.25,0.25,0.12,0.12,0.12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.2,0.0,0.0,0.0,0.0,0.2,0.2,0.2,0.2,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.25,0.25,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.17,0.0,0.17,0.0,0.17,0.17,0.17,0.17


In [14]:
# IDF = log(N/n), where, N is the number of documents and n is the number of documents a word t has appeared in.
# if a word shows up in all the documents, then log(N/n)=log(1)=0, if word shows up in only 1 document, then log(N/1)=log(N), relatively bigger value. 
def create_tfidf(df_counts, df_tf, decimals=4):
    """Weight term frequencies by inverse document frequency.

    Uses ``idf = log(N / n)``, where ``N`` is the number of documents (rows of
    ``df_counts``) and ``n`` is the number of documents with a non-zero count
    for the word.

    Args:
        df_counts: DataFrame of word counts, one row per document and one
            column per word.
        df_tf: DataFrame of term frequencies with the same columns.
        decimals: number of decimal places to round to, or ``None`` to keep
            full precision.

    Returns:
        DataFrame shaped like ``df_tf`` holding ``tf * idf``. Words that occur
        in no document get 0.0.
    """
    doc_amount = df_counts.shape[0]
    doc_freq = (df_counts != 0).sum()
    # A word with zero document frequency would give log(N / 0) = inf, and
    # 0 * inf is NaN. Mask those columns and assign them an idf of 0 instead.
    idf = np.log(doc_amount / doc_freq.where(doc_freq > 0)).fillna(0.0)
    tfidf = df_tf.mul(idf, axis=1)
    if decimals is not None:
        tfidf = tfidf.round(decimals)
    return tfidf
# Display the hand-computed TF-IDF table for the toy corpus.
create_tfidf(df_counts, df_tf)



Unnamed: 0,My,daughter,is,smart,my,cute,I,have,a,beautiful,it,very,fluffy,has,short,tail
0,0.1664,0.1733,0.1733,0.1664,0.1664,0.1664,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.1386,0.0,0.0,0.0,0.0,0.2773,0.2773,0.1386,0.2773,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.1733,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1733,0.3466,0.1733,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1178,0.0,0.1178,0.0,0.1178,0.2357,0.2357,0.2357


In [15]:
# Run the same corpus through scikit-learn's count -> TF-IDF pipeline so its
# result can be compared with the hand-computed table.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(text)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# (documents, vocabulary terms)
X_train_tfidf.shape

(4, 13)

In [16]:
# Term -> column-index mapping learned by CountVectorizer. It holds 13 terms
# versus the 16 columns of the hand-built table: in the output below all terms
# are lower-case ('My'/'my' collapse into 'my') and the single-character
# words 'I' and 'a' are absent.
count_vect.vocabulary_

{'beautiful': 0,
 'cute': 1,
 'daughter': 2,
 'fluffy': 3,
 'has': 4,
 'have': 5,
 'is': 6,
 'it': 7,
 'my': 8,
 'short': 9,
 'smart': 10,
 'tail': 11,
 'very': 12}