In [27]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer # frequency based DTM
from sklearn.feature_extraction.text import TfidfVectorizer # tf-idf based DTM

In [28]:
# Toy corpus: each list entry is one whitespace-separated "document".
TEXT = [
    'banana apple apple eggplant',
    'orange carrot banana eggplant',
    'apple carrot banana banana',
    'orange banana grape',
]

# The same corpus flattened into a single string (documents joined by one
# space). Derived from TEXT instead of being typed out a second time, so the
# two constants cannot drift apart.
TXT = " ".join(TEXT)


# CountVectorizer 이용하기

In [29]:
# Count-based document-term matrix.
#   ngram_range=(1, 2): the vocabulary holds single words (unigrams) AND
#                       adjacent word pairs (bigrams); use (1, 1) for
#                       unigrams only.
#   min_df=1:           minimum document frequency, i.e. keep every term that
#                       appears in at least one document.
tf_vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    min_df=1,
)
tf_features = tf_vectorizer.fit_transform(TEXT)

In [30]:
# n 그램 이해
import nltk
from nltk import ngrams

def getNgramWord(N, txt):
    """Return every contiguous window of N whitespace-separated tokens in txt.

    Args:
        N: window size (number of tokens per n-gram); must be at least 1.
        txt: input string; it is tokenised with ``str.split()``.

    Returns:
        A list of n-grams in order of appearance, each itself a list of N
        token strings. The list is empty when txt has fewer than N tokens.

    Raises:
        ValueError: if N is smaller than 1 (a window of zero or negative
            width has no meaning and previously produced garbage slices).
    """
    if N < 1:
        raise ValueError("N must be a positive integer, got {!r}".format(N))

    # Use a separate local name: the parameter stays a str, and the result is
    # not called `ngrams`, which would shadow the name imported from nltk.
    tokens = txt.split()
    return [tokens[start:start + N] for start in range(len(tokens) - N + 1)]

# Print the sliding-window n-grams of the flattened corpus for
# n = 1 (unigram), 2 (bigram) and 3 (trigram).
for n in (1, 2, 3):
    print(f"{n}-gram : {getNgramWord(n, TXT)}")

1-gram : [['banana'], ['apple'], ['apple'], ['eggplant'], ['orange'], ['carrot'], ['banana'], ['eggplant'], ['apple'], ['carrot'], ['banana'], ['banana'], ['orange'], ['banana'], ['grape']]
2-gram : [['banana', 'apple'], ['apple', 'apple'], ['apple', 'eggplant'], ['eggplant', 'orange'], ['orange', 'carrot'], ['carrot', 'banana'], ['banana', 'eggplant'], ['eggplant', 'apple'], ['apple', 'carrot'], ['carrot', 'banana'], ['banana', 'banana'], ['banana', 'orange'], ['orange', 'banana'], ['banana', 'grape']]
3-gram : [['banana', 'apple', 'apple'], ['apple', 'apple', 'eggplant'], ['apple', 'eggplant', 'orange'], ['eggplant', 'orange', 'carrot'], ['orange', 'carrot', 'banana'], ['carrot', 'banana', 'eggplant'], ['banana', 'eggplant', 'apple'], ['eggplant', 'apple', 'carrot'], ['apple', 'carrot', 'banana'], ['carrot', 'banana', 'banana'], ['banana', 'banana', 'orange'], ['banana', 'orange', 'banana'], ['orange', 'banana', 'grape']]


In [31]:
# Dense count matrix as a plain ndarray. toarray() produces it directly,
# whereas todense() returns an np.matrix that then had to be re-wrapped.
features = tf_features.toarray()

# Euclidean distance between the count vector of document 1 and those of
# document 2 and document 0, respectively.
print(np.linalg.norm(features[1] - features[2]))
print(np.linalg.norm(features[1] - features[0]))

2.8284271247461903
3.4641016151377544


In [32]:
# Feature (n-gram) names taken from the fitted CountVectorizer; they are used
# as the column labels when the count matrix is tabulated.
feature_names = tf_vectorizer.get_feature_names_out()
feature_names

array(['apple', 'apple apple', 'apple carrot', 'apple eggplant', 'banana',
       'banana apple', 'banana banana', 'banana eggplant', 'banana grape',
       'carrot', 'carrot banana', 'eggplant', 'grape', 'orange',
       'orange banana', 'orange carrot'], dtype=object)

In [33]:
# Tabulate the raw counts with the n-gram names as column labels.
df = pd.DataFrame(
    features,
    columns=feature_names,
)
display(df)

Unnamed: 0,apple,apple apple,apple carrot,apple eggplant,banana,banana apple,banana banana,banana eggplant,banana grape,carrot,carrot banana,eggplant,grape,orange,orange banana,orange carrot
0,2,1,0,1,1,1,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,1,0,0,1,0,1,1,1,0,1,0,1
2,1,0,1,0,2,0,1,0,0,1,1,0,0,0,0,0
3,0,0,0,0,1,0,0,0,1,0,0,0,1,1,1,0


# TfidfVectorizer 이용하기

In [34]:
# TF-IDF weighted document-term matrix over single words only
# (ngram_range=(1, 1)); min_df=1 keeps every term that appears in at least
# one document.
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 1),
    min_df=1,
)
tfidf_features = tfidf_vectorizer.fit_transform(TEXT)

In [35]:
# Densify the sparse TF-IDF matrix into a plain ndarray.
# The guard makes this cell safe to re-run: once tfidf_features is already an
# ndarray it has no toarray()/todense() method, and the unguarded
# self-reassignment would raise AttributeError on a second execution.
if hasattr(tfidf_features, "toarray"):
    tfidf_features = tfidf_features.toarray()

In [36]:
# Tabulate the TF-IDF weights with the vectorizer's feature names as
# column labels.
tfidf_columns = tfidf_vectorizer.get_feature_names_out()
df = pd.DataFrame(tfidf_features, columns=tfidf_columns)
display(df)

Unnamed: 0,apple,banana,carrot,eggplant,grape,orange
0,0.857643,0.283833,0.0,0.428821,0.0,0.0
1,0.0,0.356966,0.539313,0.539313,0.0,0.539313
2,0.516233,0.683379,0.516233,0.0,0.0,0.0
3,0.0,0.379192,0.0,0.0,0.726641,0.572892
