In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

### 1. One-hot encoding

In [2]:
# Sample sentence to encode; the leading space is harmless because
# str.split() with no arguments ignores leading/trailing whitespace.
text = " I love NLP very much"

In [3]:
# One-hot encode the whitespace-separated tokens: one row per token in the
# sentence, one indicator column per distinct token.
pd.get_dummies(text.split())

Unnamed: 0,I,NLP,love,much,very
0,True,False,False,False,False
1,False,False,True,False,False
2,False,True,False,False,False
3,False,False,False,False,True
4,False,False,False,True,False


In [17]:
# Toy corpus: three short documents.
corpus = [
    'I love NLP, NLP love me',
    'I will learn NLP in 2month',
    'nlp is important. NLP is future.'
]

# binary=True records only whether a term occurs in a document (0/1),
# not how many times it occurs.
# With the default settings the text is lowercased and single-character
# tokens such as "I" are dropped, so they never appear in the vocabulary.
vectorizer = CountVectorizer(binary=True)
X = vectorizer.fit_transform(corpus)

In [14]:
# Convert the document-term matrix to a dense array for display
# (one row per document, one column per vocabulary term).
X.toarray()

array([[0, 0, 0, 0, 0, 0, 1, 1, 1, 0],
       [1, 0, 0, 1, 0, 1, 0, 0, 1, 1],
       [0, 1, 1, 0, 1, 0, 0, 0, 1, 0]])

In [15]:
# Inspect the vocabulary: each value is the column index of that term
# in the document vectors.
print(vectorizer.vocabulary_)

{'love': 6, 'nlp': 8, 'me': 7, 'will': 9, 'learn': 5, 'in': 3, '2month': 0, 'is': 4, 'important': 2, 'future': 1}


In [7]:
# Label the matrix for readability: columns are the vocabulary terms,
# rows are indexed by the original documents.
df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out(), index=corpus)
df

Unnamed: 0,2month,future,important,in,is,learn,love,me,nlp,will
"I love NLP, NLP love me",0,0,0,0,0,0,1,1,1,0
I will learn NLP in 2month,1,0,0,1,0,1,0,0,1,1
nlp is important. NLP is future.,0,1,1,0,1,0,0,0,1,0


### 2. Count vectorization

In [8]:
# Same toy corpus as in the one-hot section.
corpus = [
    'I love NLP, NLP love me',
    'I will learn NLP in 2month',
    'nlp is important. NLP is future.'
]

# Without binary=True the vectorizer keeps raw term counts, so a term
# repeated within a document gets a value greater than 1.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

# Dense view of the count matrix (rows: documents, columns: terms).
X.toarray()

array([[0, 0, 0, 0, 0, 0, 2, 1, 2, 0],
       [1, 0, 0, 1, 0, 1, 0, 0, 1, 1],
       [0, 1, 1, 0, 2, 0, 0, 0, 2, 0]])

In [9]:
# Label the count matrix: columns are the vocabulary terms,
# rows are indexed by the original documents.
df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out(), index=corpus)
df

Unnamed: 0,2month,future,important,in,is,learn,love,me,nlp,will
"I love NLP, NLP love me",0,0,0,0,0,0,2,1,2,0
I will learn NLP in 2month,1,0,0,1,0,1,0,0,1,1
nlp is important. NLP is future.,0,1,1,0,2,0,0,0,2,0


### 3. N-grams

In [11]:
# Same toy corpus as in the previous sections.
corpus = [
    'I love NLP, NLP love me',
    'I will learn NLP in 2month',
    'nlp is important. NLP is future.'
]

# Build vectors from 2-grams only (each pair of two consecutive words).
# Note: pairs are formed across punctuation, e.g. "important nlp" spans
# the sentence boundary in the third document.
vectorizer = CountVectorizer(ngram_range=(2, 2))
X = vectorizer.fit_transform(corpus)

# Dense view of the bigram count matrix (rows: documents, columns: bigrams).
X.toarray()

array([[0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0],
       [0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1],
       [1, 0, 1, 1, 0, 0, 0, 0, 2, 0, 0, 0]])

In [12]:
# Label the bigram matrix: columns are the extracted bigrams,
# rows are indexed by the original documents.
df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out(), index=corpus)
df

Unnamed: 0,important nlp,in 2month,is future,is important,learn nlp,love me,love nlp,nlp in,nlp is,nlp love,nlp nlp,will learn
"I love NLP, NLP love me",0,0,0,0,0,1,1,0,0,1,1,0
I will learn NLP in 2month,0,1,0,0,1,0,0,1,0,0,0,1
nlp is important. NLP is future.,1,0,1,1,0,0,0,0,2,0,0,0


### 4. Co-occurrence matrix (ma trận đồng xuất hiện)

In [19]:
import nltk
from nltk import bigrams
import itertools

In [22]:
# Two pre-tokenized sentences that differ only in their final word.
sentences = [
    ['Mary', 'is', 'not', 'fat'],
    ['Mary', 'is', 'not', 'thin'],
]
# Flatten into a single token stream, keeping sentence and word order.
merged_words = [token for sentence in sentences for token in sentence]

In [23]:
# Show the flattened token list.
merged_words

['Mary', 'is', 'not', 'fat', 'Mary', 'is', 'not', 'thin']

In [26]:
def co_occurrence_matrix(corpus):
    """Count how often each word is immediately followed by each other word.

    Args:
        corpus: Iterable of hashable tokens (e.g. a flat list of words).
            It is materialized once, so a one-shot iterator is accepted.

    Returns:
        pandas.DataFrame: Square table whose rows and columns are the distinct
        tokens in order of first appearance. Cell ``[first, second]`` holds
        the number of bigrams ``(first, second)`` found in ``corpus``, i.e.
        rows are the leading word and columns the following word.

    Side effects:
        Prints the list of extracted bigrams.
    """
    tokens = list(corpus)

    # dict.fromkeys de-duplicates while keeping first-appearance order, so the
    # word -> index mapping is identical on every run (plain set iteration
    # order for strings can change between interpreter sessions).
    vocabularies = list(dict.fromkeys(tokens))
    vocab_to_index = {word: i for i, word in enumerate(vocabularies)}

    bi_grams = list(bigrams(tokens))
    print('bigrams:', bi_grams)

    # Tally every adjacent pair into a vocab x vocab grid of counts.
    size = len(vocabularies)
    counts = [[0] * size for _ in range(size)]
    for first, second in bi_grams:
        counts[vocab_to_index[first]][vocab_to_index[second]] += 1

    return pd.DataFrame(counts, index=vocabularies, columns=vocabularies)

In [27]:
# Run the function on the flattened token list.
co_occurrence_matrix(merged_words)

bigrams: [('Mary', 'is'), ('is', 'not'), ('not', 'fat'), ('fat', 'Mary'), ('Mary', 'is'), ('is', 'not'), ('not', 'thin')]
