### Bag of Words & Stop Word Filtering

In [1]:
# Bag of Words & Stop Word Filtering

# Dataset: a toy corpus of three one-sentence documents about Linux.
# Each list element is passed to the vectorizers below as one document.
corpus = [
    'Linux has been around since the mid-1990s.',                # document 1
    'Linux distributions include the Linux kernel.',             # document 2
    'Linux is one of the most prominent open-source software.',  # document 3
]

# Echo the corpus as the cell output.
corpus

['Linux has been around since the mid-1990s.',
 'Linux distributions include the Linux kernel.',
 'Linux is one of the most prominent open-source software.']

In [2]:
# Bag of Words model with CountVectorizer

from sklearn.feature_extraction.text import CountVectorizer

# Default configuration: no stop-word list is supplied in this cell.
vectorizer = CountVectorizer()

# Fit the vectorizer on the corpus and count terms per document in one step,
# then densify the result so the whole document-term matrix can be displayed.
bow_counts = vectorizer.fit_transform(corpus)
vectorized_X = bow_counts.todense()

# One row per document, one column per vocabulary term.
vectorized_X

matrix([[1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1],
        [0, 0, 0, 1, 0, 1, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
        [0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1]],
       dtype=int64)

In [3]:
# Vocabulary learned by the fitted vectorizer. The printed order lines up with
# the columns of vectorized_X (e.g. 'linux' is the only term counted twice in
# one document, matching the single 2 in the matrix above).
feature_names = vectorizer.get_feature_names_out()
# print() gives the plain-text array form rather than the rich repr.
print(feature_names)

['1990s' 'around' 'been' 'distributions' 'has' 'include' 'is' 'kernel'
 'linux' 'mid' 'most' 'of' 'one' 'open' 'prominent' 'since' 'software'
 'source' 'the']


In [4]:
# Euclidean distance to measure closeness/distance between documents (vectors)

from itertools import combinations

import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

# Convert once to a plain 2-D ndarray. vectorized_X comes from .todense(), which
# yields an np.matrix; np.asarray keeps the 2-D shape and also leaves an
# ndarray input untouched, so this cell works with either representation.
document_vectors = np.asarray(vectorized_X)

# A single call computes the full symmetric document-by-document distance
# matrix, instead of invoking euclidean_distances separately for every pair.
distance_matrix = euclidean_distances(document_vectors)

# combinations(range(n), 2) yields each unordered pair (i, j) with i < j exactly
# once, in the same order as the nested-loop-with-filter it replaces:
# (0, 1), (0, 2), (1, 2), ...
for i, j in combinations(range(len(document_vectors)), 2):
    # Documents are reported 1-based to match how they are referred to in prose.
    print(f'Jarak dokumen {i+1} dan {j+1}: {distance_matrix[i, j]}')

Jarak dokumen 1 dan 2: 3.1622776601683795
Jarak dokumen 1 dan 3: 3.7416573867739413
Jarak dokumen 2 dan 3: 3.4641016151377544


In [5]:
# Stop word filtering on text

# Dataset: the same corpus defined in the first cell, echoed again for reference

corpus

['Linux has been around since the mid-1990s.',
 'Linux distributions include the Linux kernel.',
 'Linux is one of the most prominent open-source software.']

In [6]:
# Stop word filtering with CountVectorizer

from sklearn.feature_extraction.text import CountVectorizer

# stop_words='english' asks the vectorizer to leave English stop words out of
# the vocabulary. Note: this rebinds `vectorizer` and `vectorized_X` from the
# earlier Bag of Words cell, so cells that read them see the filtered version
# from here on.
vectorizer = CountVectorizer(stop_words='english')

# Fit on the corpus and count the remaining terms per document, then densify
# the result so the whole document-term matrix can be displayed.
filtered_counts = vectorizer.fit_transform(corpus)
vectorized_X = filtered_counts.todense()

# One row per document, one column per remaining vocabulary term.
vectorized_X

matrix([[1, 0, 0, 0, 1, 1, 0, 0, 0, 0],
        [0, 1, 1, 1, 2, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 0, 1, 1, 1, 1]], dtype=int64)

In [7]:
# Vocabulary of whichever vectorizer is currently bound to `vectorizer`; when
# the notebook is run top to bottom this is the stop-word-filtered one from the
# previous cell, so words such as 'the', 'is' and 'of' no longer appear.
feature_names = vectorizer.get_feature_names_out()
# print() gives the plain-text array form rather than the rich repr.
print(feature_names)

['1990s' 'distributions' 'include' 'kernel' 'linux' 'mid' 'open'
 'prominent' 'software' 'source']
