## Frequency-Based Text Vectorization

Metode yang digunakan untuk mengonversikan teks ke dalam bentuk vektor numerik berdasarkan frekuensi kemunculan kata-kata dalam teks tersebut.


1. Count Vectorizer

Pada latihan kali ini, library yang digunakan adalah library dari scikit-learn yang bisa diinstall dengan `pip install scikit-learn`

In [None]:
# !pip install scikit-learn
!pip install nlp_id

In [33]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from nlp_id import StopWord

# Build the stop-word list provided by nlp_id's StopWord helper.
stopword = StopWord()
stop_words = stopword.get_stopword()
# Print the whole list for inspection.
print(stop_words)

['ada', 'adalagi', 'adalah', 'adanya', 'adapun', 'agak', 'agak-agak', 'agaknya', 'agar', 'aja', 'akan', 'akankah', 'akankan', 'akhir', 'akhir-akhirnya', 'akhirannya', 'akhiri', 'akhirinya', 'akhirnya', 'aku', 'akulah', 'akunya', 'akurat', 'akutu', 'ala', 'alamak', 'alhamdulillah', 'alhasil', 'amat', 'amat-amat', 'amatlah', 'anda', 'anda-anda', 'andai', 'andalah', 'antar', 'antara', 'antaranya', 'apa', 'apa-apanya', 'apaan', 'apabila', 'apaitu', 'apakah', 'apalagi', 'apanya', 'apasaja', 'apatah', 'apesnya', 'arah', 'artinya', 'asal', 'asalkan', 'asumsinya', 'asumsikan', 'atas', 'atas-atas', 'atas-batas', 'ataspun', 'atau', 'ataukah', 'ataupun', 'awal', 'awalnya', 'bagai', 'bagaikan', 'bagaimana', 'bagaimanakah', 'bagaimanapun', 'bagi', 'bagi-bagi', 'bagian', 'bagikan', 'bahasakan', 'bahasanya', 'bahkan', 'bahwa', 'bahwasanya', 'baik', 'bak', 'bakal', 'bakal-bakal', 'bakalan', 'balik', 'balikan', 'balikkan', 'banget', 'banget-banget', 'banyak', 'barangkali', 'baru', 'baru-baru', 'bawah',

In [9]:
dokumen = ["Pengertian Lagu Wajib beserta Ciri Ciri Lirik dan Contohnya"]

In [10]:
vectorizer = CountVectorizer()
count_vector = vectorizer.fit_transform(dokumen)

In [11]:
# Show the learned vocabulary. The numbers are not counts: each one is the
# term's column position in the sparse vector.
vectorizer.vocabulary_

{'pengertian': 6,
 'lagu': 4,
 'wajib': 7,
 'beserta': 0,
 'ciri': 1,
 'lirik': 5,
 'dan': 3,
 'contohnya': 2}

In [12]:
# Shape of the matrix: 1 document, 8 unique words.
count_vector.shape

(1, 8)

In [13]:
# Contents of the vectorized text: one "(row, column)  count" line per stored entry.
print(count_vector)

  (0, 6)	1
  (0, 4)	1
  (0, 7)	1
  (0, 0)	1
  (0, 1)	2
  (0, 5)	1
  (0, 3)	1
  (0, 2)	1


In [14]:
# Convert the sparse counts to a dense array and label the columns with the vocabulary terms.
count_array = count_vector.toarray()
df = pd.DataFrame(data=count_array, columns = vectorizer.get_feature_names_out()) # current scikit-learn API; the older get_feature_names() was deprecated in 1.0 and removed in 1.2
df

Unnamed: 0,beserta,ciri,contohnya,dan,lagu,lirik,pengertian,wajib
0,1,2,1,1,1,1,1,1


In [15]:
# A larger corpus: five short Indonesian titles, one document per string.
data = [
    "Pengertian Lagu Wajib beserta Ciri Lirik dan Contohnya",
    "Lirik Lagu Indonesia Raya",
    "Penjelasan Mengenai Metronome dan Fungsinya dalam Musik",
    "Ulasan tentang Ekonomi Kreatif beserta Ciri-Ciri dan Manfaatnya",
    "Ragam Kerajinan Khas yang Terdapat di Indonesia"
]

In [16]:
vectorizer = CountVectorizer()
count_vector = vectorizer.fit_transform(data)

In [17]:
vectorizer.vocabulary_

{'pengertian': 18,
 'lagu': 12,
 'wajib': 25,
 'beserta': 0,
 'ciri': 1,
 'lirik': 13,
 'dan': 4,
 'contohnya': 2,
 'indonesia': 8,
 'raya': 21,
 'penjelasan': 19,
 'mengenai': 15,
 'metronome': 16,
 'fungsinya': 7,
 'dalam': 3,
 'musik': 17,
 'ulasan': 24,
 'tentang': 22,
 'ekonomi': 6,
 'kreatif': 11,
 'manfaatnya': 14,
 'ragam': 20,
 'kerajinan': 9,
 'khas': 10,
 'yang': 26,
 'terdapat': 23,
 'di': 5}

In [24]:
print(count_vector)

  (0, 17)	1
  (0, 11)	1
  (0, 24)	1
  (0, 0)	1
  (0, 1)	1
  (0, 12)	1
  (0, 2)	1
  (1, 11)	1
  (1, 12)	1
  (1, 7)	1
  (1, 20)	1
  (2, 18)	1
  (2, 14)	1
  (2, 15)	1
  (2, 6)	1
  (2, 3)	1
  (2, 16)	1
  (3, 0)	1
  (3, 1)	2
  (3, 23)	1
  (3, 21)	1
  (3, 5)	1
  (3, 10)	1
  (3, 13)	1
  (4, 7)	1
  (4, 19)	1
  (4, 8)	1
  (4, 9)	1
  (4, 22)	1
  (4, 4)	1


In [18]:
count_vector.shape

(5, 27)

In [19]:
# Dense document-term table: one row per document, one column per vocabulary term.
count_array = count_vector.toarray()
df = pd.DataFrame(data=count_array, columns = vectorizer.get_feature_names_out()) # current scikit-learn API; the older get_feature_names() was deprecated in 1.0 and removed in 1.2
df

Unnamed: 0,beserta,ciri,contohnya,dalam,dan,di,ekonomi,fungsinya,indonesia,kerajinan,...,musik,pengertian,penjelasan,ragam,raya,tentang,terdapat,ulasan,wajib,yang
0,1,1,1,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,1,1,0,0,1,0,0,...,1,0,1,0,0,0,0,0,0,0
3,1,2,0,0,1,0,1,0,0,0,...,0,0,0,0,0,1,0,1,0,0
4,0,0,0,0,0,1,0,0,1,1,...,0,0,0,1,0,0,1,0,0,1




* Customize Count Vectorizer

In [20]:
# Use a small hand-picked stop-word list for this example.
# It gets its own name so that the nlp_id `stop_words` list loaded at the top of
# the notebook is not overwritten: later cells pass `stop_words` to their
# vectorizers, and their recorded vocabularies were produced with the nlp_id list.
custom_stop_words = ["dan", "yang"]

vectorizer = CountVectorizer(stop_words=custom_stop_words)
count_vector = vectorizer.fit_transform(data)
count_vector.shape

(5, 25)

In [23]:
vectorizer.vocabulary_

{'pengertian': 17,
 'lagu': 11,
 'wajib': 24,
 'beserta': 0,
 'ciri': 1,
 'lirik': 12,
 'contohnya': 2,
 'indonesia': 7,
 'raya': 20,
 'penjelasan': 18,
 'mengenai': 14,
 'metronome': 15,
 'fungsinya': 6,
 'dalam': 3,
 'musik': 16,
 'ulasan': 23,
 'tentang': 21,
 'ekonomi': 5,
 'kreatif': 10,
 'manfaatnya': 13,
 'ragam': 19,
 'kerajinan': 8,
 'khas': 9,
 'terdapat': 22,
 'di': 4}

In [None]:
print(count_vector)

In [26]:
long_text = ["Pengolahan bahasa alami (disingkat PBA; bahasa Inggris: natural language processing, disingkat NLP) adalah cabang ilmu komputer, \
linguistik, dan kecerdasan buatan yang mengkaji interaksi antara komputer dan bahasa (alami) manusia, khususnya cara memprogram komputer untuk mengolah data bahasa alami \
dalam jumlah besar. Hasilnya adalah komputer mampu 'memahami' isi dokumen, termasuk nuansa bahasa di dalamnya. Dengan ini, komputer dapat dengan akurat mengambil informasi \
dan wawasan dari dokumen sekaligus mengelompokkan dan menata dokumen-dokumen itu sendiri. \
Kajian NLP antara lain mencakup segmentasi wicara, segmentasi teks, penandaan kelas kata, dan pengawataksaan makna. Meski kajiannya dapat mencakup teks dan wicara, \
pengolahan wicara telah berkembang menjadi suatu bidang kajian terpisah."]

In [34]:
# Vectorize the long paragraph, dropping every word found in `stop_words`.
# NOTE(review): the vocabulary recorded below lacks words such as "adalah" and
# "untuk", so it was presumably produced with the nlp_id stop-word list rather than
# a two-word list — confirm which list `stop_words` holds when re-running top to bottom.
vectorizer = CountVectorizer(stop_words = stop_words)
count_vector = vectorizer.fit_transform(long_text)
count_vector.shape



(1, 47)

In [35]:
print(count_vector)

  (0, 40)	2
  (0, 1)	5
  (0, 0)	3
  (0, 9)	2
  (0, 37)	1
  (0, 14)	1
  (0, 34)	1
  (0, 22)	1
  (0, 41)	1
  (0, 35)	2
  (0, 6)	1
  (0, 12)	1
  (0, 21)	5
  (0, 23)	1
  (0, 19)	1
  (0, 5)	1
  (0, 32)	1
  (0, 15)	1
  (0, 25)	1
  (0, 27)	1
  (0, 33)	1
  (0, 8)	1
  (0, 3)	1
  (0, 11)	1
  (0, 26)	1
  (0, 16)	1
  (0, 10)	4
  (0, 36)	1
  (0, 7)	1
  (0, 30)	1
  (0, 13)	1
  (0, 45)	1
  (0, 31)	1
  (0, 28)	1
  (0, 17)	2
  (0, 29)	2
  (0, 42)	2
  (0, 46)	3
  (0, 43)	2
  (0, 38)	1
  (0, 20)	1
  (0, 39)	1
  (0, 24)	1
  (0, 18)	1
  (0, 2)	1
  (0, 4)	1
  (0, 44)	1


In [36]:
vectorizer.vocabulary_

{'pengolahan': 40,
 'bahasa': 1,
 'alami': 0,
 'disingkat': 9,
 'pba': 37,
 'inggris': 14,
 'natural': 34,
 'language': 22,
 'processing': 41,
 'nlp': 35,
 'cabang': 6,
 'ilmu': 12,
 'komputer': 21,
 'linguistik': 23,
 'kecerdasan': 19,
 'buatan': 5,
 'mengkaji': 32,
 'interaksi': 15,
 'manusia': 25,
 'memprogram': 27,
 'mengolah': 33,
 'data': 8,
 'besar': 3,
 'hasilnya': 11,
 'memahami': 26,
 'isi': 16,
 'dokumen': 10,
 'nuansa': 36,
 'dalamnya': 7,
 'mengambil': 30,
 'informasi': 13,
 'wawasan': 45,
 'mengelompokkan': 31,
 'menata': 28,
 'kajian': 17,
 'mencakup': 29,
 'segmentasi': 42,
 'wicara': 46,
 'teks': 43,
 'penandaan': 38,
 'kelas': 20,
 'pengawataksaan': 39,
 'makna': 24,
 'kajiannya': 18,
 'berkembang': 2,
 'bidang': 4,
 'terpisah': 44}

In [None]:
# Use min_df as an additional, frequency-based stop-word filter:
# ignore terms that appear in fewer than 25% of the documents.
# NOTE: min_df is measured across documents. `long_text` holds a single document,
# so every term appears in 100% of the documents and nothing is filtered out.

In [42]:
vectorizer = CountVectorizer(stop_words=stop_words, min_df=0.25)
count_vector=vectorizer.fit_transform(long_text)
count_vector.shape

(1, 47)

In [59]:
print(count_vector)

  (0, 40)	2
  (0, 1)	5
  (0, 0)	3
  (0, 9)	2
  (0, 37)	1
  (0, 14)	1
  (0, 34)	1
  (0, 22)	1
  (0, 41)	1
  (0, 35)	2
  (0, 6)	1
  (0, 12)	1
  (0, 21)	5
  (0, 23)	1
  (0, 19)	1
  (0, 5)	1
  (0, 32)	1
  (0, 15)	1
  (0, 25)	1
  (0, 27)	1
  (0, 33)	1
  (0, 8)	1
  (0, 3)	1
  (0, 11)	1
  (0, 26)	1
  (0, 16)	1
  (0, 10)	4
  (0, 36)	1
  (0, 7)	1
  (0, 30)	1
  (0, 13)	1
  (0, 45)	1
  (0, 31)	1
  (0, 28)	1
  (0, 17)	2
  (0, 29)	2
  (0, 42)	2
  (0, 46)	3
  (0, 43)	2
  (0, 38)	1
  (0, 20)	1
  (0, 39)	1
  (0, 24)	1
  (0, 18)	1
  (0, 2)	1
  (0, 4)	1
  (0, 44)	1


In [43]:
vectorizer.vocabulary_

{'pengolahan': 40,
 'bahasa': 1,
 'alami': 0,
 'disingkat': 9,
 'pba': 37,
 'inggris': 14,
 'natural': 34,
 'language': 22,
 'processing': 41,
 'nlp': 35,
 'cabang': 6,
 'ilmu': 12,
 'komputer': 21,
 'linguistik': 23,
 'kecerdasan': 19,
 'buatan': 5,
 'mengkaji': 32,
 'interaksi': 15,
 'manusia': 25,
 'memprogram': 27,
 'mengolah': 33,
 'data': 8,
 'besar': 3,
 'hasilnya': 11,
 'memahami': 26,
 'isi': 16,
 'dokumen': 10,
 'nuansa': 36,
 'dalamnya': 7,
 'mengambil': 30,
 'informasi': 13,
 'wawasan': 45,
 'mengelompokkan': 31,
 'menata': 28,
 'kajian': 17,
 'mencakup': 29,
 'segmentasi': 42,
 'wicara': 46,
 'teks': 43,
 'penandaan': 38,
 'kelas': 20,
 'pengawataksaan': 39,
 'makna': 24,
 'kajiannya': 18,
 'berkembang': 2,
 'bidang': 4,
 'terpisah': 44}

Frekuensi paling tinggi pada count vector dimiliki kata "bahasa" dan "komputer" (masing-masing muncul 5 kali).

2. TF-IDF

In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [45]:
dokumen = ["Pengertian Lagu Wajib beserta Ciri Ciri Lirik dan Contohnya"]

In [46]:
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(dokumen)

In [47]:
vectorizer.vocabulary_

{'pengertian': 6,
 'lagu': 4,
 'wajib': 7,
 'beserta': 0,
 'ciri': 1,
 'lirik': 5,
 'dan': 3,
 'contohnya': 2}

In [48]:
print(tfidf)

  (0, 2)	0.30151134457776363
  (0, 3)	0.30151134457776363
  (0, 5)	0.30151134457776363
  (0, 1)	0.6030226891555273
  (0, 0)	0.30151134457776363
  (0, 7)	0.30151134457776363
  (0, 4)	0.30151134457776363
  (0, 6)	0.30151134457776363


In [49]:
# dengan data lebih banyak
data = [
    "Pengertian Lagu Wajib beserta Ciri Lirik dan Contohnya",
    "Lirik Lagu Indonesia Raya",
    "Penjelasan Mengenai Metronome dan Fungsinya dalam Musik",
    "Ulasan tentang Ekonomi Kreatif beserta Ciri-Ciri dan Manfaatnya",
    "Ragam Kerajinan Khas yang Terdapat di Indonesia"
]

In [50]:
# Fit TF-IDF on the larger, five-document corpus.
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(data)

In [51]:
tfidf.shape

(5, 27)

In [52]:
print(tfidf)

  (0, 2)	0.40648465275893914
  (0, 4)	0.27222750934939444
  (0, 13)	0.32794925841041295
  (0, 1)	0.32794925841041295
  (0, 0)	0.32794925841041295
  (0, 25)	0.40648465275893914
  (0, 12)	0.32794925841041295
  (0, 18)	0.40648465275893914
  (1, 21)	0.58195149786718
  (1, 8)	0.4695148042146437
  (1, 13)	0.4695148042146437
  (1, 12)	0.4695148042146437
  (2, 17)	0.39379498998448487
  (2, 3)	0.39379498998448487
  (2, 7)	0.39379498998448487
  (2, 16)	0.39379498998448487
  (2, 15)	0.39379498998448487
  (2, 19)	0.39379498998448487
  (2, 4)	0.26372909429700125
  (3, 14)	0.33897148063305216
  (3, 11)	0.33897148063305216
  (3, 6)	0.33897148063305216
  (3, 22)	0.33897148063305216
  (3, 24)	0.33897148063305216
  (3, 4)	0.22701315114087783
  (3, 1)	0.5469601124734934
  (3, 0)	0.2734800562367467
  (4, 5)	0.38775666010579296
  (4, 23)	0.38775666010579296
  (4, 26)	0.38775666010579296
  (4, 10)	0.38775666010579296
  (4, 9)	0.38775666010579296
  (4, 20)	0.38775666010579296
  (4, 8)	0.3128396318588854


In [53]:
# Dense TF-IDF table: one row per document, one column per vocabulary term.
tfidf_array = tfidf.toarray()
df = pd.DataFrame(data=tfidf_array, columns = vectorizer.get_feature_names_out()) # current scikit-learn API; the older get_feature_names() was deprecated in 1.0 and removed in 1.2
df

Unnamed: 0,beserta,ciri,contohnya,dalam,dan,di,ekonomi,fungsinya,indonesia,kerajinan,...,musik,pengertian,penjelasan,ragam,raya,tentang,terdapat,ulasan,wajib,yang
0,0.327949,0.327949,0.406485,0.0,0.272228,0.0,0.0,0.0,0.0,0.0,...,0.0,0.406485,0.0,0.0,0.0,0.0,0.0,0.0,0.406485,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.469515,0.0,...,0.0,0.0,0.0,0.0,0.581951,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.393795,0.263729,0.0,0.0,0.393795,0.0,0.0,...,0.393795,0.0,0.393795,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.27348,0.54696,0.0,0.0,0.227013,0.0,0.338971,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.338971,0.0,0.338971,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.387757,0.0,0.0,0.31284,0.387757,...,0.0,0.0,0.0,0.387757,0.0,0.0,0.387757,0.0,0.0,0.387757


* Customize TF-IDF

In [56]:
# Use the stop-word list defined earlier
# AND use min_df as an additional, frequency-based stop-word filter:
# ignore terms that appear in fewer than 25% of the documents.
# (A hand-picked alternative list would be: stop_words = ["dan", "yang"])
# NOTE: `long_text` holds a single document, so min_df=0.25 removes nothing here.

vectorizer = TfidfVectorizer(stop_words=stop_words, min_df=0.25)
tfidf=vectorizer.fit_transform(long_text)
tfidf.shape



(1, 47)

In [58]:
print(tfidf)

  (0, 44)	0.08247860988423225
  (0, 4)	0.08247860988423225
  (0, 2)	0.08247860988423225
  (0, 18)	0.08247860988423225
  (0, 24)	0.08247860988423225
  (0, 39)	0.08247860988423225
  (0, 20)	0.08247860988423225
  (0, 38)	0.08247860988423225
  (0, 43)	0.1649572197684645
  (0, 46)	0.24743582965269675
  (0, 42)	0.1649572197684645
  (0, 29)	0.1649572197684645
  (0, 17)	0.1649572197684645
  (0, 28)	0.08247860988423225
  (0, 31)	0.08247860988423225
  (0, 45)	0.08247860988423225
  (0, 13)	0.08247860988423225
  (0, 30)	0.08247860988423225
  (0, 7)	0.08247860988423225
  (0, 36)	0.08247860988423225
  (0, 10)	0.329914439536929
  (0, 16)	0.08247860988423225
  (0, 26)	0.08247860988423225
  (0, 11)	0.08247860988423225
  (0, 3)	0.08247860988423225
  (0, 8)	0.08247860988423225
  (0, 33)	0.08247860988423225
  (0, 27)	0.08247860988423225
  (0, 25)	0.08247860988423225
  (0, 15)	0.08247860988423225
  (0, 32)	0.08247860988423225
  (0, 5)	0.08247860988423225
  (0, 19)	0.08247860988423225
  (0, 23)	0.0824786098

In [57]:
vectorizer.vocabulary_

{'pengolahan': 40,
 'bahasa': 1,
 'alami': 0,
 'disingkat': 9,
 'pba': 37,
 'inggris': 14,
 'natural': 34,
 'language': 22,
 'processing': 41,
 'nlp': 35,
 'cabang': 6,
 'ilmu': 12,
 'komputer': 21,
 'linguistik': 23,
 'kecerdasan': 19,
 'buatan': 5,
 'mengkaji': 32,
 'interaksi': 15,
 'manusia': 25,
 'memprogram': 27,
 'mengolah': 33,
 'data': 8,
 'besar': 3,
 'hasilnya': 11,
 'memahami': 26,
 'isi': 16,
 'dokumen': 10,
 'nuansa': 36,
 'dalamnya': 7,
 'mengambil': 30,
 'informasi': 13,
 'wawasan': 45,
 'mengelompokkan': 31,
 'menata': 28,
 'kajian': 17,
 'mencakup': 29,
 'segmentasi': 42,
 'wicara': 46,
 'teks': 43,
 'penandaan': 38,
 'kelas': 20,
 'pengawataksaan': 39,
 'makna': 24,
 'kajiannya': 18,
 'berkembang': 2,
 'bidang': 4,
 'terpisah': 44}

Bobot TF-IDF paling tinggi dimiliki kata "komputer" dan "bahasa".

2. Co-Occurrence Matrix

* Matriks A menyimpan jumlah co-occurrence antarkata, yaitu seberapa sering dua kata muncul berdekatan.

* Dalam metode ini, kita menghitung berapa kali setiap kata muncul di dalam window dengan ukuran tertentu di sekitar kata yang dituju.

* Hitung jumlah ini untuk semua kata dalam korpus.

In [60]:
import nltk
import numpy as np
import pandas as pd

In [84]:
sentences=['saya suka makan',
           'saya suka nonton',
           'saya benci tidur']

In [85]:
# Unique words across all sentences. The list is built from a set, so the word
# order (and therefore the row/column order of the matrix built from it) is
# arbitrary and may differ between kernel sessions.
kata = list(set((" ".join(sentences)).split()))

In [86]:
# Tokenize every sentence on whitespace, keeping one token list per sentence.
text_data = [sentence.split() for sentence in sentences]
text_data

[['saya', 'suka', 'makan'],
 ['saya', 'suka', 'nonton'],
 ['saya', 'benci', 'tidur']]

In [87]:
# Number of whitespace-separated words in each sentence, and the smallest of those counts.
num_words = list(map(len, map(str.split, sentences)))
min_words = min(num_words)
min_words

3

In [88]:
window_size = 2

# Collect every pair [word, later_word] in which `later_word` occurs at most
# `window_size` positions to the right of `word` within the same sentence.
w = []
for sentence in sentences:
    tokens = sentence.split(' ')
    # Every position may start a window; the slice clips itself at the end of the
    # sentence. Stopping the outer loop at len(tokens) - window_size + 1 would
    # drop pairs near the end of a sentence whenever window_size > 2, and would
    # drop all pairs of a sentence shorter than window_size.
    for position, word in enumerate(tokens):
        for neighbour in tokens[position + 1 : position + window_size + 1]:
            w.append([word, neighbour])

In [89]:
len(w)

9

In [90]:
w

[['saya', 'suka'],
 ['saya', 'makan'],
 ['suka', 'makan'],
 ['saya', 'suka'],
 ['saya', 'nonton'],
 ['suka', 'nonton'],
 ['saya', 'benci'],
 ['saya', 'tidur'],
 ['benci', 'tidur']]

In [91]:
# Add the mirrored version of every pair so that co-occurrence is counted in both directions.
w1 = [[second, first] for first, second in w]
w += w1
len(w)

18

In [92]:
array = np.zeros((len(kata), len(kata)))

In [93]:
df = pd.DataFrame(array, index = kata, columns = kata)

In [94]:
df

Unnamed: 0,nonton,benci,suka,tidur,makan,saya
nonton,0.0,0.0,0.0,0.0,0.0,0.0
benci,0.0,0.0,0.0,0.0,0.0,0.0
suka,0.0,0.0,0.0,0.0,0.0,0.0
tidur,0.0,0.0,0.0,0.0,0.0,0.0
makan,0.0,0.0,0.0,0.0,0.0,0.0
saya,0.0,0.0,0.0,0.0,0.0,0.0


In [95]:
# Add one to the matrix cell (row word, column word) for every pair in w.
for row_word, col_word in w:
    df.at[row_word, col_word] += 1

In [None]:
# Same three sentences as defined earlier, repeated here for reference next to the resulting matrix.
sentences=['saya suka makan',
           'saya suka nonton',
           'saya benci tidur']

In [96]:
# Co-occurrence matrix: each cell holds how many collected pairs link the row word and the column word.
df

Unnamed: 0,nonton,benci,suka,tidur,makan,saya
nonton,0.0,0.0,1.0,0.0,0.0,1.0
benci,0.0,0.0,0.0,1.0,0.0,1.0
suka,1.0,0.0,0.0,0.0,1.0,2.0
tidur,0.0,1.0,0.0,0.0,0.0,1.0
makan,0.0,0.0,1.0,0.0,0.0,1.0
saya,1.0,1.0,2.0,1.0,1.0,0.0


3. N-Gram

N-gram dapat digunakan sebagai fitur untuk metode lain, misalnya TF-IDF.

In [97]:
data = [
    "Pengertian Lagu Wajib beserta Ciri Lirik dan Contohnya",
    "Lirik Lagu Indonesia Raya",
    "Penjelasan Mengenai Metronome dan Fungsinya dalam Musik",
    "Ulasan tentang Ekonomi Kreatif beserta Ciri-Ciri dan Manfaatnya",
    "Ragam Kerajinan Khas yang Terdapat di Indonesia"
]

In [98]:
vectorizer = TfidfVectorizer(ngram_range=(1,1)) # unigram
tfidf = vectorizer.fit_transform(data)
vectorizer.vocabulary_

{'pengertian': 18,
 'lagu': 12,
 'wajib': 25,
 'beserta': 0,
 'ciri': 1,
 'lirik': 13,
 'dan': 4,
 'contohnya': 2,
 'indonesia': 8,
 'raya': 21,
 'penjelasan': 19,
 'mengenai': 15,
 'metronome': 16,
 'fungsinya': 7,
 'dalam': 3,
 'musik': 17,
 'ulasan': 24,
 'tentang': 22,
 'ekonomi': 6,
 'kreatif': 11,
 'manfaatnya': 14,
 'ragam': 20,
 'kerajinan': 9,
 'khas': 10,
 'yang': 26,
 'terdapat': 23,
 'di': 5}

In [99]:
vectorizer = TfidfVectorizer(ngram_range=(1, 2)) # unigram - bigram
tfidf = vectorizer.fit_transform(data)
vectorizer.vocabulary_

{'pengertian': 39,
 'lagu': 27,
 'wajib': 52,
 'beserta': 0,
 'ciri': 2,
 'lirik': 30,
 'dan': 9,
 'contohnya': 6,
 'pengertian lagu': 40,
 'lagu wajib': 29,
 'wajib beserta': 53,
 'beserta ciri': 1,
 'ciri lirik': 5,
 'lirik dan': 31,
 'dan contohnya': 10,
 'indonesia': 19,
 'raya': 45,
 'lirik lagu': 32,
 'lagu indonesia': 28,
 'indonesia raya': 20,
 'penjelasan': 41,
 'mengenai': 34,
 'metronome': 36,
 'fungsinya': 17,
 'dalam': 7,
 'musik': 38,
 'penjelasan mengenai': 42,
 'mengenai metronome': 35,
 'metronome dan': 37,
 'dan fungsinya': 11,
 'fungsinya dalam': 18,
 'dalam musik': 8,
 'ulasan': 50,
 'tentang': 46,
 'ekonomi': 15,
 'kreatif': 25,
 'manfaatnya': 33,
 'ulasan tentang': 51,
 'tentang ekonomi': 47,
 'ekonomi kreatif': 16,
 'kreatif beserta': 26,
 'ciri ciri': 3,
 'ciri dan': 4,
 'dan manfaatnya': 12,
 'ragam': 43,
 'kerajinan': 21,
 'khas': 23,
 'yang': 54,
 'terdapat': 48,
 'di': 13,
 'ragam kerajinan': 44,
 'kerajinan khas': 22,
 'khas yang': 24,
 'yang terdapat': 55,

In [102]:
vectorizer = TfidfVectorizer(ngram_range=(1,3)) # unigrams, bigrams and trigrams
tfidf = vectorizer.fit_transform(data)
vectorizer.vocabulary_

{'pengertian': 56,
 'lagu': 38,
 'wajib': 75,
 'beserta': 0,
 'ciri': 4,
 'lirik': 43,
 'dan': 14,
 'contohnya': 11,
 'pengertian lagu': 57,
 'lagu wajib': 41,
 'wajib beserta': 76,
 'beserta ciri': 1,
 'ciri lirik': 9,
 'lirik dan': 44,
 'dan contohnya': 15,
 'pengertian lagu wajib': 58,
 'lagu wajib beserta': 42,
 'wajib beserta ciri': 77,
 'beserta ciri lirik': 3,
 'ciri lirik dan': 10,
 'lirik dan contohnya': 45,
 'indonesia': 27,
 'raya': 65,
 'lirik lagu': 46,
 'lagu indonesia': 39,
 'indonesia raya': 28,
 'lirik lagu indonesia': 47,
 'lagu indonesia raya': 40,
 'penjelasan': 59,
 'mengenai': 49,
 'metronome': 52,
 'fungsinya': 24,
 'dalam': 12,
 'musik': 55,
 'penjelasan mengenai': 60,
 'mengenai metronome': 50,
 'metronome dan': 53,
 'dan fungsinya': 16,
 'fungsinya dalam': 25,
 'dalam musik': 13,
 'penjelasan mengenai metronome': 61,
 'mengenai metronome dan': 51,
 'metronome dan fungsinya': 54,
 'dan fungsinya dalam': 17,
 'fungsinya dalam musik': 26,
 'ulasan': 72,
 'tentang

In [101]:
print(tfidf)

  (0, 13)	0.40824829046386296
  (0, 4)	0.40824829046386296
  (0, 1)	0.40824829046386296
  (0, 23)	0.40824829046386296
  (0, 12)	0.40824829046386296
  (0, 17)	0.40824829046386296
  (1, 11)	0.7071067811865475
  (1, 14)	0.7071067811865475
  (2, 7)	0.4472135954999579
  (2, 5)	0.4472135954999579
  (2, 16)	0.4472135954999579
  (2, 15)	0.4472135954999579
  (2, 18)	0.4472135954999579
  (3, 3)	0.3779644730092272
  (3, 2)	0.3779644730092272
  (3, 0)	0.3779644730092272
  (3, 10)	0.3779644730092272
  (3, 6)	0.3779644730092272
  (3, 20)	0.3779644730092272
  (3, 22)	0.3779644730092272
  (4, 21)	0.4472135954999579
  (4, 24)	0.4472135954999579
  (4, 9)	0.4472135954999579
  (4, 8)	0.4472135954999579
  (4, 19)	0.4472135954999579


In [109]:
vectorizer = TfidfVectorizer(ngram_range=(3,3), stop_words=stop_words) # trigrams only
tfidf = vectorizer.fit_transform(long_text)
vectorizer.vocabulary_

{'pengolahan bahasa alami': 56,
 'bahasa alami disingkat': 4,
 'alami disingkat pba': 1,
 'disingkat pba bahasa': 16,
 'pba bahasa inggris': 53,
 'bahasa inggris natural': 7,
 'inggris natural language': 24,
 'natural language processing': 49,
 'language processing disingkat': 36,
 'processing disingkat nlp': 58,
 'disingkat nlp cabang': 15,
 'nlp cabang ilmu': 50,
 'cabang ilmu komputer': 12,
 'ilmu komputer linguistik': 22,
 'komputer linguistik kecerdasan': 32,
 'linguistik kecerdasan buatan': 37,
 'kecerdasan buatan mengkaji': 29,
 'buatan mengkaji interaksi': 11,
 'mengkaji interaksi komputer': 47,
 'interaksi komputer bahasa': 25,
 'komputer bahasa alami': 31,
 'bahasa alami manusia': 5,
 'alami manusia memprogram': 2,
 'manusia memprogram komputer': 39,
 'memprogram komputer mengolah': 41,
 'komputer mengolah data': 35,
 'mengolah data bahasa': 48,
 'data bahasa alami': 14,
 'bahasa alami besar': 3,
 'alami besar hasilnya': 0,
 'besar hasilnya komputer': 9,
 'hasilnya komputer m

tfidf+ngram(unigram&bigram): bahasa & komputer(0.33)

tfidf+bigram: bahasa alami(0.35)

tfidf+trigram: bobot merata (semua trigram bernilai sama, sekitar 0.12)

In [110]:
print(tfidf)

  (0, 10)	0.12216944435630522
  (0, 8)	0.12216944435630522
  (0, 64)	0.12216944435630522
  (0, 57)	0.12216944435630522
  (0, 65)	0.12216944435630522
  (0, 62)	0.12216944435630522
  (0, 44)	0.12216944435630522
  (0, 28)	0.12216944435630522
  (0, 38)	0.12216944435630522
  (0, 55)	0.12216944435630522
  (0, 30)	0.12216944435630522
  (0, 54)	0.12216944435630522
  (0, 61)	0.12216944435630522
  (0, 59)	0.12216944435630522
  (0, 66)	0.12216944435630522
  (0, 60)	0.12216944435630522
  (0, 43)	0.12216944435630522
  (0, 51)	0.12216944435630522
  (0, 27)	0.12216944435630522
  (0, 18)	0.12216944435630522
  (0, 17)	0.12216944435630522
  (0, 42)	0.12216944435630522
  (0, 46)	0.12216944435630522
  (0, 19)	0.12216944435630522
  (0, 63)	0.12216944435630522
  :	:
  (0, 41)	0.12216944435630522
  (0, 39)	0.12216944435630522
  (0, 2)	0.12216944435630522
  (0, 5)	0.12216944435630522
  (0, 31)	0.12216944435630522
  (0, 25)	0.12216944435630522
  (0, 47)	0.12216944435630522
  (0, 11)	0.12216944435630522
  (0, 2