# Implementasi Vector Space Model Dalam Search Engine

##### Mengimpor Corpus

In [59]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Toy document collection used throughout the notebook: 13 short Indonesian
# sentences, one string per document. The position of a sentence in this list
# is the document index used later when ranking search results.
corpus = [
      'Aldi memasak nasi',
      'Messi bermain bola bersama Ronaldo',
      'Ronaldo makan nasi',
      'Messi beli somay',
      'Yanto pergi bersama aldi ke Jakarta',
      'Aldi bermain game dengan Ronaldo',
      'Jakarta merupakan kota kelahiran Messi',
      'somay itu dibuat oleh Ronaldo',
      'Messi makan nasi dengan somay',
      'Yanto dan Messi makan somay sambil bermain bola',
      'Ronaldo memasak sayur sebelum pergi ke Jakarta',
      'Jakarta hujan setiap sore hari',
      'ALdi pergi ke jakarta menonton pertandingan bola',
         ]

##### nltk untuk pemrosesan kata & pandas untuk pengolahan data

In [60]:
#loading basic packages
import pandas as pd
import nltk
# Tokenizer models required by nltk.word_tokenize. NLTK 3.8.2 and later load
# them from the 'punkt_tab' resource, while earlier releases load 'punkt', so
# fetch both to keep the notebook working on either version.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True


##### Fungsi Tokenizing dan stemming

In [61]:
# Tokenization: split a text into tokens, i.e. character sequences that are
# separated by whitespace or punctuation.
def get_tokenized_list(doc_text):
    """Return the list of tokens produced by ``nltk.word_tokenize`` for ``doc_text``."""
    return nltk.word_tokenize(doc_text)

# Stemming: reduce every token to its base form.
def word_stemmer(token_list):
    """Return the Porter stem of each token in ``token_list``, in the same order.

    A new ``nltk.stem.PorterStemmer`` is created on every call.

    NOTE(review): the Porter algorithm targets English, while the corpus in
    this notebook is Indonesian; in the recorded outputs the only visible
    effect is lowercasing. Confirm whether an Indonesian stemmer is intended.
    """
    stem = nltk.stem.PorterStemmer().stem
    return [stem(token) for token in token_list]

In [62]:
# Sanity-check the two helpers defined above on a single document.
tokens = get_tokenized_list(corpus[1])
print("WORD TOKENS:", tokens, sep="\n")
print("\nAFTER PERFORMING THE WORD STEMMING::")
# The stemmed token list is left as the cell's last expression so it is displayed.
doc_text = word_stemmer(tokens)
doc_text

WORD TOKENS:
['Messi', 'bermain', 'bola', 'bersama', 'Ronaldo']

AFTER PERFORMING THE WORD STEMMING::


['messi', 'bermain', 'bola', 'bersama', 'ronaldo']

In [63]:
# Re-join the stemmed tokens into a single space-separated sentence.
doc_ = ' '.join(doc_text)
doc_

'messi bermain bola bersama ronaldo'

##### Menjalankan fungsi Tokenizing dan Stemming

In [64]:
# Tokenize and stem every document, then glue its tokens back together with
# single spaces so that each entry is again one plain string.
cleaned_corpus = [
    ' '.join(word_stemmer(get_tokenized_list(raw_doc)))
    for raw_doc in corpus
]
cleaned_corpus

['aldi memasak nasi',
 'messi bermain bola bersama ronaldo',
 'ronaldo makan nasi',
 'messi beli somay',
 'yanto pergi bersama aldi ke jakarta',
 'aldi bermain game dengan ronaldo',
 'jakarta merupakan kota kelahiran messi',
 'somay itu dibuat oleh ronaldo',
 'messi makan nasi dengan somay',
 'yanto dan messi makan somay sambil bermain bola',
 'ronaldo memasak sayur sebelum pergi ke jakarta',
 'jakarta hujan setiap sore hari',
 'aldi pergi ke jakarta menonton pertandingan bola']

##### Menghitung TF & IDF

In [65]:
# Learn the vocabulary and IDF weights from the cleaned corpus and encode every
# document as one TF-IDF row (fit_transform is equivalent to fit + transform).
vectorizerX = TfidfVectorizer()
doc_vector = vectorizerX.fit_transform(cleaned_corpus)

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# replaces it but only exists from 1.0 on, hence the fallback.
try:
    feature_names = vectorizerX.get_feature_names_out()
except AttributeError:
    feature_names = vectorizerX.get_feature_names()
# list() keeps the printed form identical whichever API supplied the names.
print(list(feature_names))

print(doc_vector.shape)

['aldi', 'beli', 'bermain', 'bersama', 'bola', 'dan', 'dengan', 'dibuat', 'game', 'hari', 'hujan', 'itu', 'jakarta', 'ke', 'kelahiran', 'kota', 'makan', 'memasak', 'menonton', 'merupakan', 'messi', 'nasi', 'oleh', 'pergi', 'pertandingan', 'ronaldo', 'sambil', 'sayur', 'sebelum', 'setiap', 'somay', 'sore', 'yanto']
(13, 33)




In [66]:
# Load the TF-IDF matrix into a DataFrame (one row per document, one column per
# vocabulary term) so it can be displayed as a table.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# replaces it but only exists from 1.0 on, hence the fallback.
try:
    feature_names = vectorizerX.get_feature_names_out()
except AttributeError:
    feature_names = vectorizerX.get_feature_names()
df1 = pd.DataFrame(doc_vector.toarray(), columns=feature_names)
df1

Unnamed: 0,aldi,beli,bermain,bersama,bola,dan,dengan,dibuat,game,hari,...,pergi,pertandingan,ronaldo,sambil,sayur,sebelum,setiap,somay,sore,yanto
0,0.513078,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.465415,0.52485,0.465415,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.381647,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.501613,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.731686,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.504104,0.0,0.0
4,0.366968,0.0,0.0,0.459329,0.0,0.0,0.0,0.0,0.0,0.0,...,0.407314,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.459329
5,0.385362,0.0,0.42773,0.0,0.0,0.0,0.482352,0.0,0.559337,0.0,...,0.0,0.0,0.350745,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.508467,0.0,0.0,...,0.0,0.0,0.318846,0.0,0.0,0.0,0.0,0.350315,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.517107,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.413129,0.0,0.0
9,0.0,0.0,0.330122,0.0,0.330122,0.431697,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.431697,0.0,0.0,0.0,0.297423,0.0,0.37228


# Preprocess the query and transform it into a vector

##### Proses Searching

In [67]:
# Read the search query and push it through the same pipeline as the documents
# (tokenize -> stem -> re-join) so it can be encoded with the fitted vocabulary.
query = get_tokenized_list(input('ketikan sesuatu...'))
q = ' '.join(word_stemmer(query))
# transform() expects an iterable of documents, hence the one-element list.
query_vector = vectorizerX.transform([q])


# Penghitungan menggunakan Cosine Similarity
![alt text](https://i.stack.imgur.com/36r1U.png)

In [68]:
# Compute the cosine similarity between every document vector and the query
# vector. The query matrix holds a single row, so flatten() reduces the result
# to a 1-D array with one score per document.
from sklearn.metrics.pairwise import cosine_similarity
cosineSimilarities = cosine_similarity(doc_vector,query_vector).flatten()

In [69]:
# Rank documents by descending cosine similarity and keep at most the top five.
top_k = 5
ranked = cosineSimilarities.argsort()[::-1][:top_k]
# A score of 0 means the document shares no vocabulary term with the query, so
# such documents are dropped instead of being reported as related.
related_docs_indices = ranked[cosineSimilarities[ranked] > 0]
print(related_docs_indices)

# Show each hit as a one-element list holding the cleaned document text.
for i in related_docs_indices:
    data = [cleaned_corpus[i]]
    print(data)

[12  1  9  4  6]
['aldi pergi ke jakarta menonton pertandingan bola']
['messi bermain bola bersama ronaldo']
['yanto dan messi makan somay sambil bermain bola']
['yanto pergi bersama aldi ke jakarta']
['jakarta merupakan kota kelahiran messi']
