# 1. Ambil datanya (Data Collection)

In [43]:
corpus = open("positive.txt", "r").read() # ini bakal jadi corpus saya

# 2. Data Preprocessing

In [44]:
import nltk
from string import punctuation
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

# Tokenize the corpus
sent_corpus = sent_tokenize(corpus)
corpus = word_tokenize(corpus)

# Remove stopwords
stop_words = set(stopwords.words('english'))
corpus = [word for word in corpus if word.lower() not in stop_words]

# Remove punctuation
corpus = [word for word in corpus if word not in punctuation]

# Remove numbers
corpus = [word for word in corpus if word.isalpha()]

cleaned_corpus = " ".join(corpus)

# N-grams

In [45]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [46]:
class NGramLanguageModel:
    def __init__(self, n):
        self.n = n # represents the n in n-gram
        self.vectorizer = CountVectorizer(analyzer="word", ngram_range=(n, n)) # membuat object CountVectorizer 

    def fit_transform(self, corpus):
        return self.vectorizer.fit_transform(corpus) # menghitung frekuensi kemunculan kata dan mengubahnya menjadi vector yang dapat diolah oleh model
    
    def transform(self, corpus):
        return self.vectorizer.transform(corpus) 
    
def calculate_cosine_similarity(matrix, query_vector):
    similarities = cosine_similarity(query_vector, matrix) # menghitung cosine similarity antara matrix dan vector query.
    return similarities

In [47]:
query = input("Masukkan query: ").lower()

n = 1
# buat object dari class NGramLanguageModel
ngram_language_model = NGramLanguageModel(n)

# ubah file corpus kita ke bentuk matrix
matrix = ngram_language_model.fit_transform(corpus)
# ubah file query kita ke bentuk vector
query_vector = ngram_language_model.transform([query])

print(f"{n}-gram Model: ")
data = matrix.A

# print data corpus kita dalam bentuk dataframe
print(pd.DataFrame(data, columns=ngram_language_model.vectorizer.get_feature_names_out()))
print(query_vector.A)

1-gram Model: 
      able  absolutely  accuracy  accurate  accurately  adjustable  adjusting  \
0        0           0         0         0           0           0          0   
1        0           0         0         0           0           0          0   
2        0           0         0         0           0           0          0   
3        0           0         0         0           0           0          0   
4        0           0         0         0           0           0          0   
...    ...         ...       ...       ...         ...         ...        ...   
1491     0           0         0         0           0           0          0   
1492     0           0         0         0           0           0          0   
1493     0           0         0         0           0           0          0   
1494     0           0         0         0           0           0          0   
1495     0           0         0         0           0           0          0   

      afford

In [48]:
# Kalkulasi cosine similarity corpus yang kita punya
similarities = calculate_cosine_similarity(matrix, query_vector)

# print cosine similarity
data = {'Document': corpus, 'Similarity': similarities[0]}
df = pd.DataFrame(data)

print(f"{query}")
print(df)

this tv is really good, i like the amoled display.
         Document  Similarity
0           watch    0.000000
1          really    0.447214
2            good    0.447214
3      durability    0.000000
4         battery    0.000000
...           ...         ...
1491         pack    0.000000
1492      charges    0.000000
1493      quickly    0.000000
1494  efficiently    0.000000
1495         love    0.000000

[1496 rows x 2 columns]


# Word-Embedding

In [50]:
import pandas as pd
import sklearn as sk
import math

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [67]:
# tokenisasi
total = set()
for i in range(1, len(sent_corpus)):
    first_sentence = word_tokenize(sent_corpus[i-1])
    second_sentence = word_tokenize(sent_corpus[i])
    total.update(set(first_sentence).union(set(second_sentence)))
    
# hitung frekuensi kemunculan kata (pake hashmap)
wordDict = []
for i in range(1, len(sent_corpus)):
    wordDict_temp = dict.fromkeys(total, 0)
    for word in first_sentence:
        wordDict_temp[word] += 1
    wordDict.append(wordDict_temp)

pd.DataFrame(wordDict)

Unnamed: 0,photo,hand,straps,have,on,stunning,to,like,notifications,accurately,...,",",seconds,gaming,with,navigates,satisfied,cleaning,hub,all,fun
0,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
264,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
265,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
266,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
267,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [79]:
def computeTF(wordDict, doc):
    tfDict = {}
    corpusCount = len(doc)
    for word, count in wordDict.items():
        tfDict[word] = count / float(corpusCount)
    return tfDict

tf_data = []
for i in range(len(wordDict)):
    tf_data.append(computeTF(wordDict[i], sent_corpus[i]))

tf = pd.DataFrame(tf_data)
tf

Unnamed: 0,photo,hand,straps,have,on,stunning,to,like,notifications,accurately,...,",",seconds,gaming,with,navigates,satisfied,cleaning,hub,all,fun
0,0.0,0.0,0.025000,0.0,0.025000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.021277,0.0,0.021277,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.023256,0.0,0.023256,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.019608,0.0,0.019608,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.027027,0.0,0.027027,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
264,0.0,0.0,0.011494,0.0,0.011494,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
265,0.0,0.0,0.011905,0.0,0.011905,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
266,0.0,0.0,0.012346,0.0,0.012346,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
267,0.0,0.0,0.012658,0.0,0.012658,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [84]:
def computeIDF(docList):
    idfDict = {}
    N = len(docList)

    idfDict = dict.fromkeys(docList[0].keys(), 0)
    for word, val in idfDict.items():
        idfDict[word] = math.log10(N / (float(val) + 1)) # Representasi rumus IDF ada disini semua
    
    idf_df = pd.DataFrame(list(idfDict.items()), columns=['Term', 'IDF'])
    return idf_df

idf = computeIDF(wordDict)
idf

Unnamed: 0,Term,IDF
0,photo,2.429752
1,hand,2.429752
2,straps,2.429752
3,have,2.429752
4,on,2.429752
...,...,...
484,satisfied,2.429752
485,cleaning,2.429752
486,hub,2.429752
487,all,2.429752


In [85]:
def computeTF_IDF(tfBoW, idfs):
    tfidf = {}
    for word, val in tfBoW.items():
        tfidf[word] = val * idfs.loc[idfs['Term'] == word, 'IDF'].iloc[0]
    return tfidf

tfidf_data = []
for i in range(len(tf_data)):
    tfidf_data.append(computeTF_IDF(tf_data[i], idf))

tfidf = pd.DataFrame(tfidf_data)
tfidf

Unnamed: 0,photo,hand,straps,have,on,stunning,to,like,notifications,accurately,...,",",seconds,gaming,with,navigates,satisfied,cleaning,hub,all,fun
0,0.0,0.0,0.060744,0.0,0.060744,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.051697,0.0,0.051697,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.056506,0.0,0.056506,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.047642,0.0,0.047642,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.065669,0.0,0.065669,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
264,0.0,0.0,0.027928,0.0,0.027928,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
265,0.0,0.0,0.028926,0.0,0.028926,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
266,0.0,0.0,0.029997,0.0,0.029997,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
267,0.0,0.0,0.030756,0.0,0.030756,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# 3. Entity Extraction

In [49]:
import spacy

electronic_device_keywords = [
    'laptop', 'phone', 'tablet', 'camera', 'headphones', 'smartphone', 'router', 
    'printer', 'monitor', 'smartwatch', 'console', 'speaker', 'TV', 'television',
    'VR headset', 'keyboard', 'mouse', 'charger', 'drone', 'projector'
]

def extract_named_entities_and_parsed_trees(sentence):
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(sentence)
    named_entities = {
        "persons": [ent.text for ent in doc.ents if ent.label_ == 'PERSON'],
        "organizations": [ent.text for ent in doc.ents if ent.label_ == 'ORG'],
        "locations": [ent.text for ent in doc.ents if ent.label_ == 'GPE'],
        "electronic_devices": [ent.text for ent in doc if ent.text.lower() in electronic_device_keywords]
    }

    print("Formatted Dependency Parse Tree:")
    for token in doc:
        print(f"{token.text} <--{token.dep_}-- {token.head.text} ({token.pos_})")
    return named_entities

def main():
    sentence = cleaned_corpus
    named_entities = extract_named_entities_and_parsed_trees(sentence)

    print("\nExtracted Named Entities:")
    print("Persons: ", named_entities["persons"])
    print("Organizations: ", named_entities["organizations"])
    print("Locations: ", named_entities["locations"])
    print("Electronic Devices: ", named_entities["electronic_devices"])

if __name__ == "__main__":
    main()

Formatted Dependency Parse Tree:
watch <--ccomp-- feels (VERB)
really <--advmod-- good (ADV)
good <--amod-- life (ADJ)
durability <--compound-- battery (NOUN)
battery <--compound-- life (NOUN)
life <--nsubj-- smartphone (NOUN)
smartphone <--ccomp-- watch (NOUN)
amazing <--amod-- clarity (ADJ)
love <--compound-- clarity (NOUN)
clarity <--compound-- screen (NOUN)
laptop <--compound-- screen (NOUN)
screen <--dobj-- smartphone (NOUN)
sound <--compound-- quality (NOUN)
quality <--compound-- headphones (NOUN)
headphones <--conj-- smartphone (NOUN)
fantastic <--amod-- captures (ADJ)
camera <--compound-- captures (NOUN)
captures <--conj-- smartphone (NOUN)
stunning <--amod-- photos (ADJ)
photos <--nsubj-- build (NOUN)
build <--ccomp-- watch (VERB)
quality <--nmod-- smartwatch (NOUN)
tablet <--nmod-- smartwatch (NOUN)
excellent <--amod-- speed (ADJ)
speed <--compound-- performance (NOUN)
performance <--nmod-- smartwatch (NOUN)
gaming <--nmod-- smartwatch (NOUN)
console <--nmod-- smartwatch (NOU