# 1. Ambil datanya (Data Collection)

In [1]:
corpus = open("positive.txt", "r").read() # ini bakal jadi corpus saya

# 2. Data Preprocessing

In [2]:
import nltk
from string import punctuation
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

# Tokenize the corpus
sent_corpus = sent_tokenize(corpus)
corpus = word_tokenize(corpus)

# Remove stopwords
stop_words = set(stopwords.words('english'))
corpus = [word for word in corpus if word.lower() not in stop_words]

# Remove punctuation
corpus = [word for word in corpus if word not in punctuation]

# Remove numbers
corpus = [word for word in corpus if word.isalpha()]

cleaned_corpus = " ".join(corpus)

# N-grams

In [3]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
class NGramLanguageModel:
    def __init__(self, n):
        self.n = n # represents the n in n-gram
        self.vectorizer = CountVectorizer(analyzer="word", ngram_range=(n, n)) # membuat object CountVectorizer 

    def fit_transform(self, corpus):
        return self.vectorizer.fit_transform(corpus) # menghitung frekuensi kemunculan kata dan mengubahnya menjadi vector yang dapat diolah oleh model
    
    def transform(self, corpus):
        return self.vectorizer.transform(corpus) 
    
def calculate_cosine_similarity(matrix, query_vector):
    similarities = cosine_similarity(query_vector, matrix) # menghitung cosine similarity antara matrix dan vector query.
    return similarities

In [5]:
query = input("Masukkan query: ").lower()

n = 1
# buat object dari class NGramLanguageModel
ngram_language_model = NGramLanguageModel(n)

# ubah file corpus kita ke bentuk matrix
matrix = ngram_language_model.fit_transform(corpus)
# ubah file query kita ke bentuk vector
query_vector = ngram_language_model.transform([query])

print(f"{n}-gram Model: ")
data = matrix.A

# print data corpus kita dalam bentuk dataframe
print(pd.DataFrame(data, columns=ngram_language_model.vectorizer.get_feature_names_out()))
print(query_vector.A)

1-gram Model: 
      able  absolutely  accuracy  accurate  accurately  adjustable  adjusting  \
0        0           0         0         0           0           0          0   
1        0           0         0         0           0           0          0   
2        0           0         0         0           0           0          0   
3        0           0         0         0           0           0          0   
4        0           0         0         0           0           0          0   
...    ...         ...       ...       ...         ...         ...        ...   
1491     0           0         0         0           0           0          0   
1492     0           0         0         0           0           0          0   
1493     0           0         0         0           0           0          0   
1494     0           0         0         0           0           0          0   
1495     0           0         0         0           0           0          0   

      afford

In [6]:
# Kalkulasi cosine similarity corpus yang kita punya
similarities = calculate_cosine_similarity(matrix, query_vector)

# print cosine similarity
data = {'Document': corpus, 'Similarity': similarities[0]}
df = pd.DataFrame(data)

print(f"{query}")
print(df)

this tv is good.
         Document  Similarity
0           watch    0.000000
1          really    0.000000
2            good    0.707107
3      durability    0.000000
4         battery    0.000000
...           ...         ...
1491         pack    0.000000
1492      charges    0.000000
1493      quickly    0.000000
1494  efficiently    0.000000
1495         love    0.000000

[1496 rows x 2 columns]


# Word-Embedding

# 0. Read the data

In [7]:
corpus = open("positive.txt", "r").read() # ini ceritanya bakal jadi corpus

# 1. Data Pre-processing

In [8]:
import pandas as pd
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from string import punctuation

eng_stopwords = stopwords.words('english')

# Tokenize
tokens = sent_tokenize(corpus)
for i in range(len(tokens)):
    tokens[i] = word_tokenize(tokens[i].lower())

# Remove stopwords
for i in range(len(tokens)):
    tokens[i] = [word for word in tokens[i] if word not in eng_stopwords]

# Remove punctuation
for i in range(len(tokens)):
    tokens[i] = [word for word in tokens[i] if word not in punctuation]

# Unionize all words
total = set()
for i in range(len(tokens)):
    total = total.union(set(tokens[i]))

# print the data (buat diliat aja)
for i in range(len(tokens)):
    print(tokens[i])

# print the total words
print(total)

['watch', 'really', 'good', 'durability']
['battery', 'life', 'smartphone', 'amazing']
['love', 'clarity', 'laptop', "'s", 'screen']
['sound', 'quality', 'headphones', 'fantastic']
['camera', 'captures', 'stunning', 'photos']
['build', 'quality', 'tablet', 'excellent']
['speed', 'performance', 'gaming', 'console', 'top-notch']
['smartwatch', 'impressive', 'range', 'features']
['keyboard', 'laptop', 'comfortable', 'use']
['picture', 'quality', 'tv', 'outstanding']
['satisfied', 'performance', 'blender']
['wireless', 'connectivity', 'earbuds', 'seamless']
['vacuum', 'cleaner', 'efficient', 'easy', 'use']
['display', 'monitor', 'crystal', 'clear']
['gps', 'accuracy', 'fitness', 'tracker', 'spot']
['microwave', 'oven', 'heats', 'food', 'evenly', 'quickly']
['impressed', 'noise', 'cancellation', 'headphones']
['interface', 'smart', 'home', 'hub', 'user-friendly']
['battery', 'life', 'tablet', 'lasts', 'day']
['e-reader', 'lightweight', 'easy', 'hold']
['performance', 'drone', 'exceptional']

In [9]:
import pandas as pd
import sklearn as sk
import math

# Menghitung frekuensi kemunculan kata
wordDict = [] # list of dictionary
for i in range(len(tokens)):
    wordDict.append(dict.fromkeys(total, 0))
    for word in tokens[i]:
        wordDict[i][word] += 1

pd.DataFrame(wordDict)

Unnamed: 0,hands,everywhere,theater,damaging,gentle,wi-fi,intuitive,smart,customization,protector,...,takes,router,powerful,highly,bright,changer,recharges,app,coffee,offers
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
266,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
267,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
268,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
def computeTF(wordDict, doc):
    tfDict = {}
    corpusCount = len(doc)
    for word, count in wordDict.items():
        tfDict[word] = count / float(corpusCount)
    return tfDict

tf = []
for i in range(len(tokens)):
    tf.append(computeTF(wordDict[i], tokens[i]))

pd.DataFrame(tf)

Unnamed: 0,hands,everywhere,theater,damaging,gentle,wi-fi,intuitive,smart,customization,protector,...,takes,router,powerful,highly,bright,changer,recharges,app,coffee,offers
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
266,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
267,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
268,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
def computeIDF(docList):
    idfDict = {}
    N = len(docList)

    idfDict = dict.fromkeys(docList[0].keys(), 0)
    for word, val in idfDict.items():
        idfDict[word] = math.log10(N / (float(val) + 1)) # Representasi rumus IDF ada disini semua
    
    idf_df = pd.DataFrame(list(idfDict.items()), columns=['Term', 'IDF'])
    return idf_df

idfs = computeIDF(wordDict)
print(idfs)

           Term       IDF
0         hands  2.431364
1    everywhere  2.431364
2       theater  2.431364
3      damaging  2.431364
4        gentle  2.431364
..          ...       ...
440     changer  2.431364
441   recharges  2.431364
442         app  2.431364
443      coffee  2.431364
444      offers  2.431364

[445 rows x 2 columns]


In [12]:
def computeTFIDF(tfBoW, idfs):
    tfidf = {}
    for word, val in tfBoW.items():
        tfidf[word] = val * idfs.loc[idfs['Term'] == word, 'IDF'].iloc[0]
    return tfidf

tfidf = []
for i in range(len(tokens)):
    tfidf.append(computeTFIDF(tf[i], idfs))

tfidf = pd.DataFrame(tfidf).fillna(0)
print(tfidf)

     hands  everywhere  theater  damaging  gentle  wi-fi  intuitive  smart  \
0      0.0         0.0      0.0       0.0     0.0    0.0        0.0    0.0   
1      0.0         0.0      0.0       0.0     0.0    0.0        0.0    0.0   
2      0.0         0.0      0.0       0.0     0.0    0.0        0.0    0.0   
3      0.0         0.0      0.0       0.0     0.0    0.0        0.0    0.0   
4      0.0         0.0      0.0       0.0     0.0    0.0        0.0    0.0   
..     ...         ...      ...       ...     ...    ...        ...    ...   
265    0.0         0.0      0.0       0.0     0.0    0.0        0.0    0.0   
266    0.0         0.0      0.0       0.0     0.0    0.0        0.0    0.0   
267    0.0         0.0      0.0       0.0     0.0    0.0        0.0    0.0   
268    0.0         0.0      0.0       0.0     0.0    0.0        0.0    0.0   
269    0.0         0.0      0.0       0.0     0.0    0.0        0.0    0.0   

     customization  protector  ...  takes  router  powerful  hi

In [13]:
import numpy as np
from numpy.linalg import norm

cosine_sim = cosine_similarity(tfidf, tfidf)

print("Cosine Similarity:")
print(cosine_sim)

Cosine Similarity:
[[1.         0.         0.         ... 0.         0.         0.        ]
 [0.         1.         0.         ... 0.         0.         0.18898224]
 [0.         0.         1.         ... 0.         0.         0.16903085]
 ...
 [0.         0.         0.         ... 1.         0.         0.        ]
 [0.         0.         0.         ... 0.         1.         0.        ]
 [0.         0.18898224 0.16903085 ... 0.         0.         1.        ]]


In [14]:
similarity_scores = list(enumerate(cosine_sim[0]))
similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)

# Print the top 3 most similar documents
print("\nTop 3 recommendations:")
for i, score in similarity_scores[1:4]:
    print(f"Document {i}: {tokens[i]} (Score: {score})")


Top 3 recommendations:
Document 63: ['watch', 'really', 'good', 'durability'] (Score: 1.0)
Document 167: ['tv', 'stunning', 'picture', 'quality', 'itâ€™s', 'joy', 'watch'] (Score: 0.18898223650461363)
Document 1: ['battery', 'life', 'smartphone', 'amazing'] (Score: 0.0)


In [15]:
import pickle

with open('recommendation_data.pickle', 'wb') as f:
    pickle.dump((tfidf, cosine_sim), f)

# 3. Entity Extraction

In [17]:
import spacy

electronic_device_keywords = [
    'laptop', 'phone', 'tablet', 'camera', 'headphones', 'smartphone', 'router', 
    'printer', 'monitor', 'smartwatch', 'console', 'speaker', 'TV', 'television',
    'VR headset', 'keyboard', 'mouse', 'charger', 'drone', 'projector'
]

def extract_named_entities_and_parsed_trees(sentence):
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(sentence)
    named_entities = {
        "persons": [ent.text for ent in doc.ents if ent.label_ == 'PERSON'],
        "organizations": [ent.text for ent in doc.ents if ent.label_ == 'ORG'],
        "locations": [ent.text for ent in doc.ents if ent.label_ == 'GPE'],
        "electronic_devices": [ent.text for ent in doc if ent.text.lower() in electronic_device_keywords]
    }

    print("Formatted Dependency Parse Tree:")
    for token in doc:
        print(f"{token.text} <--{token.dep_}-- {token.head.text} ({token.pos_})")
    return named_entities

sentence = cleaned_corpus
named_entities = extract_named_entities_and_parsed_trees(sentence)

print("\nExtracted Named Entities:")
print("Persons: ", named_entities["persons"])
print("Organizations: ", named_entities["organizations"])
print("Locations: ", named_entities["locations"])
print("Electronic Devices: ", named_entities["electronic_devices"])

Formatted Dependency Parse Tree:
watch <--ccomp-- feels (VERB)
really <--advmod-- good (ADV)
good <--amod-- life (ADJ)
durability <--compound-- battery (NOUN)
battery <--compound-- life (NOUN)
life <--nsubj-- smartphone (NOUN)
smartphone <--ccomp-- watch (NOUN)
amazing <--amod-- clarity (ADJ)
love <--compound-- clarity (NOUN)
clarity <--compound-- screen (NOUN)
laptop <--compound-- screen (NOUN)
screen <--dobj-- smartphone (NOUN)
sound <--compound-- quality (NOUN)
quality <--compound-- headphones (NOUN)
headphones <--conj-- smartphone (NOUN)
fantastic <--amod-- captures (ADJ)
camera <--compound-- captures (NOUN)
captures <--conj-- smartphone (NOUN)
stunning <--amod-- photos (ADJ)
photos <--nsubj-- build (NOUN)
build <--ccomp-- watch (VERB)
quality <--nmod-- smartwatch (NOUN)
tablet <--nmod-- smartwatch (NOUN)
excellent <--amod-- speed (ADJ)
speed <--compound-- performance (NOUN)
performance <--nmod-- smartwatch (NOUN)
gaming <--nmod-- smartwatch (NOUN)
console <--nmod-- smartwatch (NOU

In [18]:
import pickle

with open("ner_data.pickle", "wb") as f:
    pickle.dump(named_entities, f)