# **Latent Semantic Analysis**

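*Topic modelling* adalah salah satu teknik analisis teks dalam *data science* yang bertujuan mengidentifikasi pola atau topik tersembunyi di dalam kumpulan dokumen (korpus teks). Notebook ini menerapkan *Latent Semantic Analysis* (LSA), yaitu dekomposisi SVD terhadap matriks TF-IDF, pada komentar YouTube.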

## Crawling Komentar YouTube

In [None]:
!pip install selenium --quiet
!pip install selenium
!pip install webdriver_manager

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import time
from selenium.webdriver import Chrome
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service

# # selenium 4
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager

# Configure Chrome to run headless, without sandbox or GPU.
chrome_options = webdriver.ChromeOptions()
for flag in ("--no-sandbox", "--headless", "--disable-gpu"):
    chrome_options.add_argument(flag)

# Smoke test: open one page, report its title, and always release the browser.
browser = webdriver.Chrome(options=chrome_options)
try:
    browser.get("https://www.google.com")
    print(f"Page title was '{browser.title}'")
finally:
    browser.quit()

Page title was 'Google'


In [None]:
#data=[]
#with Chrome(executable_path=r'C:\Program Files\chromedriver.exe') as driver:
#     wait = WebDriverWait(driver,10)
#     driver.get("https://www.youtube.com/watch?v=LOvbNCf44TE")

#     for item in range(40): 
#         wait.until(EC.visibility_of_element_located((By.TAG_NAME, "body"))).send_keys(Keys.END)
#         time.sleep(5)

#     for comment in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#content"))):
#         data.append(comment.text)

In [None]:
#import pandas as pd   
#df = pd.DataFrame(data, columns=['comment'])
#df

### Preprocessing Data

In [None]:
import pandas as pd
# Load the crawled comments from the CSV published on GitHub.
RAW_COMMENTS_URL = 'https://raw.githubusercontent.com/tiarh/data/main/comment_yt.csv'
df = pd.read_csv(RAW_COMMENTS_URL)
df

Unnamed: 0,comments
0,Mntp hati untk pk prabowo
1,CAWAPRES PAK JENDERAL FREDY NUMBERI
2,Ayo brabowo kami mendukung mu
3,walaupun surve menunjuk kan terendah pilihan k...
4,"Hidup pak ganjar, dari maluku Utara buat pak g..."
...,...
507,Betul bro. Jadikan pengalaman menjadi pembelaj...
508,Yang merasa sangat nyaman dengan pimpinan PDIp...
509,Anis Baswedan aja
510,Prabowo kawan❤


In [None]:
import re, string
import numpy as np

# # Text Cleaning
def cleaning(text):
    """Normalise one raw comment into lowercase text made of letters only.

    Processing order: remove HTML tags and entities, lowercase, replace ASCII
    punctuation with spaces, drop any remaining non-word symbols (such as
    emoji), replace digits with spaces, then collapse and trim whitespace.

    Args:
        text: The comment to clean. Non-string values are converted with
            ``str`` first, so a float NaN arrives as the text ``'nan'``.

    Returns:
        The cleaned string. An empty string is returned when nothing is left
        or when the whole value is the stringified missing marker ``'nan'``.
    """
    text = str(text)

    # HTML tags and character entities (matched before lowercasing, as the
    # entity pattern is case-sensitive).
    text = re.sub(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});', '', text)

    # Case folding and trimming.
    text = text.lower().strip()

    # Punctuation becomes a space so that "a,b" does not merge into "ab".
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)

    # Remaining non-word symbols (emoji etc.) are dropped; digits become spaces.
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d', ' ', text)

    # Collapse runs of whitespace and trim, so an emptied comment is exactly ''.
    text = re.sub(r'\s+', ' ', text).strip()

    # Only a value that is entirely 'nan' is treated as missing. Removing the
    # substring everywhere would corrupt real words such as "pimpinan".
    if text == 'nan':
        return ''

    return text

In [None]:
from indoNLP.preprocessing import (
    emoji_to_words,
    pipeline,
    remove_html,
    replace_slang,
    replace_word_elongation,
)
from nltk.stem import PorterStemmer

# indoNLP normalisation steps handed to the pipeline, in this order.
normalisation_steps = [
    replace_word_elongation,
    replace_slang,
    emoji_to_words,
    remove_html,
]
pipe = pipeline(normalisation_steps)

# Stemming: stemmer instance (no other cell visible here references `ps`).
ps = PorterStemmer()

In [None]:
# Force every comment to text so the string-based cleaners never receive a float NaN.
df['comments'] = df['comments'].apply(str)

# Run the indoNLP pipeline first, then the regex-based cleaning.
cleaned_comments = df['comments'].apply(pipe).apply(cleaning)

# Turn empty strings into NaN so emptied comments count as missing values.
df['comments (clean)'] = cleaned_comments.replace('', np.nan)

In [None]:
# Count the missing values in each column after cleaning.
missing_per_column = df.isna().sum()
print(missing_per_column)

comments            0
comments (clean)    0
dtype: int64


In [None]:
 # Drop rows that contain a missing value. Rebinding `df` (instead of using
 # inplace=True) avoids mutating the frame object in place.
 df = df.dropna()
 df

Unnamed: 0,comments,comments (clean)
0,Mntp hati untk pk prabowo,mntp hati untuk pakai prabowo
1,CAWAPRES PAK JENDERAL FREDY NUMBERI,cawapres pak jenderal fredy numberi
2,Ayo brabowo kami mendukung mu,ayo brabowo kami mendukung mu
3,walaupun surve menunjuk kan terendah pilihan k...,walaupun surve menunjuk kan terendah pilihan k...
4,"Hidup pak ganjar, dari maluku Utara buat pak g...",hidup pak ganjar dari maluku utara buat pak ga...
...,...,...
507,Betul bro. Jadikan pengalaman menjadi pembelaj...,betul bro jadikan pengalaman menjadi pembelaja...
508,Yang merasa sangat nyaman dengan pimpinan PDIp...,yang merasa sangat nyaman dengan pimpi pdip si...
509,Anis Baswedan aja,anis baswedan saja
510,Prabowo kawan❤,prabowo kawan


In [None]:
# to csv
#df.to_csv('datacomments_fix.csv', index=False)

## Implementasi ke Modelling

In [None]:
# Silence every warning category from this point on.
import warnings
warnings.simplefilter('ignore')

## Import data CSV

In [None]:
import pandas as pd
# Reload the already-cleaned comments from the CSV published on GitHub.
CLEAN_COMMENTS_URL = 'https://raw.githubusercontent.com/tiarh/data/main/datacomments_fix.csv'
df = pd.read_csv(CLEAN_COMMENTS_URL)
df

Unnamed: 0,comments,comments (clean)
0,Mntp hati untk pk prabowo,mntp hati untuk pakai prabowo
1,CAWAPRES PAK JENDERAL FREDY NUMBERI,cawapres pak jenderal fredy numberi
2,Ayo brabowo kami mendukung mu,ayo brabowo kami mendukung mu
3,walaupun surve menunjuk kan terendah pilihan k...,walaupun surve menunjuk kan terendah pilihan k...
4,"Hidup pak ganjar, dari maluku Utara buat pak g...",hidup pak ganjar dari maluku utara buat pak ga...
...,...,...
507,Betul bro. Jadikan pengalaman menjadi pembelaj...,betul bro jadikan pengalaman menjadi pembelaja...
508,Yang merasa sangat nyaman dengan pimpinan PDIp...,yang merasa sangat nyaman dengan pimpi pdip si...
509,Anis Baswedan aja,anis baswedan saja
510,Prabowo kawan❤,prabowo kawan


In [None]:
import nltk
# Download the NLTK 'stopwords' corpus without progress output; the cell
# displays the call's return value.
nltk.download('stopwords', quiet=True)

True

In [None]:
from nltk.tokenize import RegexpTokenizer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.corpus import stopwords
# Tunable settings for the decomposition.
N_TOPICS = 4        # number of latent topics (SVD components)
RANDOM_STATE = 42   # fixed seed so the randomized SVD gives the same topics on every run

# Indonesian stopword list from NLTK. NOTE: this rebinds the imported
# `stopwords` name to a plain list, as the cell has always done.
stopwords = stopwords.words('indonesian')

# Build the document x term matrix with TF-IDF weights.
tokenizer = RegexpTokenizer(r'\w+')
vectorizer = TfidfVectorizer(lowercase=True,
                             stop_words=stopwords,
                             tokenizer=tokenizer.tokenize)

tfidf_matrix = vectorizer.fit_transform(df['comments (clean)'])

# Decompose the matrix with truncated SVD; each row of `lsa_matrix` holds one
# document's weight on every topic.
svd_model = TruncatedSVD(n_components=N_TOPICS, random_state=RANDOM_STATE)
lsa_matrix = svd_model.fit_transform(tfidf_matrix)

### Bobot Kata Terhadap Masing-Masing Topik

In [None]:
# Weight of each term for every topic: print the two highest-weighted terms.
terms = vectorizer.get_feature_names_out()

for index, component in enumerate(svd_model.components_):
    weighted_terms = list(zip(terms, component))
    # Stable descending sort on the weight, then keep the first two pairs.
    weighted_terms.sort(key=lambda pair: pair[1], reverse=True)
    top_terms_key = weighted_terms[:2]
    print(f"Topic {index}: ", top_terms_key)

Topic 0:  [('wajah', 0.45382255778176855), ('mata', 0.42739392772533574)]
Topic 1:  [('prabowo', 0.828071764231912), ('presiden', 0.2511205982404728)]
Topic 2:  [('abal', 0.8187106654541421), ('survey', 0.3437974927931136)]
Topic 3:  [('anis', 0.8142542900047178), ('no', 0.1531599557860485)]


### Bobot Setiap Topik Terhadap Dokumen

In [None]:
# Weight of every topic for each document.
# Column names follow the number of SVD components instead of being hard-coded.
topic_columns = [f"Topik {i}" for i in range(lsa_matrix.shape[1])]

# Reuse df's index so the column-wise concat pairs each comment with its own
# row of weights even if df does not carry a default 0..n-1 index.
df_lsa = pd.DataFrame(lsa_matrix, columns=topic_columns, index=df.index)
df_lsa = pd.concat([df["comments (clean)"], df_lsa], axis=1)

# Dominant topic = position of the largest weight in each row (vectorised).
df_lsa['Topik'] = lsa_matrix.argmax(axis=1)

df_lsa

Unnamed: 0,comments (clean),Topik 0,Topik 1,Topik 2,Topik 3,Topik
0,mntp hati untuk pakai prabowo,0.023903,0.228742,-0.054547,-0.065188,1
1,cawapres pak jenderal fredy numberi,0.004516,0.002347,0.000689,0.003715,0
2,ayo brabowo kami mendukung mu,0.001776,0.014369,-0.000373,-0.002676,1
3,walaupun surve menunjuk kan terendah pilihan k...,0.034830,0.063576,0.096889,0.230379,3
4,hidup pak ganjar dari maluku utara buat pak ga...,0.021373,0.093929,-0.003769,0.055226,1
...,...,...,...,...,...,...
507,betul bro jadikan pengalaman menjadi pembelaja...,0.008636,0.007680,0.003748,0.003463,0
508,yang merasa sangat nyaman dengan pimpi pdip si...,0.027193,0.161587,-0.000628,0.101567,1
509,anis baswedan saja,0.055589,0.139648,0.130601,0.583065,3
510,prabowo kawan,0.030546,0.321189,-0.080172,-0.106377,1


In [None]:
# Number of documents assigned to each dominant topic.
df_lsa['Topik'].value_counts()

1    242
2    104
3     89
0     77
Name: Topik, dtype: int64