# Tugas 2 PPW : Pre Processing Data PTA Trunojoyo

In [None]:
import numpy as np
import pandas as pd
import pickle

## Import Data

In [None]:
# df = pd.read_csv('https://raw.githubusercontent.com/wahyuarilsaputra/dataset/main/DataPTAInformatika.csv')
# df = pd.read_csv('https://raw.githubusercontent.com/wahyuarilsaputra/dataset/main/DataPTAInformatikaMini.csv')
df = pd.read_csv('https://raw.githubusercontent.com/wahyuarilsaputra/dataset/main/DataPTAInformatikaLabel.csv',delimiter=';')
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
df.head()

## Pre Processing Data

### Cek Data yang Kosong

In [None]:
df.isnull().sum()

Judul             6
Nama Penulis      0
Pembimbing I      0
Pembimbing II    12
Abstrak          29
Prodi             5
Label             7
dtype: int64

### Menghapus Data yang Kosong

In [None]:
df.dropna(inplace=True)

### Cleaning Data

#### Membuat Fungsi Cleaning Data
- Tag HTML
- LowerCase Data
- Spasi pada teks
- Tanda baca dan karakter spesial
- Nomor
- Komponen Lainnya

In [None]:
import re, string

# Text Cleaning
def cleaning(text):
    # Menghapus tag HTML
    text = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});').sub('', str(text))

    # Mengubah seluruh teks menjadi huruf kecil
    text = text.lower()

    # Menghapus spasi pada teks
    text = text.strip()

    # Menghapus Tanda Baca, karakter spesial, and spasi ganda
    text = re.compile('<.*?>').sub('', text)
    text = re.compile('[%s]' % re.escape(string.punctuation)).sub(' ', text)
    text = re.sub('\s+', ' ', text)
    text = re.sub(r'\n', ' ', text)
    text = re.sub("â", "", text)

    # Menghapus Nomor
    text = re.sub(r'\[[0-9]*\]', ' ', text)
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
    text = re.sub(r'\d', ' ', text)
    text = re.sub(r'\s+', ' ', text)

    # Mengubah text yang berisi 'nan' dengan whitespace agar nantinya dapat dihapus
    text = re.sub('nan', '', text)

    return text

#### Implementasi Fungsi Pada Data Frame Abstrak

In [None]:
df['Abstrak'] = df['Abstrak'].apply(lambda x: cleaning(x))
df.head()

Unnamed: 0,Judul,Nama Penulis,Pembimbing I,Pembimbing II,Abstrak,Prodi,Label
0,Gerak Pekerja Pada Game Real Time Strategy Men...,Adi Chandra Laksono,"Kurniawan Eka P, S.Kom., Msc","Arik Kurniawati, S.Kom., M.T.",gerak pekerja ada pada game yang memiliki genr...,Jurusan Teknik Informatika,RPL
1,RANCANG BANGUN GAME PERAWATAN SAPI KARAPAN MEN...,NURRACHMAT,"Arik Kurniawati, S.Kom., M.T.","Kurniawan Eka Permana, S.Kom., MSc.",perkembangan game yang semakin pesat memberika...,Jurusan Teknik Informatika,RPL
2,EKSTRAKSI FITUR BERBASIS TWO DIMENSIONAL LINEA...,Muhammad Choirur Rozi,"Dr. Arif Muntasa, S.Si.,M.T","Fitri Damayanti, S.Kom.,M.kom",sistem pengenalan wajah adalah suatu sistem un...,Jurusan Teknik Informatika,Kecerdasan Komputasional
3,IMPLEMENTASI ALGORITMA PRIM DAN DEPTH FIRST ...,M Khoiril Anwar,"Cucun Very Angkoso, S.T., M.T.","Arik Kurniawati S. Kom., M.T.",teknologi mobile game beroperating system open...,Jurusan Teknik Informatika,RPL
4,Perancangan Sistem Informasi Badan Kepegawaian...,MALIKUL HAMZAH,"Moch. Kautsar Sophan, S.Kom., M.MT.","Yeni Kustiyaningsih, S.Kom., M.Kom.",kantor badan kepegawaian kota bangkalan adalah...,Jurusan Teknik Informatika,RPL


### Tokenisasi Data
Memisahkan sebuah Dokumen menjadi susunan per kata / term

#### Import Library NLTK

In [None]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('popular')

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package movie_reviews is already up-to-date!
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Package names is already up-to-date!
[nltk_data]    | Do

True

#### Implementasi Library pada Data

In [None]:
df['abstrak_tokens'] = df['Abstrak'].apply(lambda x: word_tokenize(x))
df[["Abstrak", "abstrak_tokens"]].head()

Unnamed: 0,Abstrak,abstrak_tokens
0,gerak pekerja ada pada game yang memiliki genr...,"[gerak, pekerja, ada, pada, game, yang, memili..."
1,perkembangan game yang semakin pesat memberika...,"[perkembangan, game, yang, semakin, pesat, mem..."
2,sistem pengenalan wajah adalah suatu sistem un...,"[sistem, pengenalan, wajah, adalah, suatu, sis..."
3,teknologi mobile game beroperating system open...,"[teknologi, mobile, game, beroperating, system..."
4,kantor badan kepegawaian kota bangkalan adalah...,"[kantor, badan, kepegawaian, kota, bangkalan, ..."


In [None]:
# df.to_csv('DataTokenisasi.csv', index=False)

#### Ekspor Model

### Stopword Data
Mengubah isi dokumen sesuai dengan kamus data

#### Import Library NLTK

In [None]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

#### Implementasi Library pada Data

In [None]:
from nltk.corpus import stopwords
from itertools import chain

stop_words = set(chain(stopwords.words('indonesian')))
df['abstrak_tokens'] = df['abstrak_tokens'].apply(lambda x: [w for w in x if not w in stop_words])

In [None]:
df['Abstrak'] = df['abstrak_tokens'].apply(lambda tokens: ' '.join(tokens))

In [None]:
df[["Abstrak", "abstrak_tokens"]].head()

Unnamed: 0,Abstrak,abstrak_tokens
0,gerak pekerja game memiliki genre rts real tim...,"[gerak, pekerja, game, memiliki, genre, rts, r..."
1,perkembangan game pesat alternative peminatnya...,"[perkembangan, game, pesat, alternative, pemin..."
2,sistem pengenalan wajah sistem mengenali ident...,"[sistem, pengenalan, wajah, sistem, mengenali,..."
3,teknologi mobile game beroperating system open...,"[teknologi, mobile, game, beroperating, system..."
4,kantor badan kepegawaian kota bangkalan instan...,"[kantor, badan, kepegawaian, kota, bangkalan, ..."


In [None]:
# df.to_csv('DataStopWord.csv', index=False)

### Steaming Data
Mengubah kata menjadi bentuk dasar

#### Import Library Sastrawi

In [None]:
!pip install sastrawi



#### Implementasi Library pada Data

In [None]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from tqdm.auto import tqdm
tqdm.pandas()

factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [None]:
df['abstrak_tokens'] = df['abstrak_tokens'].progress_apply(lambda x: stemmer.stem(' '.join(x)).split(' '))

  0%|          | 0/818 [00:00<?, ?it/s]

In [None]:
df['Abstrak'] = df['abstrak_tokens'].apply(lambda tokens: ' '.join(tokens))

In [None]:
df['Abstrak']

0      gerak kerja game milik genre rts real time str...
1      kembang game pesat alternative minat bentuk ga...
2      sistem kenal wajah sistem nali identitas wajah...
3      teknologi mobile game beroperating system open...
4      kantor badan pegawai kota bangkal instansi per...
                             ...                        
848    investasi saham milik resiko rugi dikarenakanp...
849    information retrieval ir ambil informasi simpa...
850    klasifikasi citra proses kelompok piksel citra...
851    identifikasi atribut pejal kaki salah teliti k...
852    topik deteksi objek tarik perhati kembang tekn...
Name: Abstrak, Length: 818, dtype: object

In [None]:
# df.to_csv('DataSteaming.csv', index=False)

In [None]:
df = pd.read_csv('https://raw.githubusercontent.com/wahyuarilsaputra/dataset/main/DataSteaming.csv')
df

Unnamed: 0,Judul,Nama Penulis,Pembimbing I,Pembimbing II,Abstrak,Prodi,Label,abstrak_tokens
0,Gerak Pekerja Pada Game Real Time Strategy Men...,Adi Chandra Laksono,"Kurniawan Eka P, S.Kom., Msc","Arik Kurniawati, S.Kom., M.T.",gerak kerja game milik genre rts real time str...,Jurusan Teknik Informatika,RPL,"['gerak', 'kerja', 'game', 'milik', 'genre', '..."
1,RANCANG BANGUN GAME PERAWATAN SAPI KARAPAN MEN...,NURRACHMAT,"Arik Kurniawati, S.Kom., M.T.","Kurniawan Eka Permana, S.Kom., MSc.",kembang game pesat alternative minat bentuk ga...,Jurusan Teknik Informatika,RPL,"['kembang', 'game', 'pesat', 'alternative', 'm..."
2,EKSTRAKSI FITUR BERBASIS TWO DIMENSIONAL LINEA...,Muhammad Choirur Rozi,"Dr. Arif Muntasa, S.Si.,M.T","Fitri Damayanti, S.Kom.,M.kom",sistem kenal wajah sistem nali identitas wajah...,Jurusan Teknik Informatika,Kecerdasan Komputasional,"['sistem', 'kenal', 'wajah', 'sistem', 'nali',..."
3,IMPLEMENTASI ALGORITMA PRIM DAN DEPTH FIRST ...,M Khoiril Anwar,"Cucun Very Angkoso, S.T., M.T.","Arik Kurniawati S. Kom., M.T.",teknologi mobile game beroperating system open...,Jurusan Teknik Informatika,RPL,"['teknologi', 'mobile', 'game', 'beroperating'..."
4,Perancangan Sistem Informasi Badan Kepegawaian...,MALIKUL HAMZAH,"Moch. Kautsar Sophan, S.Kom., M.MT.","Yeni Kustiyaningsih, S.Kom., M.Kom.",kantor badan pegawai kota bangkal instansi per...,Jurusan Teknik Informatika,RPL,"['kantor', 'badan', 'pegawai', 'kota', 'bangka..."
...,...,...,...,...,...,...,...,...
813,PENERAPAN ALGORITMA LONG-SHORT TERM MEMORY UNT...,Rachmad Agung Pambudi,"Eka Mala Sari Rochman, S.Kom., M.Kom","Sri Herawati, S.Kom., M.Kom",investasi saham milik resiko rugi dikarenakanp...,Jurusan Teknik Informatika,Kecerdasan Komputasional,"['investasi', 'saham', 'milik', 'resiko', 'rug..."
814,SISTEM PENCARIAN TEKS AL-QURAN TERJEMAHAN BERB...,Nadila Hidayanti,"Achmad Jauhari, S.T., M.Kom","Ika Oktavia Suzanti, S.Kom., M.Cs",information retrieval ir ambil informasi simpa...,Jurusan Teknik Informatika,Kecerdasan Komputasional,"['information', 'retrieval', 'ir', 'ambil', 'i..."
815,KLASIFIKASI KOMPLEKSITAS VISUAL CITRA SAMPAH M...,Afni Sakinah,"Dr. Indah Agustien Siradjuddin, S.Kom., M.Kom.","Moch. Kautsar Sophan, S.Kom., M.MT.",klasifikasi citra proses kelompok piksel citra...,Jurusan Teknik Informatika,Kecerdasan Komputasional,"['klasifikasi', 'citra', 'proses', 'kelompok',..."
816,IDENTIFIKASI BINER ATRIBUT PEJALAN KAKI MENGGU...,Friska Fatmawatiningrum,"Dr. Indah Agustien Siradjuddin, S.Kom., M.Kom.","Prof. Dr. Arief Muntasa, S.Si., M.MT.",identifikasi atribut pejal kaki salah teliti k...,Jurusan Teknik Informatika,Kecerdasan Komputasional,"['identifikasi', 'atribut', 'pejal', 'kaki', '..."


In [None]:
df['Abstrak']

0      gerak kerja game milik genre rts real time str...
1      kembang game pesat alternative minat bentuk ga...
2      sistem kenal wajah sistem nali identitas wajah...
3      teknologi mobile game beroperating system open...
4      kantor badan pegawai kota bangkal instansi per...
                             ...                        
813    investasi saham milik resiko rugi dikarenakanp...
814    information retrieval ir ambil informasi simpa...
815    klasifikasi citra proses kelompok piksel citra...
816    identifikasi atribut pejal kaki salah teliti k...
817    topik deteksi objek tarik perhati kembang tekn...
Name: Abstrak, Length: 818, dtype: object

## Ekstraksi Fitur

### Term Frekuensi

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer()
X_count = count_vectorizer.fit_transform(df['Abstrak'].values.astype('U'))

terms_count = count_vectorizer.get_feature_names_out()
df_countvect = pd.DataFrame(data = X_count.toarray(),columns = terms_count)
df_countvect.shape

(818, 6367)

In [None]:
terms_count

array(['aalysis', 'abad', 'abadi', ..., 'zoom', 'zucara', 'zungu'],
      dtype=object)

In [None]:
token_counts = df_countvect.sum(axis=0)

non_zero_token_counts = token_counts[token_counts != 0]

print("Token Counts yang Tidak Sama dengan 0:")
print(non_zero_token_counts)

Token Counts yang Tidak Sama dengan 0:
aalysis    1
abad       1
abadi      2
abai       1
abdi       3
          ..
zone       3
zoning     4
zoom       3
zucara     1
zungu      1
Length: 6367, dtype: int64


In [None]:
df_countvect.to_csv('Data_CountVectorize.csv', index=False)

In [None]:
with open("count_vectorizer_model.pkl", "wb") as file:
    pickle.dump(count_vectorizer, file)

### One Hot Encoding

In [None]:
df_countvect1 = df_countvect.apply(pd.to_numeric, errors='coerce')
df_binary = df_countvect1.applymap(lambda x: 1 if x > 0 else 0)
df_binary

Unnamed: 0,aalysis,abad,abadi,abai,abdi,abdullah,ability,abjad,absah,absensi,...,zara,zat,zcz,zf,zona,zone,zoning,zoom,zucara,zungu
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
813,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
814,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
815,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
816,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
df_binary.to_csv('Data_OneHotEncoder.csv', index=False)

In [None]:
with open("df_binary_model.pkl", "wb") as file:
    pickle.dump(df_binary, file)

### TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
# X_tfidf = vectorizer.fit_transform(df['Abstrak'].tolist())
X_tfidf = vectorizer.fit_transform((df['Abstrak'].values.astype('U').tolist()))

terms = vectorizer.get_feature_names_out()
df_tfidfvect = pd.DataFrame(data = X_tfidf.toarray(),columns = terms)
df_tfidfvect

Unnamed: 0,aalysis,abad,abadi,abai,abdi,abdullah,ability,abjad,absah,absensi,...,zara,zat,zcz,zf,zona,zone,zoning,zoom,zucara,zungu
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
813,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
814,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
815,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
816,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
X_tfidf.shape

(818, 6367)

In [None]:
df_tfidfvect.to_csv('Data_TF-IDF.csv', index=False)

#### Bobot Kata dalam setiap Dokumen

In [None]:
df_tfidf = pd.DataFrame(X_tfidf.todense().T,
                        index = terms,
                        columns = [f'D{i+1}' for i in range(len(df['Abstrak']))])
df_tfidf

Unnamed: 0,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,...,D809,D810,D811,D812,D813,D814,D815,D816,D817,D818
aalysis,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
abad,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
abadi,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
abai,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
abdi,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zone,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zoning,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zoom,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zucara,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
token_counts = df_tfidf.sum(axis=0)

non_zero_token_counts = token_counts[token_counts != 0]

print("Token Counts yang Tidak Sama dengan 0:")
print(non_zero_token_counts)

Token Counts yang Tidak Sama dengan 0:
D1      5.032601
D2      4.608430
D3      5.086846
D4      4.777752
D5      5.384437
          ...   
D814    6.996537
D815    6.429370
D816    6.874168
D817    5.764233
D818    6.831023
Length: 818, dtype: float64


In [None]:
with open("tfidf_vectorizer_model.pkl", "wb") as file:
    pickle.dump(vectorizer, file)

### Log Frekuensi

In [None]:
df_countvect1 = df_countvect.apply(pd.to_numeric, errors='coerce')
df_log = df_countvect1.applymap(lambda x: np.log1p(x) if x > 0 else 0)
df_log

Unnamed: 0,aalysis,abad,abadi,abai,abdi,abdullah,ability,abjad,absah,absensi,...,zara,zat,zcz,zf,zona,zone,zoning,zoom,zucara,zungu
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
813,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
814,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
815,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
816,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
df_log.to_csv('Data_LogFrekuensi.csv', index=False)

In [None]:
with open("df_log_model.pkl", "wb") as file:
    pickle.dump(df_log, file)

## Skip Gram Data

In [None]:
import gensim
from gensim.models import Word2Vec
from gensim.test.utils import common_texts
import pandas as pd

# sentences = df['abstrak_tokens'].tolist()
sentences = df['abstrak_tokens']

# model = Word2Vec(sentences, vector_size=100, window=1, sg=1, epochs=1)
model_word2vec = Word2Vec.load("word2vec_model")

word = "gerak"
similar_words = model_word2vec.wv.most_similar(word)

print(f"Kata yang mirip dengan '{word}':")
for w, sim in similar_words:
    print(f"{w}: {sim:.4f}")

Kata yang mirip dengan 'gerak':
main: 0.9476
bas: 0.9467
learning: 0.9446
pilih: 0.9444
kunci: 0.9443
media: 0.9442
milik: 0.9438
sesuai: 0.9438
indonesia: 0.9436
teknologi: 0.9435


In [None]:
# model.save("word2vec_model")

In [None]:
import gensim
from gensim.models import Word2Vec
import pandas as pd

sentences = df['abstrak_tokens'].tolist()

# model = Word2Vec(sentences, vector_size=100, window=1, sg=1, epochs=1)
model_word2vec = Word2Vec.load("word2vec_model")

word = "gerak"
context_words = []

if word in model_word2vec.wv:
    word_vector = model_word2vec.wv[word]
    similar_words = model_word2vec.wv.most_similar([word_vector], topn=3)
    context_words = [w for w, _ in similar_words]

print(f"Kata-kata dalam konteks window=1 untuk '{word}':")
for w in context_words:
    print(w)


Kata-kata dalam konteks window=1 untuk 'gerak':
gerak
main
bas


In [None]:
import gensim
from gensim.models import Word2Vec

# model = Word2Vec(sentences, vector_size=100, window=1, sg=1, epochs=1)
model_word2vec = Word2Vec.load("word2vec_model")

word1 = "gerak"
word2 = "sistem"

if word1 in model_word2vec.wv and word2 in model_word2vec.wv:
    vector1 = model_word2vec.wv[word1]
    vector2 = model_word2vec.wv[word2]
    similarity = model_word2vec.wv.cosine_similarities(vector1, [vector2])[0]

    print(f"Kesamaan kosakata antara '{word1}' dan '{word2}': {similarity:.4f}")
else:
    print("Salah satu atau kedua kata tidak ada dalam model.")


Kesamaan kosakata antara 'gerak' dan 'sistem': 0.9417


# Tugas 3 PPW : LDA Modeling

## LDA Modeling

In [None]:
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD

### Modeling Data
- k=3
- alpha=0.1
- betha=0.2



In [None]:
lda_model = LatentDirichletAllocation(n_components=100, doc_topic_prior=0.2, topic_word_prior=0.1, random_state=42)
w1 = lda_model.fit_transform(df_countvect)
h1 = lda_model.components_

In [None]:
import pickle
with open("lda_model100.pkl", "wb") as file:
    pickle.dump(lda_model, file)

In [None]:
w1

array([[0.00202476, 0.00202258, 0.0020202 , ..., 0.00202758, 0.00205153,
        0.00202921],
       [0.00183549, 0.0018404 , 0.00183486, ..., 0.00184194, 0.0018559 ,
        0.00183637],
       [0.00147103, 0.00905917, 0.00147059, ..., 0.00148173, 0.00150423,
        0.00147504],
       ...,
       [0.00115606, 0.00113776, 0.00113711, ..., 0.0011413 , 0.00115063,
        0.00113897],
       [0.00136091, 0.00136382, 0.0013606 , ..., 0.00136435, 0.00140549,
        0.00136308],
       [0.00183679, 0.00183967, 0.00183487, ..., 0.00185296, 0.00191061,
        0.00184806]])

### proporsi topik pada dokumen

In [None]:
colnames = ["Topic" + str(i) for i in range(lda_model.n_components)]
docnames = ["Doc" + str(i) for i in range(len(df['Abstrak']))]
df_doc_topic = pd.DataFrame(np.round(w1,2),columns=colnames,index=docnames)
df_doc_topic['label'] = df['Label'].values

In [None]:
df_doc_topic

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,...,Topic91,Topic92,Topic93,Topic94,Topic95,Topic96,Topic97,Topic98,Topic99,label
Doc0,0.0,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00,...,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.0,RPL
Doc1,0.0,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00,...,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.0,RPL
Doc2,0.0,0.01,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00,...,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.0,Kecerdasan Komputasional
Doc3,0.0,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.19,...,0.0,0.07,0.0,0.0,0.0,0.0,0.0,0.00,0.0,RPL
Doc4,0.0,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00,...,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.27,0.0,RPL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Doc813,0.0,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00,...,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.0,Kecerdasan Komputasional
Doc814,0.0,0.00,0.0,0.0,0.0,0.15,0.0,0.0,0.0,0.00,...,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.55,0.0,Kecerdasan Komputasional
Doc815,0.0,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00,...,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.0,Kecerdasan Komputasional
Doc816,0.0,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00,...,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.0,Kecerdasan Komputasional


In [None]:
doc_topic_proportions = lda_model.transform(X_count)

for i, doc in enumerate(df['Abstrak']):
    print(f"Dokumen {i+1}:")
    for j, topic_prob in enumerate(doc_topic_proportions[i]):
        print(f"Topik {j+1}: {topic_prob:.4f}")
    print()




Dokumen 1:
Topik 1: 0.4034
Topik 2: 0.0025
Topik 3: 0.5940

Dokumen 2:
Topik 1: 0.0023
Topik 2: 0.0023
Topik 3: 0.9955

Dokumen 3:
Topik 1: 0.0017
Topik 2: 0.9965
Topik 3: 0.0017

Dokumen 4:
Topik 1: 0.0027
Topik 2: 0.0026
Topik 3: 0.9947

Dokumen 5:
Topik 1: 0.3711
Topik 2: 0.1780
Topik 3: 0.4510

Dokumen 6:
Topik 1: 0.3291
Topik 2: 0.0018
Topik 3: 0.6691

Dokumen 7:
Topik 1: 0.8697
Topik 2: 0.0018
Topik 3: 0.1286

Dokumen 8:
Topik 1: 0.5077
Topik 2: 0.4898
Topik 3: 0.0025

Dokumen 9:
Topik 1: 0.2746
Topik 2: 0.6471
Topik 3: 0.0782

Dokumen 10:
Topik 1: 0.4008
Topik 2: 0.5974
Topik 3: 0.0018

Dokumen 11:
Topik 1: 0.1887
Topik 2: 0.8092
Topik 3: 0.0022

Dokumen 12:
Topik 1: 0.0016
Topik 2: 0.9968
Topik 3: 0.0016

Dokumen 13:
Topik 1: 0.0923
Topik 2: 0.9059
Topik 3: 0.0018

Dokumen 14:
Topik 1: 0.4776
Topik 2: 0.5208
Topik 3: 0.0016

Dokumen 15:
Topik 1: 0.9968
Topik 2: 0.0016
Topik 3: 0.0016

Dokumen 16:
Topik 1: 0.9737
Topik 2: 0.0245
Topik 3: 0.0018

Dokumen 17:
Topik 1: 0.0847
Topik

In [None]:
topic_word_distributions = lda_model.components_

feature_names = count_vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(topic_word_distributions):
    top_words_idx = topic.argsort()[::-1][:10]  # Ambil 10 kata teratas
    top_words = [feature_names[i] for i in top_words_idx]
    print(f"Topik {topic_idx}:")
    print(", ".join(top_words))
    print()

Topik 0:
metode, dosen, berita, rumah, kost, berkas, raket, plagiasi, sesuai, hasil

Topik 1:
jual, amal, barang, stok, toko, metode, data, hasil, beli, kec

Topik 2:
jurus, profil, preeklamsi, hamil, matching, besar, mati, survei, ringan, tim

Topik 3:
sakit, gejala, sistem, pakar, dokter, pasien, darah, derita, diagnosa, demam

Topik 4:
wajah, kenal, citra, metode, teliti, klasifikasi, hasil, coba, akurasi, fitur

Topik 5:
citra, cari, fitur, query, warna, mirip, tekstur, batik, ekstraksi, sistem

Topik 6:
air, nilai, sistem, hasil, garam, data, evaluasi, salah, tingkat, hilang

Topik 7:
voli, promosi, nan, media, elektronik, tukar, produk, tes, usaha, online

Topik 8:
ukm, usaha, ikm, tengah, metode, bangkal, hasil, hadits, tingkat, saing

Topik 9:
ajar, aplikasi, bahasa, media, anak, siswa, hasil, madura, learning, kembang

Topik 10:
nilai, jari, sidik, deteksi, algoritma, plagiarisme, hasil, mahasiswa, dokumen, sistem

Topik 11:
pasien, menu, kalori, diet, hari, sedia, harris, ben

### Proporsi kata pada topik

In [None]:
label = []
for i in range(1,(lda_model.components_.shape[1]+1)):
  masukan = df_countvect.columns[i-1]
  label.append(masukan)
df_topic_word = pd.DataFrame(lda_model.components_,columns = label)
df_topic_word.rename(index={0:"Topik 1",1:"Topik 2",2:"Topik 3"}).transpose()

Unnamed: 0,Topik 1,Topik 2,Topik 3,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
aalysis,0.100000,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.100000,0.100000,...,0.1,0.1,0.1,0.1,0.1,0.1,0.100000,0.100000,0.100110,0.100000
abad,0.100000,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.100000,0.100000,...,0.1,0.1,0.1,0.1,0.1,0.1,1.099991,0.100000,0.100005,0.100000
abadi,0.100000,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.100000,0.100000,...,0.1,0.1,0.1,0.1,0.1,0.1,0.100000,0.100000,0.100000,2.099998
abai,0.100000,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.100000,0.100000,...,0.1,0.1,0.1,0.1,0.1,0.1,0.100000,0.100000,0.100000,0.100000
abdi,1.100698,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.100000,0.100001,...,0.1,0.1,0.1,0.1,0.1,0.1,0.100000,0.100000,0.100003,0.100000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zone,0.100000,0.1,0.1,0.1,0.1,0.1,0.1,0.1,3.099987,0.100000,...,0.1,0.1,0.1,0.1,0.1,0.1,0.100000,0.100000,0.100000,0.100000
zoning,0.100000,0.1,0.1,0.1,0.1,0.1,0.1,0.1,4.099987,0.100000,...,0.1,0.1,0.1,0.1,0.1,0.1,0.100000,0.100000,0.100000,0.100000
zoom,0.100000,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.100006,0.100013,...,0.1,0.1,0.1,0.1,0.1,0.1,0.100000,0.100016,0.100000,0.100000
zucara,0.100000,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.100000,0.100007,...,0.1,0.1,0.1,0.1,0.1,0.1,0.100000,1.099975,0.100000,0.100000


In [None]:
df_countvect['labels'] = df['Label'].values
df_countvect.shape

(818, 6368)

## Training Data

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import time

X = df_doc_topic.drop('label', axis=1)
y = df_doc_topic['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
X_2 = df_countvect.drop('labels', axis=1)
y_2 = df_countvect['labels']

X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_2, y_2, test_size=0.2, random_state=42)

### Naive Bayes

In [None]:
start_time = time.time()
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(X_train, y_train)
end_time = time.time()
training_time = end_time - start_time
print("Waktu pelatihan model: {:.2f} detik".format(training_time))

Waktu pelatihan model: 0.01 detik


In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

y_pred = naive_bayes_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Akurasi:", accuracy)

print("Laporan Klasifikasi:")
print(classification_report(y_test, y_pred))

Akurasi: 0.7926829268292683
Laporan Klasifikasi:
                          precision    recall  f1-score   support

Kecerdasan Komputasional       0.80      0.98      0.88       124
                     RPL       0.75      0.24      0.36        38
kecerdasan Komputasional       0.00      0.00      0.00         2

                accuracy                           0.79       164
               macro avg       0.52      0.40      0.41       164
            weighted avg       0.78      0.79      0.75       164



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
with open("naive_bayes_model.pkl", "wb") as file:
    pickle.dump(naive_bayes_classifier, file)

In [None]:
start_time = time.time()
naive_bayes_classifier_2 = MultinomialNB()
naive_bayes_classifier_2.fit(X_train_2, y_train_2)
end_time = time.time()
training_time = end_time - start_time
print("Waktu pelatihan model: {:.2f} detik".format(training_time))

Waktu pelatihan model: 0.11 detik


In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

y_pred_2 = naive_bayes_classifier_2.predict(X_test_2)

accuracy = accuracy_score(y_test_2, y_pred_2)
print("Akurasi:", accuracy)

print("Laporan Klasifikasi:")
print(classification_report(y_test_2, y_pred_2))

Akurasi: 0.823170731707317
Laporan Klasifikasi:
                          precision    recall  f1-score   support

Kecerdasan Komputasional       0.91      0.86      0.88       124
                     RPL       0.62      0.74      0.67        38
kecerdasan Komputasional       0.00      0.00      0.00         2

                accuracy                           0.82       164
               macro avg       0.51      0.53      0.52       164
            weighted avg       0.83      0.82      0.82       164



In [None]:
with open("naive_bayes_model2.pkl", "wb") as file:
    pickle.dump(naive_bayes_classifier_2, file)

### KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
start_time = time.time()
neigh = KNeighborsClassifier(n_neighbors=3)
knn = neigh.fit(X_train, y_train)
end_time = time.time()
training_time = end_time - start_time
print("Waktu pelatihan model: {:.2f} detik".format(training_time))

Waktu pelatihan model: 0.01 detik


In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

y_pred_knn = knn.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_knn)
print("Akurasi:", accuracy)

print("Laporan Klasifikasi:")
print(classification_report(y_test, y_pred_knn))

Akurasi: 0.7439024390243902
Laporan Klasifikasi:
                          precision    recall  f1-score   support

Kecerdasan Komputasional       0.85      0.81      0.83       124
                     RPL       0.48      0.58      0.52        38
kecerdasan Komputasional       0.00      0.00      0.00         2

                accuracy                           0.74       164
               macro avg       0.44      0.46      0.45       164
            weighted avg       0.75      0.74      0.75       164



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
start_time = time.time()
neigh_2 = KNeighborsClassifier(n_neighbors=3)
knn_2 = neigh_2.fit(X_train_2, y_train_2)
end_time = time.time()
training_time = end_time - start_time
print("Waktu pelatihan model: {:.2f} detik".format(training_time))

Waktu pelatihan model: 0.04 detik


In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

y_pred_knn_2 = knn_2.predict(X_test_2)

accuracy = accuracy_score(y_test_2, y_pred_knn_2)
print("Akurasi:", accuracy)

print("Laporan Klasifikasi:")
print(classification_report(y_test_2, y_pred_knn_2))

Akurasi: 0.7926829268292683
Laporan Klasifikasi:
                          precision    recall  f1-score   support

Kecerdasan Komputasional       0.80      0.96      0.87       124
                     RPL       0.69      0.29      0.41        38
kecerdasan Komputasional       0.00      0.00      0.00         2

                accuracy                           0.79       164
               macro avg       0.50      0.42      0.43       164
            weighted avg       0.77      0.79      0.76       164



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
with open("knn_model.pkl", "wb") as file:
    pickle.dump(neigh, file)

### Decision Tree

In [None]:
from sklearn import tree

In [None]:
start_time = time.time()
clf = tree.DecisionTreeClassifier()
decision_tree = clf.fit(X_train, y_train)
end_time = time.time()
training_time = end_time - start_time
print("Waktu pelatihan model: {:.2f} detik".format(training_time))

Waktu pelatihan model: 0.02 detik


In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
y_pred_clf = decision_tree.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_clf)
print("Akurasi:", accuracy)

print("Laporan Klasifikasi:")
print(classification_report(y_test, y_pred_clf))

Akurasi: 0.7378048780487805
Laporan Klasifikasi:
                          precision    recall  f1-score   support

Kecerdasan Komputasional       0.84      0.81      0.82       124
                     RPL       0.47      0.55      0.51        38
kecerdasan Komputasional       0.00      0.00      0.00         2

                accuracy                           0.74       164
               macro avg       0.44      0.45      0.44       164
            weighted avg       0.74      0.74      0.74       164



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
start_time = time.time()
clf_2 = tree.DecisionTreeClassifier()
decision_tree_2 = clf_2.fit(X_train_2, y_train_2)
end_time = time.time()
training_time = end_time - start_time
print("Waktu pelatihan model: {:.2f} detik".format(training_time))

Waktu pelatihan model: 0.37 detik


In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
y_pred_clf_2 = decision_tree_2.predict(X_test_2)

accuracy = accuracy_score(y_test_2, y_pred_clf_2)
print("Akurasi:", accuracy)

print("Laporan Klasifikasi:")
print(classification_report(y_test_2, y_pred_clf_2))

Akurasi: 0.823170731707317
Laporan Klasifikasi:
                          precision    recall  f1-score   support

Kecerdasan Komputasional       0.89      0.87      0.88       124
                     RPL       0.62      0.68      0.65        38
kecerdasan Komputasional       1.00      0.50      0.67         2

                accuracy                           0.82       164
               macro avg       0.84      0.69      0.73       164
            weighted avg       0.83      0.82      0.83       164



In [None]:
with open("tree_model.pkl", "wb") as file:
    pickle.dump(clf, file)

## Deteksi Data Baru

### Input Data

In [None]:
new_data = ['Gerak pekerja ada pada game yang memiliki genre RTS (Real-Time Strategy). Gerak pekerja memiliki berbagai macam gerak. Oleh sebab itu dibutuhkan sebuah pendekatan konsep AI  untuk mendesain perilaku pekerja tersebut. Perilaku karakter tersebut harus ditambahi dengan AI (Artifical intelegent) agar perilakunya menjadi lebih hidup dan realistis.Dalam penelitian ini AI yang digunakan adalah Finite State Machine. Finite State Machine digunakan untuk menentukan gerak pekerja terhadap parameter-parameter yang digunakan sebagai dasar gerak yang akan dilakukan . Selanjutnya akan disimulasikan pada game RTS dengan menggunakan game engine.Hasil yang di peroleh dalam penelitian ini adalah penerapan metode Finite State machine untuk menentukan gerak pekerja berdasarkan parameter jumlah harta, prajurit, kondisi bangunan, dan stockpile (jumlah resources yang di bawa). Kata kunci : Game, Real-Time Strategy, Gerak Pekerja, Finite State Machine.']
df_baru = pd.DataFrame({'Abstrak': new_data})
df_baru['Abstrak'] = df_baru['Abstrak'].apply(lambda x: cleaning(x))
df_baru['abstrak_tokens'] = df_baru['Abstrak'].apply(lambda x: word_tokenize(x))
df_baru['abstrak_tokens'] = df_baru['abstrak_tokens'].apply(lambda x: [w for w in x if not w in stop_words])
df_baru['abstrak_tokens'] = df_baru['abstrak_tokens'].progress_apply(lambda x: stemmer.stem(' '.join(x)).split(' '))
df_baru['Abstrak'] = df_baru['abstrak_tokens'].apply(lambda tokens: ' '.join(tokens))
df_baru[["Abstrak", "abstrak_tokens"]].head()

  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,Abstrak,abstrak_tokens
0,gerak kerja game milik genre rts real time str...,"[gerak, kerja, game, milik, genre, rts, real, ..."


### TF-IDF Data baru

In [None]:
X_tfidf_baru = vectorizer.transform((df_baru['Abstrak']))

terms_baru = vectorizer.get_feature_names_out()
df_tfidfvect_baru = pd.DataFrame(data = X_tfidf_baru.toarray(),columns = terms_baru)
df_tfidfvect_baru

Unnamed: 0,aalysis,abad,abadi,abai,abdi,abdullah,ability,abjad,absah,absensi,...,zara,zat,zcz,zf,zona,zone,zoning,zoom,zucara,zungu
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Count Data Baru

In [None]:
X_count_baru = count_vectorizer.transform(np.array(df_baru['Abstrak']))

terms_count = count_vectorizer.get_feature_names_out()
df_countvect_baru = pd.DataFrame(data = X_count_baru.toarray(),columns = terms_count)
df_countvect_baru

Unnamed: 0,aalysis,abad,abadi,abai,abdi,abdullah,ability,abjad,absah,absensi,...,zara,zat,zcz,zf,zona,zone,zoning,zoom,zucara,zungu
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Topic Modeling

In [None]:
w1_baru = lda_model.fit_transform(df_countvect_baru)
h1_baru = lda_model.components_

In [None]:
w1_baru

array([[0.99497487, 0.00251256, 0.00251256]])

In [None]:
colnames = ["Topic" + str(i) for i in range(lda_model.n_components)]
docnames = ["Doc" + str(i) for i in range(len(df_baru['Abstrak']))]
df_doc_topic_baru = pd.DataFrame(np.round(w1_baru,2),columns=colnames,index=docnames)
df_doc_topic_baru

Unnamed: 0,Topic0,Topic1,Topic2
Doc0,0.99,0.0,0.0


### Prediksi Data

#### Data topic modeling

In [None]:
y_pred_baru = decision_tree.predict(df_doc_topic_baru)

y_pred_baru

array(['RPL'], dtype=object)

#### Data TF-IDF

In [None]:
X_tfidf.shape

(818, 6367)

In [None]:
y_pred_baru_2 = decision_tree_2.predict(X_count_baru)

y_pred_baru_2



array(['RPL'], dtype=object)