In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
df_train = pd.read_csv('preprocessed_dataset.csv')
df_test = pd.read_csv('preprocessed_datatest.csv')

# Inisialisasi TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

#Transformasi teks ke bentuk vektor TF-IDF
tfidf_train = tfidf_vectorizer.fit_transform(df_train['Judul'])
tfidf_test = tfidf_vectorizer.transform(df_test['Judul'])

print(f"TF-IDF Matrix Data Train berhasil dibuat dengan shape: {tfidf_train.shape}")
print(f"TF-IDF Matrix Data Test berhasil dibuat dengan shape: {tfidf_test.shape}")



TF-IDF Matrix Data Train berhasil dibuat dengan shape: (118, 827)
TF-IDF Matrix Data Test berhasil dibuat dengan shape: (50, 827)


In [3]:
features = tfidf_vectorizer.get_feature_names_out()
print(features)

['16' '19' '2018' '2022' '2d' 'active' 'aerial' 'air' 'ais' 'aktif'
 'aktivasi' 'akurasi' 'akut' 'al' 'alam' 'alat' 'algorithm' 'algoritma'
 'anak' 'analisa' 'analisis' 'analitik' 'analysis' 'and' 'android' 'ann'
 'annealing' 'anomali' 'answering' 'ant' 'api' 'aplikasi' 'approach'
 'arabic' 'aritmia' 'arnold' 'arsitektur' 'artefak' 'artificial' 'arus'
 'ash' 'aspect' 'assisted' 'asymmetry' 'atribut' 'attack' 'attention'
 'aud' 'augmentasi' 'australia' 'autisme' 'autonomous' 'availability'
 'awan' 'aws' 'axis' 'b2b' 'backpropagation' 'bagian' 'bahan' 'baku'
 'balik' 'bangun' 'bank' 'barat' 'based' 'baterai' 'bawah' 'bayes'
 'bayisehatkita' 'belajar' 'benefit' 'berat' 'berbahasa' 'berbasis'
 'berdarah' 'berdasaran' 'berdasarkan' 'berita' 'bersifat' 'bert' 'besar'
 'bi' 'biasa' 'biaya' 'biomarker' 'bisnis' 'bittern' 'blade' 'block'
 'blocplan' 'body' 'boosting' 'bottom' 'boundary' 'briggs' 'bubble' 'bumi'
 'bursa' 'cabai' 'cadang' 'cair' 'captioning' 'case' 'cba' 'cbam'
 'centralized' 'ch

In [4]:
dense_tfidf = tfidf_train.todense()

tfidf_df = pd.DataFrame(dense_tfidf, columns=features)
tfidf_df.head()

Unnamed: 0,16,19,2018,2022,2d,active,aerial,air,ais,aktif,...,wi,window,with,wolf,xgboost,xyz,yang,yolo,yolov8,zoning
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.294365,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Fungsi untuk cari Top-N kata dari satu dokumen
def get_top_n_words(row, n=10):
    sorted_indices = np.argsort(row)[::-1]  # Urutkan dari terbesar
    top_features = [(features[i], row[i]) for i in sorted_indices[:n]]
    return top_features

# Ambil Top 10 kata per dokumen
top_words_per_doc = []

for i, row in enumerate(tfidf_df.values):
    top_words = get_top_n_words(row, n=10)
    top_words_per_doc.append(top_words)

# Tampilkan hasil
for idx, top_words in enumerate(top_words_per_doc):
    print(f"\n📝 Dokumen {idx+1}: {df_train.loc[idx, 'Judul']}")
    for word, score in top_words:
        print(f"- {word}: {score:.4f}")


📝 Dokumen 1: Klasifikasi Aritmia Sinyal ECG Menggunakan Transformasi Wavelet Dan Analisa Statistik.
- statistik: 0.3769
- aritmia: 0.3769
- analisa: 0.3769
- ecg: 0.3769
- wavelet: 0.3469
- sinyal: 0.3469
- transformasi: 0.3469
- klasifikasi: 0.1898
- dan: 0.1369
- menggunakan: 0.1261

📝 Dokumen 2: Klasifikasi Jenis Tumor Otak Meningioma, Glioma, Dan Pituitari Berbasis Hybrid Vgg-16 Dan Svm Untuk Diagnosis Praoperasi.
- 16: 0.2944
- praoperasi: 0.2944
- vgg: 0.2944
- pituitari: 0.2944
- meningioma: 0.2944
- glioma: 0.2709
- jenis: 0.2709
- hybrid: 0.2709
- diagnosis: 0.2709
- otak: 0.2542

📝 Dokumen 3: Pembuatan Sistem Visual Question Answering Berbasis Web Untuk Mendukung Pembelajaran Visual Anak TK Berbahasa Indonesia Menggunakan Deep Learning.
- visual: 0.5292
- answering: 0.2646
- question: 0.2646
- tk: 0.2646
- pembelajaran: 0.2646
- mendukung: 0.2435
- pembuatan: 0.2435
- web: 0.2435
- berbahasa: 0.2435
- anak: 0.2169

📝 Dokumen 4: Analisis Prediksi Faktor Intensitas Tegangan Pa

# Implementasi Cosine Similarity

In [6]:
cos_sim = cosine_similarity(tfidf_test, tfidf_train)

cos_sim_df = pd.DataFrame(cos_sim)

print("Cosine Similarity Matrix berhasil dibuat!")
print(cos_sim_df.head())

Cosine Similarity Matrix berhasil dibuat!
        0         1         2         3         4         5         6    \
0  0.026942  0.000000  0.018912  0.000000  0.000000  0.027162  0.000000   
1  0.018957  0.000000  0.162258  0.000000  0.000000  0.133677  0.000000   
2  0.084182  0.045598  0.018104  0.000000  0.000000  0.026002  0.000000   
3  0.083662  0.128483  0.036878  0.054708  0.026689  0.018595  0.025476   
4  0.157800  0.016780  0.038869  0.000000  0.000000  0.009190  0.000000   

        7         8         9    ...       108       109       110       111  \
0  0.091968  0.025605  0.096974  ...  0.000000  0.026113  0.020650  0.022235   
1  0.000000  0.018016  0.040299  ...  0.083083  0.018373  0.014530  0.062535   
2  0.000000  0.311544  0.000000  ...  0.076908  0.024997  0.019768  0.190763   
3  0.022932  0.179394  0.087163  ...  0.071803  0.017877  0.064123  0.101961   
4  0.021877  0.018873  0.012626  ...  0.007695  0.008835  0.015221  0.007523   

        112       113     

# Fungsi untuk menentukan tingkat kemiripan

In [9]:
results = []

for i, test_row in enumerate(cos_sim):
    top_match_idx = test_row.argmax()
    top_score = test_row[top_match_idx]

    results.append({
        'Test_Index': i,
        'Test_Judul': df_test.loc[i, 'Judul'],
        'Most_Similar_Train_Judul': df_train.loc[top_match_idx, 'Judul'],
        'Similarity_Score': round(top_score, 4)
    })

similarity_df = pd.DataFrame(results)

similarity_df

Unnamed: 0,Test_Index,Test_Judul,Most_Similar_Train_Judul,Similarity_Score
0,0,Implementasi Segmentasi Pakaian Menggunakan Me...,Klasifikasi Tumor Otak Pada Citra MRI Mengguna...,0.2251
1,1,Sistem Deteksi Android Malware Menggunakan Met...,Identifikasi Diabetes Melitus Berdasarkan Biom...,0.2104
2,2,Klasifikasi Penyakit Berdasarkan Foto Rontgen ...,DETEKSI PENYAKIT CABAI MERAH BESAR BERDASARKAN...,0.4951
3,3,Analisis Sentimen Berdasar Aspek Review Kuline...,Analisis Sentimen Dan Clustering Pada Pengguna...,0.3559
4,4,PENGENALAN WAJAH MENGGUNAKAN METODE DEEP NEURA...,Aplikasi Discrete Wavelets Transform pada Anal...,0.336
5,5,Evaluasi Pemeliharaan Extruder Machine Dalam R...,Trajectory Tracking Pada Mobil Autonomous Meng...,0.2318
6,6,Prediksi Data Time Series Multivariat Mengguna...,Multi-Objective Vehicle Routing Problem with T...,0.2474
7,7,Sistem Prediksi Status Stunting dan Severe Stu...,Bayisehatkita: Aplikasi Berbasis Web Untuk Kla...,0.3465
8,8,Klasifikasi Bahasa Isyarat Indonesia Penyandan...,Klasifikasi Gerakan Pencak Silat Menggunakan C...,0.2666
9,9,Kontrol Pergerakan Kursi Roda Berbasis Head Ge...,Pengembangan Kursi Roda Otonom Berbasis YOLOV8...,0.4164


In [16]:
similarity_df.to_csv('hasil_kemiripan_test_vs_train_dengan_abstrak.csv', index=False, encoding='utf-8-sig')

# Evaluasi Model

In [10]:
from sklearn.metrics import confusion_matrix, classification_report

In [11]:
max_cos_sim = np.max(cos_sim, axis=1)

y_true = df_test['Label']
y_pred = (max_cos_sim >= 0.3).astype(int)

print(confusion_matrix(y_true, y_pred))
print(classification_report(y_true, y_pred))

KeyError: 'Label'

In [43]:
df_test['Similarity_Score'] = max_cos_sim
df_test['Prediksi'] = y_pred

df_test.head()

Unnamed: 0,Judul,Abstrak,Preprocessed_Judul,Preprocessed_Abstrak,Label,Combined_Text,Similarity_Score,Prediksi
0,Deteksi Malware pada Sistem Operasi Android me...,Malware merupakan perangkat lunak berbahaya ya...,deteksi malware sistem operasi android arsitek...,malware perangkat lunak bahaya ganggu sistem a...,1,deteksi malware sistem operasi android arsitek...,0.721305,1
1,Klasifikasi Aplikasi Android Berbasis Analisis...,Penggunaan sistem operasi Android yang terbuka...,klasifikasi aplikasi android bas analisis perm...,guna sistem operasi android buka sebab tingkat...,1,klasifikasi aplikasi android bas analisis perm...,0.289569,0
2,Analisis Deteksi Intrusi pada Jaringan Kompute...,Sistem deteksi intrusi (IDS) penting untuk mel...,analisis deteksi intrusi jaring komputer recur...,sistem deteksi intrusi ids lindung jaring akse...,0,analisis deteksi intrusi jaring komputer recur...,0.134526,0
3,Analisis Kualitas Air Sungai Berdasarkan Kadar...,Penelitian ini bertujuan untuk mengevaluasi ku...,analisis kualitas air sungai dasar kadar tds p...,teliti tuju evaluasi kualitas air sungai kawas...,0,analisis kualitas air sungai dasar kadar tds p...,0.13436,0
4,Rancang Bangun Aplikasi Manajemen Tugas Mahasi...,Proses pengelolaan tugas kuliah seringkali tid...,rancang bangun aplikasi manajemen tugas mahasi...,proses kelola tugas kuliah seringkali organisi...,0,rancang bangun aplikasi manajemen tugas mahasi...,0.12052,0
