In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the preprocessed training and test corpora.
df_train = pd.read_csv('../Dataset/preprocessed_dataset.csv')
df_test = pd.read_csv('../Dataset/preprocessed_datatest.csv')

# Combine title and abstract so each document gets a richer representation.
# read_csv loads empty cells as NaN, and `NaN + ' ' + text` is NaN, which
# TfidfVectorizer rejects; a missing title/abstract is therefore treated as
# an empty string before concatenating.
df_train['Combined_Text'] = (
    df_train['Preprocessed_Judul'].fillna('') + ' ' + df_train['Preprocessed_Abstrak'].fillna('')
)
df_test['Combined_Text'] = (
    df_test['Preprocessed_Judul'].fillna('') + ' ' + df_test['Preprocessed_Abstrak'].fillna('')
)

# Initialise the TF-IDF vectorizer.
tfidf_vectorizer = TfidfVectorizer()

# Fit the vocabulary/IDF weights on the training text only, then project the
# test text onto that same vocabulary so both matrices share their columns.
tfidf_train = tfidf_vectorizer.fit_transform(df_train['Combined_Text'])
tfidf_test = tfidf_vectorizer.transform(df_test['Combined_Text'])

print(f"TF-IDF Matrix Data Train berhasil dibuat dengan shape: {tfidf_train.shape}")
print(f"TF-IDF Matrix Data Test berhasil dibuat dengan shape: {tfidf_test.shape}")



TF-IDF Matrix Data Train berhasil dibuat dengan shape: (118, 6130)
TF-IDF Matrix Data Test berhasil dibuat dengan shape: (50, 6130)


In [3]:
# Vocabulary terms of the fitted vectorizer; later cells use this array as the
# column labels of the TF-IDF DataFrame and to name the top terms per document.
features = tfidf_vectorizer.get_feature_names_out()
print(features)

['000000492' '000003964' '00005' ... 'zaman' 'zoho' 'zoning']


In [4]:
# Densify the sparse training matrix for inspection. `toarray()` yields a plain
# ndarray, whereas `todense()` returns the legacy `numpy.matrix` type that
# NumPy no longer recommends.
dense_tfidf = tfidf_train.toarray()

# One row per training document, one column per vocabulary term.
tfidf_df = pd.DataFrame(dense_tfidf, columns=features)
tfidf_df.head()

Unnamed: 0,000000492,000003964,00005,0001,0002801,0006,001060071,001312,0019,0032,...,yet,yield,yielded,yolo,yolov8,yolov8n,you,zaman,zoho,zoning
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Find the top-N terms of a single document.
def get_top_n_words(row, n=10, feature_names=None):
    """Return the highest-weighted terms of one document.

    Args:
        row: Sequence of TF-IDF weights for one document, one entry per
            vocabulary term.
        n: Maximum number of terms to return.
        feature_names: Vocabulary aligned with ``row``. When omitted, the
            notebook-level ``features`` array is used.

    Returns:
        A list of ``(term, weight)`` tuples in descending weight order.
        Terms whose weight is zero do not occur in the document and are left
        out, so fewer than ``n`` tuples are returned for documents with fewer
        than ``n`` weighted terms.
    """
    if feature_names is None:
        feature_names = features  # vocabulary defined at notebook level

    # Flatten so a 1xV matrix row is handled the same as a 1-D array.
    scores = np.asarray(row).ravel()
    sorted_indices = np.argsort(scores)[::-1]  # largest weight first

    return [
        (feature_names[i], scores[i])
        for i in sorted_indices[:n]
        if scores[i] > 0
    ]

# Top 10 terms for every training document (each row of tfidf_df is one document).
top_words_per_doc = [get_top_n_words(row, n=10) for row in tfidf_df.values]

# Titles are fetched by position: the enumerate counter below is a row position
# in the TF-IDF matrix, which equals an index label only when df_train carries
# a default 0..N-1 index.
train_titles = df_train['Judul'].to_numpy()

# Show the result.
for idx, top_words in enumerate(top_words_per_doc):
    print(f"\n📝 Dokumen {idx+1}: {train_titles[idx]}")
    for word, score in top_words:
        print(f"- {word}: {score:.4f}")


📝 Dokumen 1: Klasifikasi Aritmia Sinyal ECG Menggunakan Transformasi Wavelet Dan Analisa Statistik.
- ecg: 0.4170
- jantung: 0.3033
- aritmia: 0.2780
- qrs: 0.2780
- depolarisasi: 0.2780
- atrium: 0.2780
- gelombang: 0.2558
- ventrikel: 0.2558
- rekam: 0.2179
- aktivitas: 0.1958

📝 Dokumen 2: Klasifikasi Jenis Tumor Otak Meningioma, Glioma, Dan Pituitari Berbasis Hybrid Vgg-16 Dan Svm Untuk Diagnosis Praoperasi.
- tumor: 0.5359
- vgg16: 0.2543
- 9667: 0.1773
- otak: 0.1737
- and: 0.1569
- the: 0.1471
- brain: 0.1390
- machine: 0.1330
- of: 0.1088
- learning: 0.1080

📝 Dokumen 3: Pembuatan Sistem Visual Question Answering Berbasis Web Untuk Mendukung Pembelajaran Visual Anak TK Berbahasa Indonesia Menggunakan Deep Learning.
- visual: 0.4156
- question: 0.4033
- answering: 0.4033
- elearning: 0.2474
- bahasa: 0.1467
- the: 0.1413
- 100: 0.1378
- blip: 0.1344
- nllb: 0.1344
- ajar: 0.1341

📝 Dokumen 4: Analisis Prediksi Faktor Intensitas Tegangan Pada Sambungan Tubular Jacket Platform Be

# Implementasi Cosine Similarity

In [6]:
# Pairwise cosine similarity between the TF-IDF vectors: one row per test
# document, one column per training document.
cos_sim = cosine_similarity(tfidf_test, tfidf_train)

# Wrap the array in a DataFrame (default integer row/column labels) for display.
cos_sim_df = pd.DataFrame(cos_sim)

print("Cosine Similarity Matrix berhasil dibuat!")
print(cos_sim_df.head())

Cosine Similarity Matrix berhasil dibuat!
       0         1         2         3         4         5         6    \
0  0.00000  0.178326  0.180257  0.133395  0.116256  0.138175  0.024718   
1  0.00000  0.105875  0.085969  0.066399  0.083455  0.123226  0.021540   
2  0.00234  0.200453  0.109062  0.159960  0.157278  0.202879  0.028573   
3  0.01242  0.152864  0.081826  0.106114  0.096376  0.145333  0.026314   
4  0.06807  0.052982  0.030658  0.044635  0.036265  0.051311  0.009333   

        7         8         9    ...       108       109       110       111  \
0  0.171997  0.096301  0.173092  ...  0.146674  0.116767  0.128420  0.111271   
1  0.083142  0.087068  0.109592  ...  0.099658  0.051530  0.058287  0.081371   
2  0.196146  0.192715  0.141160  ...  0.143936  0.153299  0.142918  0.175469   
3  0.124868  0.167348  0.114891  ...  0.093600  0.084800  0.133490  0.113975   
4  0.050932  0.037496  0.045987  ...  0.048420  0.025689  0.032732  0.046313   

        112       113       114 

# Menentukan dokumen train yang paling mirip untuk setiap dokumen test

In [15]:
# For every test document, record the single most similar training document.
# Rows of cos_sim follow the row order of df_test and its columns the row order
# of df_train, so titles and abstracts are fetched by position; label-based
# `.loc[i]` would only be correct for a default 0..N-1 index.
test_titles = df_test['Judul'].to_numpy()
test_abstracts = df_test['Abstrak'].to_numpy()
train_titles = df_train['Judul'].to_numpy()
train_abstracts = df_train['Abstrak'].to_numpy()

results = []

for i, test_row in enumerate(cos_sim):
    # Column position of the best-scoring training document for this test row.
    top_match_idx = test_row.argmax()
    top_score = test_row[top_match_idx]

    results.append({
        'Test_Index': i,
        'Test_Judul': test_titles[i],
        'Most_Similar_Train_Judul': train_titles[top_match_idx],
        'Test_Abstrak': test_abstracts[i],
        'Most_Similar_Train_Abstrak': train_abstracts[top_match_idx],
        'Similarity_Score': round(top_score, 4)
    })

# One row per test document.
similarity_df = pd.DataFrame(results)

similarity_df

Unnamed: 0,Test_Index,Test_Judul,Most_Similar_Train_Judul,Test_Abstrak,Most_Similar_Train_Abstrak,Similarity_Score
0,0,Implementasi Segmentasi Pakaian Menggunakan Me...,Klasifikasi Tumor Otak Pada Citra MRI Mengguna...,Implementasi Mask R-CNN merupakan pendekatan u...,Tumor otak adalah salah satu penyakit yang pal...,0.3413
1,1,Sistem Deteksi Android Malware Menggunakan Met...,Rancang Bangun Stetoskop Elektronik Berbasis A...,Malware merupakan perangkat lunak jahat yang d...,Penyakit jantung merupakan salah satu penyebab...,0.2124
2,2,Klasifikasi Penyakit Berdasarkan Foto Rontgen ...,Analisis Sentimen Masyarakat Indonesia Mengena...,Pada tahun 2019 seluruh dunia terdampak pandem...,World Health Organization (WHO) mendeklarasi-k...,0.3216
3,3,Analisis Sentimen Berdasar Aspek Review Kuline...,Analisis Sentimen Dan Clustering Pada Pengguna...,Makanan merupakan hal yang selalu dekat dengan...,Tiket.com adalah perusahaan agen untuk pelayan...,0.3009
4,4,PENGENALAN WAJAH MENGGUNAKAN METODE DEEP NEURA...,Identifikasi dan Klasifikasi Tingkat Ketidak...,Metode pengenalan identitas dilakukan dengan m...,Tugas akhir ini membahas tentang pengidentifik...,0.2484
5,5,Evaluasi Pemeliharaan Extruder Machine Dalam R...,Sistem Klasifikasi Kondisi Motor Dust Collecto...,PT. WTR merupakan perusahaan yang bergerak dal...,Perusahaan Pestisida MSI merupakan salah satu ...,0.2518
6,6,Prediksi Data Time Series Multivariat Mengguna...,Model Traffic Forecasting dengan RNN-Based Dee...,Prediksi data deret waktu multivariat banyak d...,Peningkatan konsep smart city yang didorong ol...,0.3301
7,7,Sistem Prediksi Status Stunting dan Severe Stu...,Bayisehatkita: Aplikasi Berbasis Web Untuk Kla...,Stunting merupakan salah satu kondisi kesehata...,Malnutrisi merupakan permasalahan umum yang ma...,0.7604
8,8,Klasifikasi Bahasa Isyarat Indonesia Penyandan...,Deteksi Dini Financial Distress Pada Perusahaa...,Bahasa merupakan sebuah media untuk berkomunik...,Kondisi ekonomi dan geopolitik di Indonesia di...,0.1727
9,9,Kontrol Pergerakan Kursi Roda Berbasis Head Ge...,Pengembangan Kursi Roda Otonom Berbasis YOLOV8...,Kuadriplegia atau tetraplegia merupakan kelump...,Pengembangan kursi roda otonom telah menjadi s...,0.3735


In [16]:
# Persist the best-match table to the current working directory, without the
# DataFrame index. The 'utf-8-sig' codec writes a UTF-8 byte-order mark at the
# start of the file (presumably so spreadsheet tools detect the encoding).
similarity_df.to_csv('hasil_kemiripan_test_vs_train_dengan_abstrak.csv', index=False, encoding='utf-8-sig')

# Evaluasi Model

In [10]:
from sklearn.metrics import confusion_matrix, classification_report

In [11]:
# Decision threshold on the best-match cosine similarity: a test document whose
# most similar training document scores at least this value is predicted as 1.
SIMILARITY_THRESHOLD = 0.3

# Highest similarity of each test document against all training documents.
max_cos_sim = np.max(cos_sim, axis=1)

# Ground truth comes from the 'Label' column of the test CSV; this line raises
# KeyError when the loaded file does not contain that column.
y_true = df_test['Label']
y_pred = (max_cos_sim >= SIMILARITY_THRESHOLD).astype(int)

# Pin the label set so the matrix is always 2x2 and the report always lists
# both classes, even if the predictions happen to contain a single class.
print(confusion_matrix(y_true, y_pred, labels=[0, 1]))
print(classification_report(y_true, y_pred, labels=[0, 1]))

KeyError: 'Label'

In [43]:
# Attach the best-match score and the thresholded prediction to the test frame.
# NOTE: this mutates df_test in place and relies on max_cos_sim / y_pred from
# the evaluation cell having been run first.
df_test['Similarity_Score'] = max_cos_sim
df_test['Prediksi'] = y_pred

df_test.head()

Unnamed: 0,Judul,Abstrak,Preprocessed_Judul,Preprocessed_Abstrak,Label,Combined_Text,Similarity_Score,Prediksi
0,Deteksi Malware pada Sistem Operasi Android me...,Malware merupakan perangkat lunak berbahaya ya...,deteksi malware sistem operasi android arsitek...,malware perangkat lunak bahaya ganggu sistem a...,1,deteksi malware sistem operasi android arsitek...,0.721305,1
1,Klasifikasi Aplikasi Android Berbasis Analisis...,Penggunaan sistem operasi Android yang terbuka...,klasifikasi aplikasi android bas analisis perm...,guna sistem operasi android buka sebab tingkat...,1,klasifikasi aplikasi android bas analisis perm...,0.289569,0
2,Analisis Deteksi Intrusi pada Jaringan Kompute...,Sistem deteksi intrusi (IDS) penting untuk mel...,analisis deteksi intrusi jaring komputer recur...,sistem deteksi intrusi ids lindung jaring akse...,0,analisis deteksi intrusi jaring komputer recur...,0.134526,0
3,Analisis Kualitas Air Sungai Berdasarkan Kadar...,Penelitian ini bertujuan untuk mengevaluasi ku...,analisis kualitas air sungai dasar kadar tds p...,teliti tuju evaluasi kualitas air sungai kawas...,0,analisis kualitas air sungai dasar kadar tds p...,0.13436,0
4,Rancang Bangun Aplikasi Manajemen Tugas Mahasi...,Proses pengelolaan tugas kuliah seringkali tid...,rancang bangun aplikasi manajemen tugas mahasi...,proses kelola tugas kuliah seringkali organisi...,0,rancang bangun aplikasi manajemen tugas mahasi...,0.12052,0
