# Tugas 2 PPW : Pre Processing Data PTA Trunojoyo

In [1]:
import numpy as np
import pandas as pd

## Import Data

In [70]:
df = pd.read_csv('https://raw.githubusercontent.com/wahyuarilsaputra/dataset/main/DataPTAInformatika.csv')
df.head()

Unnamed: 0,Judul,Nama Penulis,Pembimbing I,Pembimbing II,Abstrak,Prodi
0,Gerak Pekerja Pada Game Real Time Strategy Men...,Adi Chandra Laksono,"Kurniawan Eka P, S.Kom., Msc","Arik Kurniawati, S.Kom., M.T.",Gerak pekerja ada pada game yang memiliki genr...,Jurusan Teknik Informatika
1,RANCANG BANGUN GAME PERAWATAN SAPI KARAPAN MEN...,NURRACHMAT,"Arik Kurniawati, S.Kom., M.T.","Kurniawan Eka Permana, S.Kom., MSc.","Perkembangan game yang semakin pesat, memberik...",Jurusan Teknik Informatika
2,EKSTRAKSI FITUR BERBASIS TWO DIMENSIONAL LINEA...,Muhammad Choirur Rozi,"Dr. Arif Muntasa, S.Si.,M.T","Fitri Damayanti, S.Kom.,M.kom",Sistem pengenalan wajah adalah suatu sistem un...,Jurusan Teknik Informatika
3,IMPLEMENTASI ALGORITMA PRIM DAN DEPTH FIRST ...,M Khoiril Anwar,"Cucun Very Angkoso, S.T., M.T.","Arik Kurniawati S. Kom., M.T.",Teknologi mobile game beroperating system open...,Jurusan Teknik Informatika
4,Perancangan Sistem Informasi Badan Kepegawaian...,MALIKUL HAMZAH,"Moch. Kautsar Sophan, S.Kom., M.MT.","Yeni Kustiyaningsih, S.Kom., M.Kom.",Kantor Badan Kepegawaian kota Bangkalan adalah...,Jurusan Teknik Informatika


## Pre Processing Data

### Cek Data yang Kosong

In [71]:
df.isnull().sum()

Judul            0
Nama Penulis     0
Pembimbing I     0
Pembimbing II    0
Abstrak          5
Prodi            0
dtype: int64

### Menghapus Data yang Kosong

In [72]:
df.dropna(inplace=True)

### Cleaning Data

#### Membuat Fungsi Cleaning Data
- Tag HTML
- LowerCase Data
- Spasi pada teks
- Tanda baca dan karakter spesial
- Nomor
- Komponen Lainnya

In [85]:
import re, string

# Text Cleaning
def cleaning(text):
    # Menghapus tag HTML
    text = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});').sub('', str(text))

    # Mengubah seluruh teks menjadi huruf kecil
    text = text.lower()

    # Menghapus spasi pada teks
    text = text.strip()

    # Menghapus Tanda Baca, karakter spesial, and spasi ganda
    text = re.compile('<.*?>').sub('', text)
    text = re.compile('[%s]' % re.escape(string.punctuation)).sub(' ', text)
    text = re.sub('\s+', ' ', text)
    text = re.sub(r'\n', ' ', text)
    text = re.sub("â", "", text)

    # Menghapus Nomor
    text = re.sub(r'\[[0-9]*\]', ' ', text)
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
    text = re.sub(r'\d', ' ', text)
    text = re.sub(r'\s+', ' ', text)

    # Mengubah text yang berisi 'nan' dengan whitespace agar nantinya dapat dihapus
    text = re.sub('nan', '', text)

    return text

#### Implementasi Fungsi Pada Data Frame Abstrak

In [74]:
df['Abstrak'] = df['Abstrak'].apply(lambda x: cleaning(x))

df.head()

Unnamed: 0,Judul,Nama Penulis,Pembimbing I,Pembimbing II,Abstrak,Prodi
0,Gerak Pekerja Pada Game Real Time Strategy Men...,Adi Chandra Laksono,"Kurniawan Eka P, S.Kom., Msc","Arik Kurniawati, S.Kom., M.T.",gerak pekerja ada pada game yang memiliki genr...,Jurusan Teknik Informatika
1,RANCANG BANGUN GAME PERAWATAN SAPI KARAPAN MEN...,NURRACHMAT,"Arik Kurniawati, S.Kom., M.T.","Kurniawan Eka Permana, S.Kom., MSc.",perkembangan game yang semakin pesat memberika...,Jurusan Teknik Informatika
2,EKSTRAKSI FITUR BERBASIS TWO DIMENSIONAL LINEA...,Muhammad Choirur Rozi,"Dr. Arif Muntasa, S.Si.,M.T","Fitri Damayanti, S.Kom.,M.kom",sistem pengenalan wajah adalah suatu sistem un...,Jurusan Teknik Informatika
3,IMPLEMENTASI ALGORITMA PRIM DAN DEPTH FIRST ...,M Khoiril Anwar,"Cucun Very Angkoso, S.T., M.T.","Arik Kurniawati S. Kom., M.T.",teknologi mobile game beroperating system open...,Jurusan Teknik Informatika
4,Perancangan Sistem Informasi Badan Kepegawaian...,MALIKUL HAMZAH,"Moch. Kautsar Sophan, S.Kom., M.MT.","Yeni Kustiyaningsih, S.Kom., M.Kom.",kantor badan kepegawaian kota bangkalan adalah...,Jurusan Teknik Informatika


### Tokenisasi Data
Memisahkan sebuah Dokumen menjadi susunan per kata / term

#### Import Library NLTK

In [86]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('popular')

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package movie_reviews is already up-to-date!
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Package names is already up-to-date!
[nltk_data]    | Do

True

#### Implementasi Library pada Data

In [76]:
df['abstrak_tokens'] = df['Abstrak'].apply(lambda x: word_tokenize(x))
df[["Abstrak", "abstrak_tokens"]].head()

Unnamed: 0,Abstrak,abstrak_tokens
0,gerak pekerja ada pada game yang memiliki genr...,"[gerak, pekerja, ada, pada, game, yang, memili..."
1,perkembangan game yang semakin pesat memberika...,"[perkembangan, game, yang, semakin, pesat, mem..."
2,sistem pengenalan wajah adalah suatu sistem un...,"[sistem, pengenalan, wajah, adalah, suatu, sis..."
3,teknologi mobile game beroperating system open...,"[teknologi, mobile, game, beroperating, system..."
4,kantor badan kepegawaian kota bangkalan adalah...,"[kantor, badan, kepegawaian, kota, bangkalan, ..."


### Stopword Data
Mengubah isi dokumen sesuai dengan kamus data

#### Import Library NLTK

In [77]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

#### Implementasi Library pada Data

In [78]:
from nltk.corpus import stopwords
from itertools import chain

stop_words = set(chain(stopwords.words('indonesian'), stopwords.words('english')))

df['abstrak_tokens'] = df['abstrak_tokens'].apply(lambda x: [w for w in x if not w in stop_words])

In [79]:
df[["Abstrak", "abstrak_tokens"]].head()

Unnamed: 0,Abstrak,abstrak_tokens
0,gerak pekerja ada pada game yang memiliki genr...,"[gerak, pekerja, game, memiliki, genre, rts, r..."
1,perkembangan game yang semakin pesat memberika...,"[perkembangan, game, pesat, alternative, pemin..."
2,sistem pengenalan wajah adalah suatu sistem un...,"[sistem, pengenalan, wajah, sistem, mengenali,..."
3,teknologi mobile game beroperating system open...,"[teknologi, mobile, game, beroperating, system..."
4,kantor badan kepegawaian kota bangkalan adalah...,"[kantor, badan, kepegawaian, kota, bangkalan, ..."


### Steaming Data
Mengubah kata menjadi bentuk dasar

#### Import Library Sastrawi

In [87]:
!pip install sastrawi

Collecting sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/209.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m204.8/209.7 kB[0m [31m5.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sastrawi
Successfully installed sastrawi-1.0.1


#### Implementasi Library pada Data

In [88]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from tqdm.auto import tqdm
tqdm.pandas()

factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [89]:
df['abstrak_tokens'] = df['abstrak_tokens'].progress_apply(lambda x: stemmer.stem(' '.join(x)).split(' '))

  0%|          | 0/140 [00:00<?, ?it/s]

## Ekstraksi Fitur

### One Hot Encoding

In [90]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
X_encoder = encoder.fit_transform(df[['Abstrak']])

encoded_features = encoder.get_feature_names_out(input_features=["Abstrak"])
one_hot_df = pd.DataFrame(X_encoder.toarray(), columns=encoded_features)
print(one_hot_df)

     Abstrak_akurasi adalah faktor terpenting yang harus dipertimbangkan dalam memilih metode klasifikasi data semakin tinggi akurasi suatu metode klasifikasi data semakin layak metode tersebut diterapkan pada kasus diagnosis penyakit liver metode k nearest neighbor knn menunjukkan akurasi yang tinggi di sisi lain beberapa penelitian menunjukkan bahwa penerapan diskretisasi berbasis entropy sebagai metode preprocessing mampu meningkatkan akurasi klasifikasi data kareya dibangun sebuah hipotesis bahwa penerapan diskretisasi berbasis entropy juga mampu meningkatkan akurasi klasifikasi data pada kasus diagnosis penyakit liver dalam penelitian ini diskretisasi berbasis entropy diterapkan sebagai metode preprocessing sedangkan knn digunakan sebagai metode klasifikasi klasifikasi dilakukan terhadap data set awal dan data set yang telah melalui proses diskretisasi berbasis entropy kemudian akurasi yang dihasilkan oleh klasifikasi terhadap data set awal dibandingkan dengan akurasi yang dihasil

In [81]:
one_hot_df.to_csv('Data_OneHotEncoder.csv', index=False)

### TF-IDF

In [91]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(df['Abstrak'].tolist())

terms = vectorizer.get_feature_names_out()
df_tfidfvect = pd.DataFrame(data = X_tfidf.toarray(),columns = terms)
df_tfidfvect

Unnamed: 0,aam,ability,abjad,absolute,abstract,abstrak,acak,acap,accuracy,active,...,yakersuda,yakni,yale,yan,yang,yangcukup,yangmemperoleh,yangperlu,yangtelah,yogyakarta
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.084231,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.051402,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.040454,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.016879,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.039359,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.061890,0.0,0.0,0.0,0.0,0.0
136,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.147951,0.0,0.0,0.0,0.0,0.0
137,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.171720,0.0,0.0,0.0,0.0,0.0
138,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.076239,0.0,0.063774,0.0,0.0,0.0,0.0,0.0


In [83]:
df_tfidfvect.to_csv('Data_TF-IDF.csv', index=False)

### Term Frekuensi

In [98]:
from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer()
X_count = count_vectorizer.fit_transform(df['Abstrak'].tolist())

terms_count = count_vectorizer.get_feature_names_out()
df_countvect = pd.DataFrame(data = X_count.toarray(),columns = terms_count)
df_countvect

Unnamed: 0,aam,ability,abjad,absolute,abstract,abstrak,acak,acap,accuracy,active,...,yakersuda,yakni,yale,yan,yang,yangcukup,yangmemperoleh,yangperlu,yangtelah,yogyakarta
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,6,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,4,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,3,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,3,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,5,0,0,0,0,0
136,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,10,0,0,0,0,0
137,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,11,0,0,0,0,0
138,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,4,0,0,0,0,0


In [99]:
token_counts = df_countvect.sum(axis=0)

non_zero_token_counts = token_counts[token_counts != 0]

print("Token Counts yang Tidak Sama dengan 0:")
print(non_zero_token_counts)

Token Counts yang Tidak Sama dengan 0:
aam               2
ability           1
abjad             2
absolute          4
abstract          1
                 ..
yangcukup         1
yangmemperoleh    1
yangperlu         1
yangtelah         1
yogyakarta        3
Length: 3423, dtype: int64


In [100]:
df_countvect.to_csv('Data_CountVectorize.csv', index=False)

### Log Frekuensi

In [94]:
from sklearn.feature_extraction.text import TfidfVectorizer
log_vectorizer = TfidfVectorizer(use_idf=False, smooth_idf=False, norm=None)
X_log = log_vectorizer.fit_transform(df['Abstrak'].tolist())
# X_log = log_vectorizer.fit_transform(df['Abstrak'])

log_terms = log_vectorizer.get_feature_names_out()
df_log = pd.DataFrame(data = X_log.toarray(),columns = log_terms)
df_log

Unnamed: 0,aam,ability,abjad,absolute,abstract,abstrak,acak,acap,accuracy,active,...,yakersuda,yakni,yale,yan,yang,yangcukup,yangmemperoleh,yangperlu,yangtelah,yogyakarta
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
136,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0
137,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,11.0,0.0,0.0,0.0,0.0,0.0
138,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0


In [96]:
token_counts = df_log.sum(axis=0)

non_zero_token_counts = token_counts[token_counts != 0]

print("Token Counts yang Tidak Sama dengan 0:")
print(non_zero_token_counts)

Token Counts yang Tidak Sama dengan 0:
aam               2.0
ability           1.0
abjad             2.0
absolute          4.0
abstract          1.0
                 ... 
yangcukup         1.0
yangmemperoleh    1.0
yangperlu         1.0
yangtelah         1.0
yogyakarta        3.0
Length: 3423, dtype: float64


In [97]:
df_log.to_csv('Data_LogFrekuensi.csv', index=False)