# **Cleaning**

In [94]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [95]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem so the CSV below can be read.
drive.mount('/content/drive')

# Load the dataset stored on Google Drive
file_path = '/content/drive/MyDrive/Colab/Ulasan_Coretax.csv'
df = pd.read_csv(file_path)
# Preview the first rows of the loaded data.
df.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,created_at,full_text,username
0,Thu Jan 30 22:08:37 +0000 2025,Kok bisa yah anggaran 1 2T cuma bisa bikin web...,velvsx
1,Thu Jan 30 13:31:18 +0000 2025,Baru muji coretax lancar abis itu gak bisa bua...,labakotor
2,Wed Jan 29 01:57:34 +0000 2025,Coretax itu mirip sama Tugas Akhir mahasiswa S...,HebatPaijo
3,Sat Jan 25 04:53:18 +0000 2025,terpantau hari ini coretax lancar jaya @kring_...,na7ahachikyu
4,Fri Jan 24 03:39:20 +0000 2025,coretax gak jelas. ngehambat semua orang buat ...,saturncz


In [96]:
# Inspect the data: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18528 entries, 0 to 18527
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   created_at  18528 non-null  object
 1   full_text   18528 non-null  object
 2   username    17520 non-null  object
dtypes: object(3)
memory usage: 434.4+ KB


In [97]:
# Count rows that are exact duplicates of an earlier row
df.duplicated().sum()

np.int64(3)

In [98]:
# Timestamp layout of the raw `created_at` values,
# e.g. "Thu Jan 30 22:08:37 +0000 2025".
TWITTER_DATE_FORMAT = '%a %b %d %H:%M:%S %z %Y'

df = (
    df
    # 1. Drop rows whose username is missing.
    .dropna(subset=['username'])
    # 2. Drop rows posted by the 'kring_pajak' account (case-insensitive).
    .loc[lambda d: d['username'].str.lower() != 'kring_pajak']
    # 3. Drop exact duplicate rows.
    .drop_duplicates()
    # 4. Parse created_at with an explicit format instead of per-row
    #    guessing; values that do not match become NaT.
    .assign(created_at=lambda d: pd.to_datetime(
        d['created_at'], format=TWITTER_DATE_FORMAT, errors='coerce'))
    # 5. Drop rows whose created_at could not be parsed (NaT).
    .dropna(subset=['created_at'])
    # 6. Render the date as 'dd Mon yyyy' text.
    .assign(created_at=lambda d: d['created_at'].dt.strftime('%d %b %Y'))
    # 7. Rename columns for consistency ('username' keeps its name).
    .rename(columns={'created_at': 'tanggal', 'full_text': 'ulasan'})
    # 8. Reset the index last, after every row-dropping step, so the
    #    result always has a gap-free 0..n-1 index.
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,tanggal,ulasan,username
0,30 Jan 2025,Kok bisa yah anggaran 1 2T cuma bisa bikin web...,velvsx
1,30 Jan 2025,Baru muji coretax lancar abis itu gak bisa bua...,labakotor
2,29 Jan 2025,Coretax itu mirip sama Tugas Akhir mahasiswa S...,HebatPaijo
3,25 Jan 2025,terpantau hari ini coretax lancar jaya @kring_...,na7ahachikyu
4,24 Jan 2025,coretax gak jelas. ngehambat semua orang buat ...,saturncz


In [99]:
import pandas as pd
import re
import string

def clean_text(text):
    """Normalize a raw review into lowercase, letters-only text.

    Steps, in order: lowercase; remove URLs, @mentions and #hashtags;
    turn '.', '/', ',' and '-' into spaces so the words they separate
    stay apart; remove digits, non-ASCII characters (emoji included)
    and all remaining punctuation; collapse runs of whitespace.

    Args:
        text: The raw review. Any value that is not a ``str``
            (e.g. None or NaN) is treated as missing.

    Returns:
        The cleaned string, or '' when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ''

    # Convert to lowercase
    text = text.lower()

    # Remove URLs/links first, while their '.', '/' and '-' are still
    # intact; otherwise pieces of the URL would survive as stray words.
    text = re.sub(r'http\S+|www\S+', '', text)

    # Remove mentions
    text = re.sub(r'@\w+', '', text)

    # Remove hashtags
    text = re.sub(r'#\w+', '', text)

    # Replace separator symbols with a space (not an empty string), so
    # "pajak/coretax" becomes "pajak coretax" rather than "pajakcoretax".
    text = re.sub(r'[./,\-]', ' ', text)

    # Remove digits
    text = re.sub(r'\d+', '', text)

    # Remove emoji and other non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]+', '', text)

    # Remove everything that is not a letter, digit or whitespace; this
    # already covers every ASCII punctuation character.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # Collapse repeated whitespace and trim the ends
    return re.sub(r'\s+', ' ', text).strip()

# Apply the cleaner to the review column: fill missing reviews with an
# empty string first, then store the cleaned text in a new column.
df['ulasan'] = df['ulasan'].fillna('')
df['cleaning'] = df['ulasan'].map(clean_text)
df.head()

Unnamed: 0,tanggal,ulasan,username,cleaning
0,30 Jan 2025,Kok bisa yah anggaran 1 2T cuma bisa bikin web...,velvsx,kok bisa yah anggaran t cuma bisa bikin web se...
1,30 Jan 2025,Baru muji coretax lancar abis itu gak bisa bua...,labakotor,baru muji coretax lancar abis itu gak bisa bua...
2,29 Jan 2025,Coretax itu mirip sama Tugas Akhir mahasiswa S...,HebatPaijo,coretax itu mirip sama tugas akhir mahasiswa s...
3,25 Jan 2025,terpantau hari ini coretax lancar jaya @kring_...,na7ahachikyu,terpantau hari ini coretax lancar jaya moga ha...
4,24 Jan 2025,coretax gak jelas. ngehambat semua orang buat ...,saturncz,coretax gak jelas ngehambat semua orang buat k...


In [100]:
# Inspect the data again after cleaning: columns, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13413 entries, 0 to 13412
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   tanggal   13413 non-null  object
 1   ulasan    13413 non-null  object
 2   username  13413 non-null  object
 3   cleaning  13413 non-null  object
dtypes: object(4)
memory usage: 419.3+ KB


In [103]:
from google.colab import drive
# Make sure Google Drive is mounted before writing to it.
drive.mount('/content/drive')

# Save to a folder on Google Drive (the DataFrame index is not written)
df.to_csv('/content/drive/MyDrive/Colab/(1)Hasil_Cleaning.csv', index=False)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
