In [33]:
import pandas as pd

# Memuat dataset
file_name = 'Daily Transactions.csv'  # Change this to the correct file name if different
data = pd.read_csv(file_name)
# Memeriksa informasi dataset
print(data.info())
print(data.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2461 entries, 0 to 2460
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Date            2461 non-null   object 
 1   Mode            2461 non-null   object 
 2   Category        2461 non-null   object 
 3   Subcategory     1826 non-null   object 
 4   Note            1940 non-null   object 
 5   Amount          2461 non-null   float64
 6   Income/Expense  2461 non-null   object 
 7   Currency        2461 non-null   object 
dtypes: float64(1), object(7)
memory usage: 153.9+ KB
None
                  Date                   Mode        Category  \
0  20/09/2018 12:04:08                   Cash  Transportation   
1  20/09/2018 12:03:15                   Cash            Food   
2           19/09/2018  Saving Bank account 1    subscription   
3  17/09/2018 23:41:17  Saving Bank account 1    subscription   
4  16/09/2018 17:15:08                   Cash       Festival

In [46]:
# Fungsi untuk mengkategorikan transaksi
def categorize_transaction(category, amount):
    primary_keywords = ['food', 'family', 'household', 'health', 'self-development', 'education', 'rent']
    secondary_keywords = ['transportation', 'funding', 'life insurance', 'beauty', 'maid', 'money transfer', 'recurring deposit', 'tourism', 'investment']
    tertiary_keywords = ['subscription', 'festivals', 'apparel', 'gift', 'culture', 'other']

    category = category.lower()
    if any(keyword in category for keyword in primary_keywords):
        return 'Primary'
    elif any(keyword in category for keyword in secondary_keywords):
        return 'Secondary'
    elif any(keyword in category for keyword in tertiary_keywords):
        return 'Tertiary'
    else:
        # Fallback category
        return 'Secondary' if amount < 100 else 'Tertiary'

# Menerapkan fungsi kategorisasi
data['Category_Type'] = data.apply(lambda row: categorize_transaction(row['Category'], row['Amount']), axis=1)

# Menghitung persentase setiap kategori
category_counts = data['Category_Type'].value_counts(normalize=True) * 100
print(category_counts)


Category_Type
Primary      51.686306
Secondary    24.380333
Tertiary     23.933360
Name: proportion, dtype: float64


In [47]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Mempersiapkan data
categories = data['Category'].values
category_types = data['Category_Type'].values

# Tokenisasi kategori
tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(categories)
sequences = tokenizer.texts_to_sequences(categories)
padded_sequences = pad_sequences(sequences, maxlen=100)

# Encode kategori tipe
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(category_types)

# Membagi data menjadi set pelatihan dan pengujian
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, encoded_labels, test_size=0.2, random_state=42)

# Membangun model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=1000, output_dim=64, input_length=100),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(3, activation='softmax')
])

# Mengompilasi model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Melatih model
history = model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test))

# Mengevaluasi model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy*100:.2f}%")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 83.37%
