In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib

In [2]:
transactions = pd.read_csv('transactions_duplicate_rows.csv')
transactions.head()

Unnamed: 0,id,transaction_date,value_date,reference,debit,credit,balance,remarks,predicted_category,created_at,updated_at
0,1,2024-11-01,2024-11-01,Rent payment,161501.1,,342752.9,Monthly rent,Rent,2025-11-18 12:07:41.455946+00,2025-11-18 12:07:41.455946+00
1,2,2024-11-03,2024-11-03,Internet subscription,13209.58,,329543.32,ISP/Internet,Utilities,2025-11-18 12:07:41.455946+00,2025-11-18 12:07:41.455946+00
2,3,2024-11-05,2024-11-05,Utility bills,34919.28,,294624.04,Electricity/Water/Generator,Utilities,2025-11-18 12:07:41.455946+00,2025-11-18 12:07:41.455946+00
3,4,2024-11-18,2024-11-18,POS PURCHASE - Inventory,234349.77,,60274.27,Stock / Inventory purchase,Inventory,2025-11-18 12:07:41.455946+00,2025-11-18 12:07:41.455946+00
4,5,2024-11-18,2024-11-18,Customer payment,,466197.6,526471.87,Sale / Customer payment,Sales,2025-11-18 12:07:41.455946+00,2025-11-18 12:07:41.455946+00


In [3]:
### Combining the text columns
transactions['text'] = transactions['reference'].fillna('') + ' ' + transactions['remarks'].fillna('')
transactions['text'].head()

0                            Rent payment Monthly rent
1                   Internet subscription ISP/Internet
2            Utility bills Electricity/Water/Generator
3    POS PURCHASE - Inventory Stock / Inventory pur...
4             Customer payment Sale / Customer payment
Name: text, dtype: object

In [4]:
transactions['debit'] = transactions['debit'].fillna(0)

In [5]:
transactions['credit'] = transactions['credit'].fillna(0)

In [6]:
# Text vectorizer
text_vect = TfidfVectorizer(max_features=500)

In [7]:
# Numeric scaler
num_scaler = StandardScaler()

In [8]:
# Column transformer for combining text and numeric features
preprocessor = ColumnTransformer(
    transformers=[
        ('text', text_vect, 'text'),
        ('num', num_scaler, ['debit', 'credit'])
    ])

In [9]:
# Split data
X = transactions[['text', 'debit', 'credit']]
y = transactions['predicted_category']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# Fit preprocessor
preprocessor.fit(X_train)

0,1,2
,transformers,"[('text', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [11]:
# Transform train and test
X_train_transformed = preprocessor.transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

In [12]:
# Train classifier
clf = RandomForestClassifier()
clf.fit(X_train_transformed, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [13]:
y_pred = clf.predict(X_test_transformed)
print(classification_report(y_test, y_pred))

                   precision    recall  f1-score   support

 Fees and charges       1.00      1.00      1.00       150
             Fuel       1.00      1.00      1.00        65
        Inventory       1.00      1.00      1.00       178
  Loan/Investment       1.00      1.00      1.00        66
      Maintenance       1.00      1.00      1.00        94
    Miscellaneous       1.00      1.00      1.00        56
Refund/Chargeback       1.00      1.00      1.00         8
             Rent       1.00      1.00      1.00        21
           Salary       1.00      1.00      1.00        18
            Sales       1.00      1.00      1.00       299
        Utilities       1.00      1.00      1.00        55

         accuracy                           1.00      1010
        macro avg       1.00      1.00      1.00      1010
     weighted avg       1.00      1.00      1.00      1010



In [14]:
accuracy_score(y_test, y_pred)

1.0

In [15]:
joblib.dump(preprocessor, "preprocessor.pkl")

['preprocessor.pkl']

In [16]:
joblib.dump(clf, "transaction_classifier.pkl")

['transaction_classifier.pkl']