In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

# Load data
data = pd.read_csv('email_expense_transactions.csv')

In [3]:
# Encode categorical variables
label_encoder = LabelEncoder()
data['recipient_encoded'] = label_encoder.fit_transform(data['recipient'])
data['account_last4_encoded'] = label_encoder.fit_transform(data['account_last4'])

In [4]:
# Feature selection
features = ['amount', 'recipient_encoded', 'account_last4_encoded']
X = data[features]
# Separate rows with NaN values
nan_rows = data[data[['Description', 'toAccount']].isnull().any(axis=1)]
non_nan_rows = data.dropna(subset=['Description', 'toAccount'])

In [5]:

# Training data
X_train = non_nan_rows[features]
y_train_desc = non_nan_rows['Description']
y_train_acc = non_nan_rows['toAccount']

In [6]:
# Split data for validation (optional)
X_train_desc, X_test_desc, y_train_desc, y_test_desc = train_test_split(X_train, y_train_desc, test_size=0.2, random_state=42)
X_train_acc, X_test_acc, y_train_acc, y_test_acc = train_test_split(X_train, y_train_acc, test_size=0.2, random_state=42)

In [7]:
# Train models
model_desc = RandomForestClassifier()
model_desc.fit(X_train_desc, y_train_desc)

In [8]:
model_acc = RandomForestClassifier()
model_acc.fit(X_train_acc, y_train_acc)

In [9]:
# Predict for NaN rows
X_nan = nan_rows[features]
predicted_desc_nan = model_desc.predict(X_nan)
predicted_acc_nan = model_acc.predict(X_nan)

In [10]:
nan_rows['Predicted_Description'] = predicted_desc_nan
nan_rows['Predicted_toAccount'] = predicted_acc_nan

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nan_rows['Predicted_Description'] = predicted_desc_nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nan_rows['Predicted_toAccount'] = predicted_acc_nan


In [11]:
# Evaluate model performance (optional, based on non-NaN test set)
from sklearn.metrics import accuracy_score

y_pred_desc = model_desc.predict(X_test_desc)
desc_accuracy = accuracy_score(y_test_desc, y_pred_desc)

y_pred_acc = model_acc.predict(X_test_acc)
acc_accuracy = accuracy_score(y_test_acc, y_pred_acc)

print(f"Description Prediction Accuracy: {desc_accuracy}")
print(f"toAccount Prediction Accuracy: {acc_accuracy}")

Description Prediction Accuracy: 0.06666666666666667
toAccount Prediction Accuracy: 0.06666666666666667


In [12]:
import plotly.express as px

# Visualize distribution of predicted descriptions
fig_desc = px.bar(
    x=nan_rows['Predicted_Description'].value_counts().index,
    y=nan_rows['Predicted_Description'].value_counts().values,
    labels={'x': 'Predicted Description', 'y': 'Count'},
    title='Distribution of Predicted Descriptions'
)
fig_desc.show()

# Visualize distribution of predicted toAccount
fig_acc = px.bar(
    x=nan_rows['Predicted_toAccount'].value_counts().index,
    y=nan_rows['Predicted_toAccount'].value_counts().values,
    labels={'x': 'Predicted toAccount', 'y': 'Count'},
    title='Distribution of Predicted toAccount'
)
fig_acc.show()


In [13]:
nan_rows_display = nan_rows[['date', 'amount', 'recipient', 'account_last4', 'Description', 'Predicted_Description', 'toAccount', 'Predicted_toAccount']]
nan_rows_display.head()


Unnamed: 0,date,amount,recipient,account_last4,Description,Predicted_Description,toAccount,Predicted_toAccount
0,2024-07-09,30.0,q943975619@ybl,1455,,Chennai Bike - to ACJ,,Travel:Bike
1,2024-07-09,40.0,q506996751@ybl,1455,,Rasam,,Expenses:Food:Groceries
2,2024-07-08,1150.0,1mgtechnologies@paytm,1455,,Stuff,,Expenses:Misc:Entertainment:Stuff
3,2024-07-08,989.0,IND,8004,,Bike Petrolium,,Expenses:Bike:Fuel
4,2024-07-08,590.0,paytm-ptmbbp@paytm,1455,,Ratnadeep Shoppping,,Expenses:Food:EatingOut


In [14]:
nan_rows_display.to_csv('.csv', index=False)


In [58]:
updated_dataset = pd.read_csv('expenses_dataset.csv')

# Encode categorical variables
label_encoder_recipient = LabelEncoder()
updated_dataset['recipient_encoded'] = label_encoder_recipient.fit_transform(updated_dataset['recipient'])
label_encoder_account = LabelEncoder()
updated_dataset['account_last4_encoded'] = label_encoder_account.fit_transform(updated_dataset['account_last4'])
label_encoder_description = LabelEncoder()
updated_dataset['Description_encoded'] = label_encoder_description.fit_transform(updated_dataset['Description'])

In [59]:
# Prepare features and target variable
features = ['amount', 'recipient_encoded', 'account_last4_encoded', 'Description_encoded']
X = updated_dataset[features]

In [60]:
# Separate rows with NaN values
nan_rows = updated_dataset[updated_dataset[['toAccount']].isnull().any(axis=1)]
non_nan_rows = updated_dataset.dropna(subset=['toAccount'])

In [61]:
# Training data
X_train = non_nan_rows[features]
y_train_desc = non_nan_rows['Description']
y_train_acc = non_nan_rows['toAccount']

In [62]:
# Split data for validation (optional)
X_train_desc, X_test_desc, y_train_desc, y_test_desc = train_test_split(X_train, y_train_desc, test_size=0.2, random_state=42)
X_train_acc, X_test_acc, y_train_acc, y_test_acc = train_test_split(X_train, y_train_acc, test_size=0.2, random_state=42)

In [63]:
# Train models
model_desc = RandomForestClassifier()
model_desc.fit(X_train_desc, y_train_desc)

In [64]:
model_acc = RandomForestClassifier()
model_acc.fit(X_train_acc, y_train_acc)

In [65]:
# Predict for NaN rows
X_nan = nan_rows[features]
predicted_desc_nan = model_desc.predict(X_nan)
predicted_acc_nan = model_acc.predict(X_nan)

In [57]:
nan_rows['Predicted_Description'] = predicted_desc_nan
nan_rows['Predicted_toAccount'] = predicted_acc_nan



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [66]:
# Evaluate model performance (optional, based on non-NaN test set)
from sklearn.metrics import accuracy_score

y_pred_desc = model_desc.predict(X_test_desc)
desc_accuracy = accuracy_score(y_test_desc, y_pred_desc)

y_pred_acc = model_acc.predict(X_test_acc)
acc_accuracy = accuracy_score(y_test_acc, y_pred_acc)

print(f"Description Prediction Accuracy: {desc_accuracy}")
print(f"toAccount Prediction Accuracy: {acc_accuracy}")

Description Prediction Accuracy: 0.13333333333333333
toAccount Prediction Accuracy: 0.2


In [68]:
nan_rows_display

Unnamed: 0,date,amount,recipient,account_last4,Description,Predicted_Description,toAccount,Predicted_toAccount
0,2024-07-09,30.0,q943975619@ybl,1455,,Chennai Bike - to ACJ,,Travel:Bike
1,2024-07-09,40.0,q506996751@ybl,1455,,Rasam,,Expenses:Food:Groceries
2,2024-07-08,1150.0,1mgtechnologies@paytm,1455,,Stuff,,Expenses:Misc:Entertainment:Stuff
3,2024-07-08,989.0,IND,8004,,Bike Petrolium,,Expenses:Bike:Fuel
4,2024-07-08,590.0,paytm-ptmbbp@paytm,1455,,Ratnadeep Shoppping,,Expenses:Food:EatingOut
80,2024-06-13,36.0,paytm.s10j8ey@pty,1455,,Vice,,Misc:Entertainment:Vice
81,2024-06-13,985.16,OPENAI,6815,,Obsidian,,Subscriptions


In [22]:
# Split data for training and testing
X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(X_new, y_new_encoded, test_size=0.2, random_state=42)

# Train the new model
model_acc_new = RandomForestClassifier()
model_acc_new.fit(X_train_new, y_train_new)

ValueError: could not convert string to float: 'paytm.s10j8ey@pty'

In [6]:
# Train models for Description and toAccount
model_desc = RandomForestClassifier()
model_desc.fit(X_train_desc, y_train_desc)

ValueError: Input contains NaN