In [None]:
# Step 1 Data Processing
import pandas as pd

In [None]:
# Load the digital-wallet LTV export into a DataFrame (Data Processing)
# encoding_errors='ignore' skips bytes that cannot be decoded instead of raising
csv_path = 'digital_wallet_ltv_dataset.csv'
df = pd.read_csv(csv_path, encoding_errors='ignore')

# Show the frame's dimensions as (rows, columns)
df.shape

In [None]:
# Step 2 Data Cleaning
# Normalise column names to lowercase so later cells can rely on one spelling
df.columns = df.columns.str.lower()

# Remove rows with missing values and exact duplicate rows.
# Both calls are no-ops on already-clean data, so no guard is needed, and
# reassigning (rather than inplace=True) keeps the cell safe to re-run.
df = df.dropna().drop_duplicates()

In [None]:
# Step 3 Data Transformation
# Rescale every rupee-denominated column to CAD
# NOTE: the columns are overwritten, so running this cell twice converts twice
conversion_rate = 0.016  # Example conversion rate from 1 rupees to 0.016 CAD
rupee_columns = [
    'avg_transaction_value',
    'total_spent',
    'max_transaction_value',
    'min_transaction_value',
    'cashback_received',
]
for column in rupee_columns:
    df[column] = df[column] * conversion_rate

# Keep two decimal places across the whole frame
df = df.round(2)

# Preview the first 5 rows after transformation
df.head()

In [None]:
# Step 4 Connect to Database
# Import sqlalchemy
from sqlalchemy import create_engine

In [None]:
# Create SQLAlchemy connection for PostgreSQL
import os

from sqlalchemy.exc import SQLAlchemyError

# Allow the connection URL (and therefore the credentials) to come from the
# FINTECH_LTV_DB_URL environment variable; the fallback is the local
# development URL this notebook originally used.
db_url = os.environ.get(
    "FINTECH_LTV_DB_URL",
    "postgresql+psycopg2://postgres:1234@localhost:5432/fintech_ltv",
)
engine = create_engine(db_url)

# The write below is the first statement that talks to the database, so a
# failure here covers both connection problems and write problems.
try:
    # Write DataFrame to PostgreSQL table named 'fintech_ltv',
    # replacing the table if it already exists
    df.to_sql('fintech_ltv', con=engine, if_exists='replace', index=False)
    print("Connection Successful")
except SQLAlchemyError as exc:
    # Catch only database errors and show the cause; unrelated exceptions
    # (and KeyboardInterrupt) are no longer silently swallowed.
    print(f"Connection Failed: {exc}")

In [None]:
# Step 5 Exploratory Data Analysis
# Display summary statistics (count, mean, std, min, quartiles, max) per column.
# With default arguments describe() reports only the numeric columns of a
# mixed-dtype frame.
df.describe()

In [None]:
# Import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# Plot the LTV histogram with a KDE curve overlaid
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['ltv'], kde=True, ax=ax)
ax.set_title('LTV Distribution')
ax.set_xlabel('LTV')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Create boxplots for general population demographic
# Bucket ages into left-closed bins ([0, 21), [21, 31), ...) so each label
# names the whole-year ages it contains. Ages outside [0, 61) receive NaN.
bins = [0, 21, 31, 41, 51, 61]
labels = ['0-20', '21-30', '31-40', '41-50', '51-60']
df['age_group'] = pd.cut(df['age'], bins=bins, labels=labels, right=False)

# One grid row per demographic, one grid column per spending metric
demographics = ['age_group', 'location', 'income_level']
metrics = ['ltv', 'total_spent', 'avg_transaction_value']

# Draw on the axes returned by plt.subplots instead of re-fetching each
# panel through the pyplot state machine
fig, axes = plt.subplots(len(demographics), len(metrics), figsize=(16, 12))
for row_axes, demographic in zip(axes, demographics):
    for ax, metric in zip(row_axes, metrics):
        sns.boxplot(x=df[demographic], y=df[metric], ax=ax)
        ax.set_title(f'{metric} by {demographic}')

# Keep tick labels and titles of neighbouring panels from overlapping
fig.tight_layout()
plt.show()

In [None]:
# Heatmap of pairwise correlations between LTV and the transaction metrics
heatmap_columns = [
    'ltv',
    'total_transactions',
    'avg_transaction_value',
    'min_transaction_value',
    'max_transaction_value',
    'total_spent',
]
correlation_matrix = df[heatmap_columns].corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    fmt='.2f',
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Heatmap')
plt.show()