In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.cluster import KMeans  
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN

from mlxtend.frequent_patterns import apriori, association_rules  
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth  


In [None]:
# Location of the raw transactions CSV; the relative path is resolved against
# the notebook's working directory. Kept in one named constant so it is easy to change.
DATA_PATH = 'credit_card_transactions.csv'

# Load the raw transactions into the frame used by every later cell.
cre_data = pd.read_csv(DATA_PATH)

# Preview only the first rows instead of rendering the whole frame.
cre_data.head()

In [None]:
# Print a structural summary of the loaded frame (columns, non-null counts, dtypes)
# so missing values and wrongly typed columns are visible before the analysis starts.
cre_data.info()

## **EDA**

### **4.1 Basic EDA Questions**

#### **1. What are the top spending categories by transaction volume and amount?**

In [None]:
# top_amount = cre_data.groupby('category')['tran_amt'].sum().sort_values(ascending=False)

# top_volume = cre_data.groupby('category')['trans_num'].count().sort_values(ascending=False)

In [None]:
# Per category: total amount spent (sum of tran_amt) and transaction volume
# (count of trans_num), ranked by total amount with the largest first.
top_spend_category = (
    cre_data
    .groupby('category')
    .agg({'tran_amt': 'sum', 'trans_num': 'count'})
    .sort_values(by='tran_amt', ascending=False)
)
print(top_spend_category.head(5))

#### **2. How does spending vary over time (daily, weekly, monthly trends)?**

In [None]:
# Parse the transaction date column into datetimes (source strings are day-first).
cre_data['trans_date'] = pd.to_datetime(cre_data['trans_date'], dayfirst=True)

# Calendar buckets used for the trend aggregations (and by later cells).
cre_data['day'] = cre_data['trans_date'].dt.date
cre_data['week'] = cre_data['trans_date'].dt.to_period('W')
cre_data['month'] = cre_data['trans_date'].dt.to_period('M')

# Total amount spent per day / week / month.
daily_trend = cre_data.groupby('day')['tran_amt'].sum()
weekly_trend = cre_data.groupby('week')['tran_amt'].sum()
monthly_trend = cre_data.groupby('month')['tran_amt'].sum()

# The three series are indexed by different types (datetime.date objects and
# weekly / monthly Periods). Drawing them on one axis requires a common x type,
# so every index is converted to timestamps for plotting only; the series
# themselves keep their original indexes. Periods are placed at their start.
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(pd.to_datetime(daily_trend.index), daily_trend.to_numpy(),
        label='Daily Spending', color='blue', alpha=0.5)
ax.plot(weekly_trend.index.to_timestamp(), weekly_trend.to_numpy(),
        label='Weekly Spending', color='green')
ax.plot(monthly_trend.index.to_timestamp(), monthly_trend.to_numpy(),
        label='Monthly Spending', color='red')
ax.set_xlabel('Time')
ax.set_ylabel('Total Spending')
ax.set_title('Spending Trends Over Time')
ax.legend()
plt.show()

#### **3. What are the peak spending hours in a day?**

In [None]:
# Build a timestamp from the epoch-seconds column and keep its hour component.
cre_data['trans_time'] = pd.to_datetime(cre_data['unix_time'], unit='s')
cre_data['hour'] = cre_data['trans_time'].dt.hour

# Total spend per hour of day, ordered from the highest-spending hour down.
peak_hrs = (
    cre_data
    .groupby('hour')['tran_amt']
    .sum()
    .sort_values(ascending=False)
)

# Bars appear in ranking order (highest total first), not chronological order.
fig, ax = plt.subplots(figsize=(10, 5))
peak_hrs.plot.bar(color='purple', ax=ax)
ax.set_xlabel('Hour of Day')
ax.set_ylabel('Total Spending')
ax.set_title('Peak Hours Pattern by Hour of Day')
plt.show()

#### **4. What is the most common payment method used?**

##### This dataset has no column for the payment method. Because it contains only credit card transactions, we assume that credit card is the only payment method used.

#### **5. Which merchants have the highest transactions?**

In [None]:
# Number of transactions per merchant (non-null tran_amt rows), most active first.
top_mer = cre_data.groupby('merchant')['tran_amt'].count().sort_values(ascending=False)

# Bar chart of the five busiest merchants. top_mer is a Series, so there is no
# 'transaction_count' column to select; the previous y=[...] argument referred
# to a column that does not exist and has been removed.
fig, ax = plt.subplots(figsize=(10, 5))
top_mer.head(5).plot(kind='bar', ax=ax)
ax.set_xlabel('Merchant')
ax.set_ylabel('Count')
ax.set_title('Top Merchants by Transactions')
plt.xticks(rotation=0)
plt.show()

### **4.2 Intermediate EDA Questions**

#### **1. Are there seasonal trends in spending across different categories?**

In [None]:
# Month number -> season label, built from a season -> months table.
season_by_month = {
    month: season
    for season, months in {
        'Winter': (12, 1, 2),
        'Spring': (3, 4, 5),
        'Summer': (6, 7, 8),
        'Fall': (9, 10, 11),
    }.items()
    for month in months
}
cre_data['season'] = cre_data['trans_date'].dt.month.map(season_by_month)

# Total spend with one row per season and one column per category.
seasonal_trnd = (
    cre_data
    .groupby(['season', 'category'])['tran_amt']
    .sum()
    .unstack()
)

# Transpose so categories sit on the x axis with one bar per season.
ax = seasonal_trnd.T.plot.bar(figsize=(12, 5), colormap='viridis')
ax.set_xlabel('Category')
ax.set_ylabel('Total Spending')
ax.set_title('Seasonal Spending Trends Across Categories')
ax.legend(title='Season')
plt.xticks(rotation=20)
plt.show()

In [None]:
# monthly_trends = cre_data.groupby([ 'month', 'category'])['tran_amt'].sum().unstack()
# monthly_trends.plot(figsize=(12,6), colormap='coolwarm')  
# plt.xlabel('Month-Year')  
# plt.ylabel('Total Spending')  
# plt.title('Monthly Spending Trends Across Categories')  
# plt.show()

#### **2. How do different customer segments (high spenders vs. low spenders) behave?**

In [None]:
#total spending per customer
cus_spend = cre_data.groupby('cus_id')['tran_amt'].sum().reset_index()
cus_spend

#define segment for high or low spenders
thres_high = cus_spend['tran_amt'].quantile(0.8)
thres_low = cus_spend['tran_amt'].quantile(0.2)

cre_data['spender_seg'] = cre_data['cus_id'].map(lambda x:'High' 
                                                if cus_spend[cus_spend['cus_id'] == x]['tran_amt'].values[0] >= thres_high
                                                else 'Low'
                                                if cus_spend[cus_spend['cus_id'] == x]['tran_amt'].values[0] <= thres_low
                                                else 'Mid')

segments = cre_data.groupby('spender_seg')['tran_amt'].sum().unstack()
segments

#### **3. What is the distribution of transaction amounts (e.g., histogram, boxplot analysis)?**

In [None]:
# Histogram of transaction amounts (50 bins) with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(data=cre_data, x='tran_amt', bins=50, kde=True, color='blue', ax=ax)
ax.set_xlabel('Transaction Amount')
ax.set_ylabel('Frequency')
ax.set_title('Histogram of Transaction Amounts')
plt.show()

In [None]:
# Horizontal boxplot of transaction amounts.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=cre_data, x='tran_amt', color='red', ax=ax)
ax.set_xlabel('Transaction Amount')
ax.set_title('Boxplot of Transaction Amounts')
plt.show()

#### **4. Are there any correlations between spending behavior and location?**

In [None]:
# Pairwise correlation between transaction amount and city population.
correlation = cre_data.loc[:, ['tran_amt', 'city_pop']].corr()
print(correlation)

# Scatter of the same two columns; alpha < 1 keeps dense regions readable.
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(data=cre_data, x='city_pop', y='tran_amt', alpha=0.5, ax=ax)
ax.set_xlabel("City Population")
ax.set_ylabel("Transaction Amount")
ax.set_title("Correlation Between City Population & Spending")
plt.show()

#### **5. How does spending behavior change before and after payday?**

In [None]:
# Bucket every transaction by its day of the month.
# The 'day' column holds full datetime.date objects, which never compare equal
# to the integers in a range(), so the day-of-month number is read from
# trans_date instead.
day_of_month = cre_data['trans_date'].dt.day

# NOTE(review): labels follow the original bucketing - day 1 is 'Payday',
# days 2-14 are 'Before' and days 15-31 are 'After'. Day 31 is now included in
# 'After' (range(15, 31) stopped at 30). Confirm this matches the intended
# payday definition.
cre_data['payday_period'] = np.select(
    [day_of_month.between(2, 14), day_of_month.between(15, 31)],
    ['Before', 'After'],
    default='Payday',
)

# Average transaction amount per period (the amount column is 'tran_amt').
payday_spending = cre_data.groupby('payday_period')['tran_amt'].mean().reset_index()

plt.figure(figsize=(8, 5))
sns.barplot(data=payday_spending, x='payday_period', y='tran_amt', palette='coolwarm')
plt.xlabel("Period")
plt.ylabel("Avg Transaction Amount")
plt.title("Spending Before & After Payday")
plt.show()

### 4.3 Advanced EDA Questions

#### **1. Can we detect outliers in spending behavior using anomaly detection?**

In [None]:
# IQR-based anomaly detection on transaction amounts.
# The quartiles are taken from cre_data (the frame used throughout this
# notebook); the previous reference to `df` was an undefined name.
Q1 = cre_data['tran_amt'].quantile(0.25)
Q3 = cre_data['tran_amt'].quantile(0.75)
IQR = Q3 - Q1

# Tukey fences: anything more than 1.5 * IQR outside the quartiles is flagged.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Vectorised flag: 1 for amounts outside the fences, 0 otherwise
# (missing amounts compare False on both sides and therefore get 0).
outside_fences = (cre_data['tran_amt'] < lower_bound) | (cre_data['tran_amt'] > upper_bound)
cre_data['is_anomaly'] = outside_fences.astype(int)

outliers = cre_data[cre_data['is_anomaly'] == 1]
print(outliers[['cus_id', 'tran_amt']])

#### **2. Are there clusters of customers based on spending habits? (Use K-Means, DBSCAN)**

##### Customer Clustering Based on Spending Habits (K-Means, DBSCAN)

###### Choose relevant columns for analysing spending habits
###### - Total Spending: tran_amt
###### - Transaction Frequency: number of transactions per customer (`cus_id`)
###### - Average Transaction Amount

In [None]:
# Data preparation: one row per customer with the features used for clustering.
#   total_spent       - sum of tran_amt
#   transaction_count - number of non-null tran_amt rows
#   avg_spent         - total_spent / transaction_count
cus_spending = (
    cre_data
    .groupby('cus_id')['tran_amt']
    .agg(total_spent='sum', transaction_count='count')
    .reset_index()
)
cus_spending['avg_spent'] = cus_spending['total_spent'].div(cus_spending['transaction_count'])

print(cus_spending.head())

In [None]:
# K-Means clustering of customers on their spending features.

# Standardise the features first: K-Means is sensitive to feature scale.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(
    cus_spending[['total_spent', 'transaction_count', 'avg_spent']]
)

# Fit a 3-cluster model (fixed seed) and store each customer's cluster label.
kmeans = KMeans(n_clusters=3, random_state=42)
cus_spending['cluster'] = kmeans.fit_predict(scaled_data)

print(cus_spending.head())

# Plot the clusters in the original (unscaled) spend / count space.
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(
    data=cus_spending,
    x='total_spent',
    y='transaction_count',
    hue='cluster',
    palette='coolwarm',
    ax=ax,
)
ax.set_xlabel("Total Spending")
ax.set_ylabel("Transaction Count")
ax.set_title("Customer Clusters Based on Spending Behavior")
ax.legend(title="Cluster")
plt.show()

In [None]:
# DBSCAN on the same standardised features used for K-Means.
# scikit-learn labels points it treats as noise with -1.
dbscan = DBSCAN(eps=0.5, min_samples=5)
dbscan_labels = dbscan.fit_predict(scaled_data)
cus_spending['dbscan_cluster'] = dbscan_labels

print(cus_spending.head())

#### **3. Can we use association rule mining (Apriori, FP-Growth) to find patterns in purchases?**

##### Data preparation

###### Association rule mining (Apriori, FP-Growth) is effective when the data is categorical, for example:
###### - Categories purchased by each customer (cus_id and category)
###### - Frequent itemsets, i.e. items that are purchased together in the same transaction

In [None]:
# Build one "basket" per customer: the list of distinct categories that the
# customer has purchased from.
# Baskets were previously grouped by trans_num. trans_num appears to identify a
# single transaction row (it is what the category-volume count uses), which
# gives one category per basket, so no multi-item itemsets or association rules
# can be found. Grouping by cus_id gives the customer-wise baskets that the
# analysis describes.
# NOTE(review): assumes trans_num is unique per row - confirm against the data.
data_tran = (
    cre_data
    .groupby('cus_id')['category']
    .apply(lambda cats: cats.dropna().unique().tolist())
    .reset_index()
)
print(data_tran.head())

In [None]:
# Apriori algorithm (finding frequent patterns)

# One-hot encode the baskets: one boolean column per category.
te = TransactionEncoder()
te.fit(data_tran['category'])
te_ary = te.transform(data_tran['category'])
df_encoded = pd.DataFrame(te_ary, columns=te.columns_)

# Frequent itemsets that appear in at least 5% of the baskets.
freq_itemsets = apriori(df_encoded, min_support=0.05, use_colnames=True)
print(freq_itemsets.head())

# Derive association rules, keeping those with lift >= 1.0.
rules = association_rules(freq_itemsets, metric="lift", min_threshold=1.0)
rule_columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift']
print(rules[rule_columns])

In [None]:
# FP-Growth algorithm (a faster alternative to Apriori), run on the same
# one-hot encoded baskets with the same 5% minimum support.
freq_itemsets_fp = fpgrowth(df_encoded, min_support=0.05, use_colnames=True)
print(freq_itemsets_fp.head())

# Derive association rules from the FP-Growth itemsets (lift >= 1.0).
rules_fp = association_rules(freq_itemsets_fp, metric="lift", min_threshold=1.0)
fp_rule_columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift']
print(rules_fp[fp_rule_columns])

###### Interpretation of Results
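###### - If there is a rule {"Electronics"} → {"Accessories"} with a confidence of 80%, it means that customers who buy Electronics have an 80% chance of also buying Accessories.
###### - A lift value greater than 1 means the items are bought together more often than would be expected if they were independent (a positive association); the further above 1, the stronger the association.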

#### **4. How does spending behavior correlate with economic factors (e.g., inflation, interest rates)?**

##### We cannot analyze the correlation between spending behavior and economic factors (such as inflation and interest rates) because we do not have
##### the necessary economic data. Correlation analysis requires historical data on economic indicators like inflation rates, interest rates, and GDP
##### growth. Without this data, it is not possible to establish a statistical relationship between spending patterns and broader economic trends.

#### **5. Can we use NLP on transaction descriptions to classify transactions more effectively?**

##### Yes, NLP can be used on transaction descriptions to classify transactions more effectively. If the dataset contains a transaction description column (e.g., merchant names, product details, or transaction notes), we can apply Natural Language Processing (NLP) techniques to categorize them.

##### Approach:
#####  - Preprocess text data (tokenization, stopword removal, stemming/lemmatization).
#####  - Convert text into numerical format (TF-IDF, Word Embeddings).
#####  - Train a classification model (Naïve Bayes, Random Forest, or Deep Learning).
#####  - Use clustering (e.g., K-Means) for unsupervised classification if labels are unavailable.

##### Why Use NLP?
##### - Helps in automated categorization of transactions.
##### - Improves fraud detection by identifying unusual transaction patterns.
##### - Can identify spending patterns (e.g., grocery, travel, luxury).