## Importing required libraries

In [8]:
# Basic libraries for data handling and visualization
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Statistical test
import scipy.stats as stats

# Utility
from copy import deepcopy

import warnings
warnings.filterwarnings('ignore')

## Loading and understanding data

In [9]:
# Load data
df = pd.read_csv("D:\\DS Course\\dataset\\Credit_card.csv")

# Basic cleaning: drop the 'index' column, rescale 'Amount' by 1/1000 (stored
# as float32 and renamed to 'Amount (K)'), and parse 'Date' into datetimes.
df.drop('index', axis=1, inplace=True)
df['Amount'] = df['Amount'] / 1000
df['Amount'] = df['Amount'].astype('float32')
df.rename(columns={'Amount': 'Amount (K)'}, inplace=True)
df['Date'] = pd.to_datetime(df['Date'])

# Dataset summary
print(df.describe())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()

                                Date    Amount (K)
count                          26052  26052.000000
mean   2014-07-30 11:34:44.385075968    156.411530
min              2013-10-04 00:00:00      1.005000
25%              2014-03-02 00:00:00     77.120251
50%              2014-08-02 00:00:00    153.106499
75%              2014-12-28 00:00:00    228.050003
max              2015-05-26 00:00:00    998.077026
std                              NaN    103.063156
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26052 entries, 0 to 26051
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   City        26052 non-null  object        
 1   Date        26052 non-null  datetime64[ns]
 2   Card Type   26052 non-null  object        
 3   Exp Type    26052 non-null  object        
 4   Gender      26052 non-null  object        
 5   Amount (K)  26052 non-null  float32       
dtypes: datetime64[ns](1), float32(1), object(4)


## Clean City Column & Filter Cities

In [10]:
# Keep only the part of each 'City' value that precedes the first comma
df['City'] = df['City'].map(lambda raw_city: raw_city.split(',', 1)[0])

# Cities retained in the cleaned dataset
selected_cities = [
    'Delhi', 'Greater Mumbai', 'Bengaluru', 'Ahmedabad',
    'Kolkata', 'Pune', 'Chennai', 'Hyderabad',
    'Jaipur', 'Surat', 'Indore', 'Jamalpur',
    'Udaipurwati', 'Palanpur', 'Muzaffarpur', 'Taranagar',
    'Ambikapur', 'Mundi', 'Padrauna',
]
city_mask = df['City'].isin(selected_cities)
df_citfilt = df[city_mask]

# Persist the filtered rows (without the index column) as the cleaned dataset
df_citfilt.to_csv(
    'D:\\DS Course\\Credit Card Data Analysis\\CC_Cleaned Data.csv',
    index=False,
)

## Add Year Column & Filter to Major Cities and the Most Recent Year in the Dataset

In [11]:
# Focus on major metro cities
maj_cities = ['Delhi', 'Greater Mumbai', 'Bengaluru', 'Ahmedabad', 'Kolkata', 'Pune',
              'Chennai', 'Hyderabad', 'Jaipur', 'Surat', 'Indore']

# Take an explicit copy of the filtered rows before adding a column. Assigning
# a column onto a boolean-filtered slice of `df` triggers pandas'
# SettingWithCopyWarning; .copy() makes df_major an independent frame.
df_major = df[df['City'].isin(maj_cities)].copy()
df_major['Year of Transaction'] = df_major['Date'].dt.year.astype(int)

# Focus on a specific year (e.g., 2015)
df_major_2015 = df_major[df_major['Year of Transaction'] == 2015]

## Chi-Square Tests for Associations

In [12]:
# Gender vs Card Type | Gender vs Exp Type | Card Type vs Exp Type
cits = list(df_major['City'].unique())
pd.set_option('display.max_rows', None)

# Smallest expected cell count accepted before reporting a chi-square result.
# The usual guideline is "at least 5", so a count of exactly 5 is accepted.
MIN_EXPECTED_COUNT = 5

# Column pairs tested for association within each city, in print order.
CHI_SQUARE_PAIRS = [
    ('Gender', 'Card Type'),
    ('Gender', 'Exp Type'),
    ('Card Type', 'Exp Type'),
]


def _report_chi_square(frame, row_col, col_col, alpha=0.05):
    """Print the chi-square independence result for two columns of `frame`.

    Builds the contingency table of `frame[row_col]` against `frame[col_col]`
    and runs `scipy.stats.chi2_contingency` on it. If every expected cell
    count is at least MIN_EXPECTED_COUNT, prints the p-value followed by
    "→ Dependent" (p < alpha) or "→ Independent"; otherwise prints that the
    test is not feasible.

    Parameters
    ----------
    frame : pandas.DataFrame
        Rows to test; must contain `row_col` and `col_col`.
    row_col, col_col : str
        Names of the two columns to cross-tabulate.
    alpha : float, default 0.05
        Significance level used for the Dependent/Independent verdict.
    """
    table = pd.crosstab(frame[row_col], frame[col_col])
    _, p_value, _, expected = stats.chi2_contingency(table)
    if (expected >= MIN_EXPECTED_COUNT).all():
        print(f"{row_col} vs {col_col} - p:", p_value)
        print("→ Dependent" if p_value < alpha else "→ Independent")
    else:
        print("→ Chi-square test not feasible")


for city in cits:
    print('City:', city)

    # Filter this city's rows once and reuse them for all three tests.
    city_rows = df_major[df_major['City'] == city]
    for row_col, col_col in CHI_SQUARE_PAIRS:
        _report_chi_square(city_rows, row_col, col_col)

    print("-" * 50)

City: Delhi
Gender vs Card Type - p: 0.014406413201986578
→ Dependent
Gender vs Exp Type - p: 0.0032564765590939308
→ Dependent
Card Type vs Exp Type - p: 0.8073711347632427
→ Independent
--------------------------------------------------
City: Greater Mumbai
Gender vs Card Type - p: 3.3786811179547874e-05
→ Dependent
Gender vs Exp Type - p: 1.0050296977112737e-08
→ Dependent
Card Type vs Exp Type - p: 0.7074206696656069
→ Independent
--------------------------------------------------
City: Bengaluru
Gender vs Card Type - p: 1.5464850249929327e-05
→ Dependent
Gender vs Exp Type - p: 4.72944684472417e-06
→ Dependent
Card Type vs Exp Type - p: 0.15410367642225842
→ Independent
--------------------------------------------------
City: Ahmedabad
Gender vs Card Type - p: 0.013725208534738857
→ Dependent
Gender vs Exp Type - p: 2.442729993770186e-07
→ Dependent
Card Type vs Exp Type - p: 0.820335466240427
→ Independent
--------------------------------------------------
City: Pune
Gender vs Ca