# KAIM Week 8 and 9 Challenges

## **Task 1: Data Analysis and Preprocessing**

1. Handle Missing Values
    - Impute or drop missing values
 
2. Data Cleaning
    - Remove duplicates
    - Correct data types

3. Exploratory Data Analysis (EDA)
    - Univariate analysis
    - Bivariate analysis

4. Merge Datasets for Geolocation Analysis
    - Convert IP addresses to integer format
    - Merge Fraud_Data.csv with IpAddress_to_Country.csv

5. Feature Engineering
    - Transaction frequency and velocity for Fraud_Data.csv
    - Time-based features for Fraud_Data.csv
         i. hour_of_day
         ii. day_of_week

6. Normalization and Scaling

7. Encode Categorical Features

## Import Necessary Libraries

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
import warnings

# Install a blanket "ignore" filter for Python warnings.
# NOTE(review): this also hides pandas / scikit-learn warnings raised by later
# cells — confirm that suppressing everything is intended.
warnings.filterwarnings('ignore')

# Set plot style for better visuals: apply seaborn's "whitegrid" style to the
# figures drawn in the rest of the notebook.
sns.set(style="whitegrid")

## Load Datasets

In [None]:
# Read the three input CSV files in one pass. The paths are listed in the same
# order as the names they are unpacked into:
#   fraud_data  <- e-commerce transactions
#   ip_data     <- IP range to country lookup
#   credit_data <- credit card transactions
fraud_data, ip_data, credit_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/kaim-w89/Fraud_Data.csv',
        '/kaggle/input/kaim-w89/IpAddress_to_Country.csv',
        '/kaggle/input/kaim-w89/creditcard.csv',
    )
)

## Data Overview

In [None]:
# Preview the first rows of the e-commerce fraud table.
fraud_data.head()

In [None]:
# Preview the first rows of the IP-to-country lookup table.
ip_data.head()

In [None]:
# Preview the first rows of the credit card table.
credit_data.head()

In [None]:
# Print the column names, dtypes and non-null counts of fraud_data.
fraud_data.info()

In [None]:
# Print the column names, dtypes and non-null counts of ip_data.
ip_data.info()

In [None]:
# Print the column names, dtypes and non-null counts of credit_data.
credit_data.info()

## Handling Missing Values

In [None]:
# Count the missing entries in every fraud_data column and print the tally
fraud_missing_counts = fraud_data.isna().sum()
print(fraud_missing_counts)

In [None]:
# Count the missing entries in every credit_data column and print the tally
credit_missing_counts = credit_data.isna().sum()
print(credit_missing_counts)

In [None]:
# Count the missing entries in every ip_data column and print the tally
ip_missing_counts = ip_data.isna().sum()
print(ip_missing_counts)

**No missing values to handle.**

## Data Cleaning

### Duplicate Values

In [None]:
# Drop fully duplicated rows from both transaction tables
fraud_data, credit_data = fraud_data.drop_duplicates(), credit_data.drop_duplicates()

# Re-count duplicates afterwards so the output shows what is left in each table
fraud_dupes_left = fraud_data.duplicated().sum()
credit_dupes_left = credit_data.duplicated().sum()
print(f"Remaining duplicates in fraud_data: {fraud_dupes_left}")
print(f"Remaining duplicates in credit_data: {credit_dupes_left}")

## Correct Data Type

In [None]:
# Parse the two timestamp columns into pandas datetime values
for time_col in ('signup_time', 'purchase_time'):
    fraud_data[time_col] = pd.to_datetime(fraud_data[time_col])

# Store source, browser and sex with the pandas 'category' dtype
for cat_col in ('source', 'browser', 'sex'):
    fraud_data[cat_col] = fraud_data[cat_col].astype('category')

In [None]:
# Confirm data types: re-print the fraud_data column summary after the
# datetime and category conversions applied in the previous cell.
fraud_data.info()

## Exploratory Data Analysis (EDA)

### Univariate Analysis

In [None]:
# Histograms (with a KDE overlay) for the two numeric columns; each entry is
# (column, number of bins, figure title).
for column, n_bins, heading in (
    ('purchase_value', 50, 'Distribution of Purchase Values'),
    ('age', 20, 'Distribution of User Age'),
):
    plt.figure(figsize=(10, 6))
    sns.histplot(fraud_data[column], bins=n_bins, kde=True)
    plt.title(heading)
    plt.show()

# Bar chart of how many rows fall into each value of the 'class' label
plt.figure(figsize=(8, 5))
sns.countplot(data=fraud_data, x='class')
plt.title('Fraud vs Non-Fraud Transactions')
plt.show()

### Bivariate Analysis

In [None]:
# One box plot per numeric feature, split by the 'class' label; each entry is
# (feature column, figure title).
for feature, heading in (
    ('purchase_value', 'Purchase Value by Fraud Class'),
    ('age', 'Age by Fraud Class'),
):
    plt.figure(figsize=(10, 6))
    sns.boxplot(data=fraud_data, x='class', y=feature)
    plt.title(heading)
    plt.show()

### Fraud by Browser

In [None]:
# Row counts per browser, with one bar per value of the 'class' label
sns.countplot(data=fraud_data, x='browser', hue='class')
plt.xticks(rotation=45)  # tilt the browser names so they do not overlap
plt.title('Fraud by Browser')
plt.show()

### Multivariate Analysis

In [None]:
# Replace the values of source, browser and sex with integer codes, in place.
# A single LabelEncoder is re-fitted for each column in turn, so after the loop
# it only remembers the mapping of the last column ('sex').
label_encoder = LabelEncoder()
for text_col in ('source', 'browser', 'sex'):
    fraud_data[text_col] = label_encoder.fit_transform(fraud_data[text_col])

# Show the column summary after encoding
fraud_data.info()

### Correlation

In [None]:
# Pairwise correlations of fraud_data, leaving out device_id and the two
# timestamp columns
corr_input = fraud_data.drop(columns=['device_id', 'signup_time', 'purchase_time'])
corr_matrix = corr_input.corr()

plt.figure(figsize=(10,8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix for E-commerce Data')
plt.show()

## Pair-plot

In [None]:
# Scatter-matrix of a handful of credit card columns, coloured by 'Class'
pairplot_columns = ['V1', 'V2', 'V3', 'Amount', 'Class']
sns.pairplot(credit_data[pairplot_columns], hue='Class')
plt.show()

## Outlier Detection

In [None]:
# Box plots of the monetary column of each dataset, split by its label column.
# Each entry is (frame, label column, value column, figure title).
for frame, class_col, value_col, heading in (
    (fraud_data, 'class', 'purchase_value', 'Purchase Value Outliers by Fraud Class'),
    (credit_data, 'Class', 'Amount', 'Amount Outliers in Credit Card Data'),
):
    plt.figure(figsize=(8, 6))
    sns.boxplot(data=frame, x=class_col, y=value_col)
    plt.title(heading)
    plt.show()

## Class Imbalance Check

In [None]:
# Label counts for each dataset; each entry is (frame, label column, title).
for frame, class_col, heading in (
    (fraud_data, 'class', 'Class Distribution (Fraud vs Non-Fraud)'),
    (credit_data, 'Class', 'Class Distribution in Credit Card Data'),
):
    sns.countplot(data=frame, x=class_col)
    plt.title(heading)
    plt.show()

## Merge Dataset for Geolocation Analysis

### Convert IP Address into Integer format

In [None]:
def convert_ip_to_int(ip_address):
    """Convert a textual IP address to its integer value.

    Two textual forms are understood:

    * dotted-quad IPv4 notation, e.g. ``"192.168.0.1"`` -> ``3232235521``
      (each octet is one byte of the 32-bit value);
    * an address that is already numeric but stored as text, e.g.
      ``"16777471"`` or ``"732758368.79972"`` -> the number truncated toward
      zero (``16777471`` / ``732758368``).

    Simply deleting the dots and calling ``int`` is wrong for both forms
    (``"192.168.0.1"`` would become ``19216801`` and ``"732758368.79972"``
    would become ``73275836879972``), so each form is parsed explicitly.

    Parameters
    ----------
    ip_address : object
        The value to convert. Anything that is not a ``str`` yields ``None``.

    Returns
    -------
    int or None
        The integer address, or ``None`` when the input is not a string or
        cannot be parsed (a message is printed in the latter case).
    """
    import ipaddress  # stdlib; imported here so the function is self-contained

    if not isinstance(ip_address, str):
        return None

    try:
        text = ip_address.strip()
        if text.count('.') == 3:
            # Dotted quad: let the standard library validate and convert it.
            return int(ipaddress.IPv4Address(text))
        if text.isdigit():
            # Plain integer text: convert directly, without a float round-trip.
            return int(text)
        # Numeric text with a fractional part (or exponent): keep the whole part.
        return int(float(text))
    except (ValueError, OverflowError) as e:
        # ValueError covers malformed text, invalid IPv4 addresses and NaN;
        # OverflowError covers infinities.
        print(f"Error converting IP address {ip_address}: {e}")
        return None

In [None]:
 # Ensure 'ip_address' is treated as a string to handle missing values
ip_as_text = fraud_data['ip_address'].astype(str)

# Replace the column with the integer form of each address. The literal text
# 'nan' (what a missing value becomes after the string cast) is mapped to None
# instead of being handed to convert_ip_to_int.
fraud_data['ip_address'] = ip_as_text.apply(
    lambda raw: None if raw == 'nan' else convert_ip_to_int(raw)
)

In [None]:
# Inspect fraud_data after its ip_address column was replaced by the converted values.
fraud_data.head()

In [None]:
# For each bound column of the IP-to-country table:
#   1. overwrite the column with its string form, because convert_ip_to_int only
#      accepts strings;
#   2. write the converted integers to a *new* column.
# NOTE: the target names ('lower_bound_ip_addres', 'upper_bound_ip_adress') are
# spelled differently from the source columns. They are kept exactly as they
# are because the following cell relies on these two columns being appended
# after the existing ones.
for source_col, target_col in (
    ('lower_bound_ip_address', 'lower_bound_ip_addres'),
    ('upper_bound_ip_address', 'upper_bound_ip_adress'),
):
    ip_data[source_col] = ip_data[source_col].astype(str)
    ip_data[target_col] = ip_data[source_col].apply(
        lambda raw: None if raw == 'nan' else convert_ip_to_int(raw)
    )

In [None]:
# Keep the country label and the two integer bound columns created in the
# previous cell, dropping the original (string) bound columns.
# The columns are selected by name rather than with iloc[:, 2:], so re-running
# this cell is harmless: a positional slice would drop two more columns,
# 'country' included, on every re-run.
# NOTE(review): assumes the raw CSV column order is lower bound, upper bound,
# country, so that this is the same selection iloc[:, 2:] made — confirm.
ip_data = ip_data[['country', 'lower_bound_ip_addres', 'upper_bound_ip_adress']]
ip_data.head()

### Merge Fraud Dataset with IP Dataset

In [None]:
# Attach a country to every transaction by IP *range* lookup.
# pd.concat(axis=1) only places row i of fraud_data next to row i of ip_data, so
# each transaction would be paired with an unrelated IP range. A transaction
# belongs to the range whose lower bound is the largest one not exceeding its
# address, provided the address is also not above that range's upper bound.
LOWER_COL = 'lower_bound_ip_addres'  # integer lower bound (name as created earlier in the notebook)
UPPER_COL = 'upper_bound_ip_adress'  # integer upper bound (name as created earlier in the notebook)

# merge_asof requires non-null, sorted keys of one dtype on both sides.
# Transactions without a usable address are dropped here; the original flow
# removed incomplete rows with dropna() in a later cell anyway.
fraud_keyed = (
    fraud_data
    .assign(ip_address=pd.to_numeric(fraud_data['ip_address'], errors='coerce').astype('float64'))
    .dropna(subset=['ip_address'])
    .sort_values('ip_address')
)
ip_ranges = (
    ip_data
    .assign(**{
        LOWER_COL: pd.to_numeric(ip_data[LOWER_COL], errors='coerce').astype('float64'),
        UPPER_COL: pd.to_numeric(ip_data[UPPER_COL], errors='coerce').astype('float64'),
    })
    .dropna(subset=[LOWER_COL, UPPER_COL])
    .sort_values(LOWER_COL)
)

# direction='backward' picks, for each address, the last range whose lower
# bound is <= the address.
merged_df = pd.merge_asof(
    fraud_keyed,
    ip_ranges,
    left_on='ip_address',
    right_on=LOWER_COL,
    direction='backward',
)

# merge_asof returns one row per left row, in left order, with a fresh index:
# restore the original row labels and order.
merged_df.index = fraud_keyed.index
merged_df = merged_df.sort_index()

# An address above the matched range's upper bound falls in a gap between
# ranges: blank the lookup columns so the row counts as unmatched.
outside_range = merged_df['ip_address'] > merged_df[UPPER_COL]
merged_df.loc[outside_range, ['country', LOWER_COL, UPPER_COL]] = np.nan

merged_df.head()

In [None]:
# Show the (rows, columns) shape of each of the two tables that were combined.
fraud_data.shape, ip_data.shape

In [None]:
# Count the missing values in every column of the combined table.
merged_df.isnull().sum()

In [None]:
# Keep only the rows of the combined table in which no column is missing
fraud_data_combined = merged_df.dropna(axis=0, how='any')

In [None]:
# Compare the table shape before and after dropping rows with missing values.
merged_df.shape, fraud_data_combined.shape

## Fraud Rate by country

In [None]:
# Mean of the 'class' label per country, drawn as one bar per country
fraud_rate = fraud_data_combined.groupby('country')['class'].mean()
country_fraud = fraud_rate.reset_index()

plt.figure(figsize=(12, 6))
sns.barplot(data=country_fraud, x='country', y='class')
plt.xticks(rotation=90)  # vertical labels so the country names stay legible
plt.title('Fraud Rate by Country')
plt.show()

### Time Distribution of Credit Card Transaction

In [None]:
# Histogram of the 'Time' column in 50 bins, with the 'Class' groups stacked
sns.histplot(data=credit_data, x='Time', hue='Class', multiple='stack', bins=50)
plt.title('Time Distribution of Credit Card Transactions')
plt.show()

## Feature Engineering

### Transaction Frequency and Velocity

In [None]:
# Seconds elapsed between signing up and making the purchase
purchase_delay = fraud_data_combined['purchase_time'] - fraud_data_combined['signup_time']
fraud_data_combined['signup_purchase_diff'] = purchase_delay.dt.total_seconds()

# Number of rows that share each row's user_id
rows_per_user = fraud_data_combined.groupby('user_id')['user_id'].transform('count')
fraud_data_combined['transaction_count'] = rows_per_user

### Time Based Features

In [None]:
# Derive two calendar features from the purchase timestamp
purchase_ts = fraud_data_combined['purchase_time']
fraud_data_combined['hour_of_day'] = purchase_ts.dt.hour
fraud_data_combined['day_of_week'] = purchase_ts.dt.dayofweek  # Monday=0 ... Sunday=6

# Row counts per value of each new feature, split by the 'class' label;
# each entry is (feature column, figure title).
for time_feature, heading in (
    ('hour_of_day', 'Transaction Hour vs Fraud Class'),
    ('day_of_week', 'Transaction Day of Week vs Fraud Class'),
):
    plt.figure(figsize=(12, 6))
    sns.countplot(data=fraud_data_combined, x=time_feature, hue='class')
    plt.title(heading)
    plt.show()

## Time-series Analysis (hourly basis)

In [None]:
# Histogram of purchase hour in 24 bins, with the 'class' groups stacked
sns.histplot(data=fraud_data_combined, x='hour_of_day', hue='class', multiple='stack', bins=24)
plt.title('Hourly Fraud vs Non-Fraud Transactions')
plt.show()

## Normalization and Scaling

In [None]:
# Standardize purchase_value and signup_purchase_diff in place with StandardScaler.
# NOTE(review): the scaler is fitted on every row, before the train/test split
# performed later in the notebook — confirm this is acceptable for the
# exploratory model.
scaler = StandardScaler()
scaled_columns = ['purchase_value', 'signup_purchase_diff']
fraud_data_combined[scaled_columns] = scaler.fit_transform(fraud_data_combined[scaled_columns])

## Encode Categorical Features

In [None]:
# Replace the values of the four columns below with integer codes, in place.
# One LabelEncoder is re-fitted per column, so after the loop it only holds the
# mapping of the last column ('country').
label_encoder = LabelEncoder()
for encoded_col in ('source', 'browser', 'sex', 'country'):
    fraud_data_combined[encoded_col] = label_encoder.fit_transform(fraud_data_combined[encoded_col])

# Preview the table after encoding
fraud_data_combined.head()

In [None]:
# (rows, columns) of the engineered table.
fraud_data_combined.shape

In [None]:
# Print the column names, dtypes and non-null counts of the engineered table.
fraud_data_combined.info()

In [None]:
# Number of distinct values in 'country' (integer codes since the encoding cell above).
fraud_data_combined['country'].nunique()

In [None]:
# Number of distinct values in 'device_id'.
fraud_data_combined['device_id'].nunique()

In [None]:
# List the column labels of the engineered table.
fraud_data_combined.columns

## Feature Importance (Preliminary Model Insights)

In [None]:
# train_test_split is already imported in the notebook's import cell, so only
# the classifier needs importing here.
from sklearn.ensemble import RandomForestClassifier

# Prepare data for the model (e-commerce): every column except the label,
# device_id and the two raw timestamps is used as a feature.
# NOTE(review): this leaves identifier-like columns such as user_id and
# ip_address among the features — confirm they should be ranked.
X = fraud_data_combined.drop(columns=['class', 'device_id', 'signup_time', 'purchase_time'])
y = fraud_data_combined['class']

# stratify=y keeps the proportion of each 'class' value the same in the train
# and test parts; a plain random split of a skewed fraud label can leave the
# 20% hold-out with a distorted share of fraud rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit a RandomForest model (fixed seed so the ranking is reproducible)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Rank the features by the forest's impurity-based importance and plot the
# ten largest as horizontal bars.
feature_importance = pd.Series(model.feature_importances_, index=X.columns)
feature_importance.nlargest(10).plot(kind='barh')
plt.title('Top 10 Feature Importances')
plt.show()

## Anomaly Detection

In [None]:
from sklearn.ensemble import IsolationForest

# Columns kept out of the detector's input: the ones the analysis already
# excluded (label, device_id, raw timestamps) plus this cell's own output
# column. Without the last entry, re-running the cell would feed the previous
# run's 'anomaly_score' back in as a feature.
excluded_columns = {'class', 'device_id', 'signup_time', 'purchase_time', 'anomaly_score'}
anomaly_features = fraud_data_combined[
    [col for col in fraud_data_combined.columns if col not in excluded_columns]
]

# Fit an Isolation Forest to detect anomalies. random_state pins the random
# tree construction so the flagged rows are the same on every run.
iso_forest = IsolationForest(contamination=0.01, random_state=42)

# fit_predict returns a label per row: -1 for rows flagged as anomalies and 1
# for the rest (so, despite its name, the column holds labels, not scores).
fraud_data_combined['anomaly_score'] = iso_forest.fit_predict(anomaly_features)

# Visualize anomalies: row counts per anomaly label, split by the true class
sns.countplot(x='anomaly_score', hue='class', data=fraud_data_combined)
plt.title('Anomalies Detected vs True Fraud Class')
plt.show()