In [None]:
import numpy as np
import pandas as pd

In [None]:
# Logical dataset name -> location of the CSV that backs it.
file_paths = dict(
    betweenness_centrality='path/to/betweenness_centrality.csv',
    cash_transactions='path/to/cash_trxns.csv',
    degree_centrality='path/to/degree_centrality.csv',
    eigenvector_centrality='path/to/eigenvector_centrality.csv',
    emt_transactions='path/to/emt_trxns.csv',
    kyc_data='path/to/kyc.csv',
    pagerank_centrality='path/to/pagerank.csv',
    wire_transactions='path/to/wire_trxns.csv',
)


def _tidy_columns(frame):
    """Drop a stray 'Unnamed: 0' column (if present) and snake_case the headers."""
    if 'Unnamed: 0' in frame.columns:
        frame = frame.drop(columns='Unnamed: 0')
    frame.columns = frame.columns.str.replace(' ', '_').str.lower()
    return frame


# Read every CSV first, then normalise each frame's columns.
datasets = {name: pd.read_csv(path) for name, path in file_paths.items()}
datasets = {name: _tidy_columns(frame) for name, frame in datasets.items()}

In [None]:
def aggregate_transactions_with_country_count(df, transaction_type):
    """Summarise a transaction table per customer id.

    Parameters
    ----------
    df : pandas.DataFrame
        Transaction rows. The columns read depend on ``transaction_type``:
        ``'wire'`` uses ``id_sender``, ``id_receiver``, ``wire_value``,
        ``country_sender`` and ``country_receiver``; ``'cash'`` uses
        ``cust_id`` and ``value``; anything else is handled as EMT and uses
        ``id_sender``, ``id_receiver`` and ``emt_value``.
    transaction_type : str
        ``'wire'``, ``'cash'``, or any other value (treated as EMT).

    Returns
    -------
    tuple
        ``(sender_summary, receiver_summary)`` for wire and EMT, or
        ``(summary, None)`` for cash. Each summary has one row per id with a
        value total, a transaction count and ``unique_countries_count``
        (distinct counterpart countries for wire, the constant 1 otherwise).
    """

    def _summarise(id_col, value_col, total_name, count_name, country_col=None):
        # Build the named-aggregation spec; the country column is only
        # available for wire transfers.
        spec = {
            total_name: (value_col, 'sum'),
            count_name: (value_col, 'count'),
        }
        if country_col is not None:
            spec['unique_countries_count'] = (country_col, 'nunique')
        summary = df.groupby(id_col).agg(**spec).reset_index()
        if country_col is None:
            # Cash and EMT carry no country information: fixed count of 1.
            summary['unique_countries_count'] = 1
        return summary

    if transaction_type == 'wire':
        outgoing = _summarise('id_sender', 'wire_value',
                              'total_wire_value', 'wire_sent_count',
                              country_col='country_receiver')
        incoming = _summarise('id_receiver', 'wire_value',
                              'total_wire_value', 'wire_received_count',
                              country_col='country_sender')
        return outgoing, incoming

    if transaction_type == 'cash':
        return _summarise('cust_id', 'value', 'total_value', 'transaction_count'), None

    # Every remaining transaction type is treated as EMT.
    outgoing = _summarise('id_sender', 'emt_value', 'total_emt_value', 'emt_sent_count')
    incoming = _summarise('id_receiver', 'emt_value', 'total_emt_value', 'emt_received_count')
    return outgoing, incoming

# Per-customer aggregates for each transaction source.
# Cash yields a single summary (second element is None); EMT and wire yield
# a sender-side and a receiver-side summary.
cash_agg, _ = aggregate_transactions_with_country_count(
    df=datasets['cash_transactions'], transaction_type='cash'
)
emt_sender_agg, emt_receiver_agg = aggregate_transactions_with_country_count(
    df=datasets['emt_transactions'], transaction_type='emt'
)
wire_sender_agg, wire_receiver_agg = aggregate_transactions_with_country_count(
    df=datasets['wire_transactions'], transaction_type='wire'
)

def prepare_for_merge(aggregated_df, transaction_type_prefix, role):
    """Return a copy of ``aggregated_df`` with its feature columns namespaced.

    Every column except the id columns (``cust_id``, ``id_sender``,
    ``id_receiver``) is renamed to
    ``"<transaction_type_prefix>_<role>_<original name>"``.
    The input frame is not modified.
    """
    id_columns = ('cust_id', 'id_sender', 'id_receiver')
    new_names = {}
    for column in aggregated_df.columns:
        if column in id_columns:
            # Id columns keep their name so they can still serve as join keys.
            new_names[column] = column
        else:
            new_names[column] = f"{transaction_type_prefix}_{role}_{column}"
    return aggregated_df.rename(columns=new_names)

# Namespace the aggregate columns so the five frames can be joined side by side.
# Cash is labelled with the 'sender' role; adjust based on your data structure.
prepared_cash_agg = prepare_for_merge(
    aggregated_df=cash_agg, transaction_type_prefix='cash', role='sender'
)

prepared_emt_sender_agg = prepare_for_merge(
    aggregated_df=emt_sender_agg, transaction_type_prefix='emt', role='sender'
)
prepared_emt_receiver_agg = prepare_for_merge(
    aggregated_df=emt_receiver_agg, transaction_type_prefix='emt', role='receiver'
)

prepared_wire_sender_agg = prepare_for_merge(
    aggregated_df=wire_sender_agg, transaction_type_prefix='wire', role='sender'
)
prepared_wire_receiver_agg = prepare_for_merge(
    aggregated_df=wire_receiver_agg, transaction_type_prefix='wire', role='receiver'
)

In [None]:
# `centrality_dicts` is not defined anywhere earlier in this notebook, so a
# fresh "Restart & Run All" failed here with a NameError. Rebuild it from the
# centrality tables that were loaded into `datasets`.
# NOTE(review): the keys only influence the suffix applied to clashing column
# names; the original keys are unknown, so the dataset names are used - confirm.
centrality_dicts = {
    name: frame
    for name, frame in datasets.items()
    if name.endswith('_centrality')
}

# Left-join every centrality table onto the KYC data (cust_id <-> vertex),
# discarding the redundant `vertex` key after each join.
kyc_cent_df = datasets['kyc_data']
for key, cent_df in centrality_dicts.items():
    kyc_cent_df = kyc_cent_df.merge(
        cent_df,
        how='left',
        left_on='cust_id',
        right_on='vertex',
        suffixes=('', f'_cent_{key}'),
    ).drop(columns='vertex')

# Work on an independent copy from here on.
final_df = kyc_cent_df.copy()

# Function to join datasets on 'cust_id', handling both sender and receiver cases
def join_on_cust_id(base_df, new_df, id_col):
    """Left-join ``new_df`` onto ``base_df`` by customer id.

    Parameters
    ----------
    base_df : pandas.DataFrame
        Frame holding a ``cust_id`` column; all of its rows are kept.
    new_df : pandas.DataFrame
        Frame whose ``id_col`` column holds the customer id to match.
    id_col : str
        Name of the id column in ``new_df`` (e.g. ``'cust_id'``,
        ``'id_sender'`` or ``'id_receiver'``).

    Returns
    -------
    pandas.DataFrame
        ``base_df`` with the non-key columns of ``new_df`` appended. The
        right-hand key column is not carried over: keeping it meant that a
        second join using the same ``id_col`` produced ``<id_col>_x`` /
        ``<id_col>_y`` columns that were never cleaned up.
    """
    # Align the key name so pandas emits a single `cust_id` column.
    # NOTE(review): assumes `new_df` has no separate `cust_id` column when
    # `id_col` is something else - confirm against callers.
    right = new_df if id_col == 'cust_id' else new_df.rename(columns={id_col: 'cust_id'})
    return base_df.merge(right, how='left', on='cust_id')

# Each entry pairs a prepared aggregate with the column that holds its customer id.
merge_plan = [
    (prepared_cash_agg, 'cust_id'),
    (prepared_emt_sender_agg, 'id_sender'),
    (prepared_emt_receiver_agg, 'id_receiver'),
    (prepared_wire_sender_agg, 'id_sender'),
    (prepared_wire_receiver_agg, 'id_receiver'),
]

# Attach the aggregates to the base dataset one after another.
for aggregate_frame, key_column in merge_plan:
    final_df = join_on_cust_id(final_df, aggregate_frame, key_column)

# Remove leftover sender/receiver id columns when they are present.
final_df = final_df.drop(columns=['id_sender', 'id_receiver'], errors='ignore')

# Persist the joined table.
final_df.to_csv('path/to/final_joined_transactions_with_kyc_and_centrality.csv', index=False)

In [None]:
# Customers without a matching row in a joined table have NaN in that table's
# numeric columns after the left joins; replace those gaps with 0.
numeric_cols = final_df.select_dtypes(include=[np.number]).columns
numeric_filled = final_df[numeric_cols].fillna(value=0)
final_df[numeric_cols] = numeric_filled

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Example using TF-IDF and K-Means clustering
# Only numeric columns had their NaNs filled earlier, and TfidfVectorizer
# rejects NaN documents, so missing occupations become empty strings
# (an all-zero TF-IDF row) instead of aborting the cell.
occupation_text = final_df['occupation'].fillna('')

vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(occupation_text)

num_clusters = 10
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
final_df['occupation_cluster'] = kmeans.fit_predict(X)

# Cluster sizes, largest first.
print(final_df['occupation_cluster'].value_counts())

In [None]:
from sklearn.preprocessing import LabelEncoder

# Map each distinct gender label to an integer code.
label_encoder = LabelEncoder()
gender_codes = label_encoder.fit_transform(final_df['gender'])
final_df['gender_encoded'] = gender_codes

# Show the original labels next to their codes as a sanity check.
print(final_df.loc[:, ['gender', 'gender_encoded']].head())

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize every int64/float64 column in place.
# `scaler` and `numerical_cols` stay in scope for later cells.
scaler = StandardScaler()
numerical_cols = final_df.select_dtypes(include=['int64', 'float64']).columns

standardized_values = scaler.fit_transform(final_df[numerical_cols])
final_df[numerical_cols] = standardized_values

# Preview the standardized columns.
print(final_df[numerical_cols].head())

In [None]:
from sklearn.ensemble import IsolationForest

In [None]:
# Feature matrix for the anomaly model: the float64 and int64 columns only.
model_feature_dtypes = ['float64', 'int64']
X = final_df.select_dtypes(include=model_feature_dtypes)

In [None]:
# Isolation Forest with 100 trees, automatic contamination threshold and a
# fixed seed so repeated runs give the same model.
iso_forest = IsolationForest(
    n_estimators=100,
    contamination='auto',
    random_state=42,
)

# Train on the numeric feature matrix.
iso_forest.fit(X)

In [None]:
# Label every row with the model's prediction; rows labelled -1 are the ones
# counted as anomalies below.
anomalies = iso_forest.predict(X)
final_df['anomaly'] = anomalies

anomaly_total = int((anomalies == -1).sum())
print("Number of anomalies detected:", anomaly_total)

In [None]:
# Keep only the rows the model labelled -1.
is_anomalous = final_df['anomaly'] == -1
anomalies_df = final_df.loc[is_anomalous]

# Explore the anomalies
print(anomalies_df.head())

In [None]:
# Undo the earlier standardization so the features are back in original units.
#
# The scaler must NOT be re-fitted here: fitting it on the already standardized
# data learns mean ~0 / std ~1, which turns inverse_transform into a near no-op,
# and re-selecting the numeric columns would also pull in the -1/1 `anomaly`
# labels and rewrite them. Reuse the scaler fitted in the standardization cell
# and restrict the inverse transform to the columns it was fitted on.
# `feature_names_in_` is set by scikit-learn (>= 1.0) when fitting on a
# DataFrame with string column names; otherwise fall back to `numerical_cols`
# from the standardization cell.
scaled_cols = list(getattr(scaler, 'feature_names_in_', numerical_cols))

final_df[scaled_cols] = scaler.inverse_transform(final_df[scaled_cols])