In [3]:
import pandas as pd
import os
import glob

# Specify the directories for development and validation data
dev_data_dir = r"/teamspace/studios/this_studio/heckme/Dev_data_to_be_shared_3"
val_data_dir = r"/teamspace/studios/this_studio/heckme/validation_data_to_be_shared_3"

# Function to load all CSV files from a directory and concatenate them into a single DataFrame
def load_all_csvs_from_directory(directory_path):
    all_files = glob.glob(os.path.join(directory_path, "*.csv"))  # Get all CSV files in the directory
    df_list = [pd.read_csv(file) for file in all_files]  # Read each file and add it to the list
    combined_df = pd.concat(df_list, ignore_index=True)  # Concatenate all DataFrames
    return combined_df

# Load the development data
try:
    dev_df = load_all_csvs_from_directory(dev_data_dir)
    print("Development Data Loaded Successfully.")
    print(dev_df.head())
except Exception as e:
    print(f"Error loading development data: {e}")

# Load the validation data
try:
    val_df = load_all_csvs_from_directory(val_data_dir)
    print("Validation Data Loaded Successfully.")
    print(val_df.head())
except Exception as e:
    print(f"Error loading validation data: {e}")


Development Data Loaded Successfully.
   account_number  bad_flag  onus_attribute_1  transaction_attribute_1  \
0             1.0       0.0               NaN                      NaN   
1             2.0       0.0          221000.0                      0.0   
2             3.0       0.0           25000.0                      0.0   
3             4.0       0.0           86000.0                      0.0   
4             5.0       0.0          215000.0                      0.0   

   transaction_attribute_2  transaction_attribute_3  transaction_attribute_4  \
0                      NaN                      NaN                      NaN   
1                      0.0                      0.0                      0.0   
2                      0.0                      0.0                      0.0   
3                      0.0                      0.0                      0.0   
4                      0.0                      0.0                      0.0   

   transaction_attribute_5  transact

In [5]:
import pandas as pd
import os
import glob

# Specify the directories for development and validation data
dev_data_path = r"/teamspace/studios/this_studio/heckme/Dev_data_to_be_shared_3"
val_data_path = r"/teamspace/studios/this_studio/heckme/validation_data_to_be_shared_3"

# Function to load and concatenate all CSV files from a directory
def load_all_csvs_from_directory(directory_path):
    all_files = glob.glob(os.path.join(directory_path, "*.csv"))
    if not all_files:
        raise FileNotFoundError(f"No CSV files found in the directory: {directory_path}")
    df_list = [pd.read_csv(file) for file in all_files]
    combined_df = pd.concat(df_list, ignore_index=True)
    return combined_df

# Load the development data
try:
    dev_df = load_all_csvs_from_directory(dev_data_path)
    pd.set_option('display.max_columns', None)
    print("Development Data Head (First 2 Rows, All Columns):")
    print(dev_df.head(2))
    print("\n")
except FileNotFoundError as e:
    print(e)

# Load the validation data
try:
    val_df = load_all_csvs_from_directory(val_data_path)
    pd.set_option('display.max_columns', None)
    print("Validation Data Head (First 2 Rows, All Columns):")
    print(val_df.head(2))
except FileNotFoundError as e:
    print(e)


Development Data Head (First 2 Rows, All Columns):
   account_number  bad_flag  onus_attribute_1  transaction_attribute_1  \
0             1.0       0.0               NaN                      NaN   
1             2.0       0.0          221000.0                      0.0   

   transaction_attribute_2  transaction_attribute_3  transaction_attribute_4  \
0                      NaN                      NaN                      NaN   
1                      0.0                      0.0                      0.0   

   transaction_attribute_5  transaction_attribute_6  transaction_attribute_7  \
0                      NaN                      NaN                      NaN   
1                      0.0                      0.0                      0.0   

   transaction_attribute_8  transaction_attribute_9  transaction_attribute_10  \
0                      NaN                      NaN                       NaN   
1                      0.0                      0.0                       0.0   



In [None]:

# Load the development and validation data, handling potential errors
try:
    dev_df = pd.read_csv(dev_data_path)
    print("=== Development Data Info ===")
    print(dev_df.info(verbose=True, show_counts=True))
except FileNotFoundError:
    print(f"Error: Development data file not found at '{dev_data_path}'")


try:
    val_df = pd.read_csv(val_data_path)
    print("\n=== Validation Data Info ===")
    print(val_df.info(verbose=True, show_counts=True))
except FileNotFoundError:
    print(f"Error: Validation data file not found at '{val_data_path}'")


# Calculate and print missing values for the development data.
print("\nMissing Values (Development Data)")
print(dev_df.isnull().sum().sort_values(ascending=False))


# Calculate and print missing values for the validation data.
print("\nMissing Values (Validation Data)")
print(val_df.isnull().sum().sort_values(ascending=False))

=== Development Data Info ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34511 entries, 0 to 34510
Data columns (total 1216 columns):
 #     Column                     Non-Null Count  Dtype  
---    ------                     --------------  -----  
 0     account_number             34511 non-null  int64  
 1     bad_flag                   34511 non-null  int64  
 2     onus_attribute_1           25655 non-null  float64
 3     transaction_attribute_1    25655 non-null  float64
 4     transaction_attribute_2    25655 non-null  float64
 5     transaction_attribute_3    25655 non-null  float64
 6     transaction_attribute_4    25655 non-null  float64
 7     transaction_attribute_5    25655 non-null  float64
 8     transaction_attribute_6    25655 non-null  float64
 9     transaction_attribute_7    25655 non-null  float64
 10    transaction_attribute_8    25655 non-null  float64
 11    transaction_attribute_9    25655 non-null  float64
 12    transaction_attribute_10   25655 non-nul

In [6]:
import pandas as pd
import os
import glob

# Specify the directories for development and validation data
dev_data_path = r"/teamspace/studios/this_studio/heckme/Dev_data_to_be_shared_3"
val_data_path = r"/teamspace/studios/this_studio/heckme/validation_data_to_be_shared_3"

# Function to load and concatenate all CSV files from a directory
def load_all_csvs_from_directory(directory_path):
    all_files = glob.glob(os.path.join(directory_path, "*.csv"))
    if not all_files:
        raise FileNotFoundError(f"No CSV files found in the directory: {directory_path}")
    df_list = [pd.read_csv(file) for file in all_files]
    combined_df = pd.concat(df_list, ignore_index=True)
    return combined_df

# Load the development data and print detailed info
try:
    dev_df = load_all_csvs_from_directory(dev_data_path)
    print("=== Development Data Info ===")
    print(dev_df.info(verbose=True, show_counts=True))
except FileNotFoundError as e:
    print(e)

# Load the validation data and print detailed info
try:
    val_df = load_all_csvs_from_directory(val_data_path)
    print("\n=== Validation Data Info ===")
    print(val_df.info(verbose=True, show_counts=True))
except FileNotFoundError as e:
    print(e)

# Calculate and print missing values for the development data
if 'dev_df' in locals():
    print("\nMissing Values (Development Data):")
    print(dev_df.isnull().sum().sort_values(ascending=False))

# Calculate and print missing values for the validation data
if 'val_df' in locals():
    print("\nMissing Values (Validation Data):")
    print(val_df.isnull().sum().sort_values(ascending=False))


=== Development Data Info ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96806 entries, 0 to 96805
Data columns (total 2081 columns):
 #     Column                     Non-Null Count  Dtype  
---    ------                     --------------  -----  
 0     account_number             34511 non-null  float64
 1     bad_flag                   34511 non-null  float64
 2     onus_attribute_1           25655 non-null  float64
 3     transaction_attribute_1    25655 non-null  float64
 4     transaction_attribute_2    25655 non-null  float64
 5     transaction_attribute_3    25655 non-null  float64
 6     transaction_attribute_4    25655 non-null  float64
 7     transaction_attribute_5    25655 non-null  float64
 8     transaction_attribute_6    25655 non-null  float64
 9     transaction_attribute_7    25655 non-null  float64
 10    transaction_attribute_8    25655 non-null  float64
 11    transaction_attribute_9    25655 non-null  float64
 12    transaction_attribute_10   25655 non-nul

In [7]:
import pandas as pd
import os
import glob

# Specify the directories for development and validation data
dev_data_path = r"/teamspace/studios/this_studio/heckme/Dev_data_to_be_shared_3"
val_data_path = r"/teamspace/studios/this_studio/heckme/validation_data_to_be_shared_3"

# Function to load and concatenate all CSV files from a directory
def load_all_csvs_from_directory(directory_path):
    all_files = glob.glob(os.path.join(directory_path, "*.csv"))
    if not all_files:
        raise FileNotFoundError(f"No CSV files found in the directory: {directory_path}")
    df_list = [pd.read_csv(file) for file in all_files]
    combined_df = pd.concat(df_list, ignore_index=True)
    return combined_df

# Load the development and validation data
try:
    dev_df = load_all_csvs_from_directory(dev_data_path)
    val_df = load_all_csvs_from_directory(val_data_path)
except FileNotFoundError as e:
    print(e)
    exit()

# Drop empty columns
empty_cols = ['bureau_447', 'bureau_436']
for col in empty_cols:
    if col in dev_df:
        dev_df = dev_df.drop(columns=col)
    if col in val_df:
        val_df = val_df.drop(columns=col)

# Determine the threshold for non-null values
threshold_non_null = 20000

# Identify and drop columns with too many missing values
columns_to_drop_dev = dev_df.columns[dev_df.isnull().sum() > (len(dev_df) - threshold_non_null)]
columns_to_drop_val = val_df.columns[val_df.isnull().sum() > (len(val_df) - threshold_non_null)]

dev_df = dev_df.drop(columns=columns_to_drop_dev)
val_df = val_df.drop(columns=columns_to_drop_val)

# Impute remaining missing values with median
for column in dev_df.columns:
    if dev_df[column].isnull().any():
        median_value = dev_df[column].median()
        dev_df[column] = dev_df[column].fillna(median_value)

for column in val_df.columns:
    if val_df[column].isnull().any():
        median_value = val_df[column].median()
        val_df[column] = val_df[column].fillna(median_value)

# Check the class distribution of 'bad_flag'
if 'bad_flag' in dev_df:
    class_counts = dev_df['bad_flag'].value_counts()
    class_percentages = dev_df['bad_flag'].value_counts(normalize=True) * 100

    print("Class Distribution of 'bad_flag' (Development Data):")
    print(class_counts)
    print("\nClass Percentages of 'bad_flag' (Development Data):")
    print(class_percentages)
else:
    print("Error: 'bad_flag' column not found in the development data.")


Class Distribution of 'bad_flag' (Development Data):
bad_flag
0.0    96318
1.0      488
Name: count, dtype: int64

Class Percentages of 'bad_flag' (Development Data):
bad_flag
0.0    99.495899
1.0     0.504101
Name: proportion, dtype: float64


In [8]:
import pandas as pd
import os
import glob
from imblearn.over_sampling import SMOTE

# Specify the directories for development and validation data
dev_data_path = r"/teamspace/studios/this_studio/heckme/Dev_data_to_be_shared_3"
val_data_path = r"/teamspace/studios/this_studio/heckme/validation_data_to_be_shared_3"

# Function to load and concatenate all CSV files from a directory
def load_all_csvs_from_directory(directory_path):
    all_files = glob.glob(os.path.join(directory_path, "*.csv"))
    if not all_files:
        raise FileNotFoundError(f"No CSV files found in the directory: {directory_path}")
    df_list = [pd.read_csv(file) for file in all_files]
    combined_df = pd.concat(df_list, ignore_index=True)
    return combined_df

# Load the development and validation data
try:
    dev_df = load_all_csvs_from_directory(dev_data_path)
    val_df = load_all_csvs_from_directory(val_data_path)
except FileNotFoundError as e:
    print(e)
    exit()

# Drop empty columns
empty_cols = ['bureau_447', 'bureau_436']
for col in empty_cols:
    if col in dev_df:
        dev_df = dev_df.drop(columns=col)
    if col in val_df:
        val_df = val_df.drop(columns=col)

# Determine the threshold for non-null values
threshold_non_null = 20000

# Identify and drop columns with too many missing values
columns_to_drop_dev = dev_df.columns[dev_df.isnull().sum() > (len(dev_df) - threshold_non_null)]
columns_to_drop_val = val_df.columns[val_df.isnull().sum() > (len(val_df) - threshold_non_null)]

dev_df = dev_df.drop(columns=columns_to_drop_dev)
val_df = val_df.drop(columns=columns_to_drop_val)

# Impute remaining missing values with median
for column in dev_df.columns:
    if dev_df[column].isnull().any():
        median_value = dev_df[column].median()
        dev_df[column] = dev_df[column].fillna(median_value)

for column in val_df.columns:
    if val_df[column].isnull().any():
        median_value = val_df[column].median()
        val_df[column] = val_df[column].fillna(median_value)

# Separate features and target variable
if 'bad_flag' in dev_df:
    X = dev_df.drop(columns=['bad_flag', 'account_number'])
    y = dev_df['bad_flag']
else:
    print("Error: 'bad_flag' column not found in the development data.")
    exit()

# Apply SMOTE for oversampling
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Create a new dataframe with the balanced dataset
dev_df_resampled = pd.concat([pd.DataFrame(X_resampled, columns=X.columns), pd.DataFrame(y_resampled, columns=['bad_flag'])], axis=1)

# Display the counts and percentages of our rebalanced target variable
class_counts = dev_df_resampled['bad_flag'].value_counts()
class_percentages = dev_df_resampled['bad_flag'].value_counts(normalize=True) * 100

print("Class Distribution of 'bad_flag' (Oversampled Development Data):")
print(class_counts)
print("\nClass Percentages of 'bad_flag' (Oversampled Development Data):")
print(class_percentages)

# Display resampled data info
print("=== Resampled Development Data Info ===")
print(dev_df_resampled.info(verbose=True, show_counts=True))


Class Distribution of 'bad_flag' (Oversampled Development Data):
bad_flag
0.0    96318
1.0    96318
Name: count, dtype: int64

Class Percentages of 'bad_flag' (Oversampled Development Data):
bad_flag
0.0    50.0
1.0    50.0
Name: proportion, dtype: float64
=== Resampled Development Data Info ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 192636 entries, 0 to 192635
Data columns (total 2010 columns):
 #     Column                     Non-Null Count   Dtype  
---    ------                     --------------   -----  
 0     onus_attribute_1           192636 non-null  float64
 1     transaction_attribute_1    192636 non-null  float64
 2     transaction_attribute_2    192636 non-null  float64
 3     transaction_attribute_3    192636 non-null  float64
 4     transaction_attribute_4    192636 non-null  float64
 5     transaction_attribute_5    192636 non-null  float64
 6     transaction_attribute_6    192636 non-null  float64
 7     transaction_attribute_7    192636 non-null  float64


In [9]:
import os
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel


# Define the directories
dev_data_dir = '/teamspace/studios/this_studio/heckme/Dev_data_to_be_shared_3'
val_data_dir = '/teamspace/studios/this_studio/heckme/validation_data_to_be_shared_3'

# Get all CSV files in the directories
dev_data_files = [f for f in os.listdir(dev_data_dir) if f.endswith('.csv')]
val_data_files = [f for f in os.listdir(val_data_dir) if f.endswith('.csv')]

# Load the data from all CSVs in the dev_data_dir
dev_df_list = []
for file in dev_data_files:
    try:
        dev_df = pd.read_csv(os.path.join(dev_data_dir, file))
        dev_df_list.append(dev_df)
    except FileNotFoundError:
        print(f"Error: Could not find the file {file} in the development data directory")
        continue

# Load the data from all CSVs in the val_data_dir
val_df_list = []
for file in val_data_files:
    try:
        val_df = pd.read_csv(os.path.join(val_data_dir, file))
        val_df_list.append(val_df)
    except FileNotFoundError:
        print(f"Error: Could not find the file {file} in the validation data directory")
        continue

# Combine all loaded dataframes into one (if there are multiple)
dev_df = pd.concat(dev_df_list, ignore_index=True) if dev_df_list else None
val_df = pd.concat(val_df_list, ignore_index=True) if val_df_list else None

# Ensure that the data is loaded properly
if dev_df is None or val_df is None:
    print("Error: No data files were loaded.")
    exit()

# Drop Empty Columns
empty_cols = ['bureau_447', 'bureau_436']
dev_df = dev_df.drop(columns=empty_cols)
val_df = val_df.drop(columns=empty_cols)

# Determine the Threshold
threshold_non_null = 20000

# Identify the columns to drop from development data
columns_to_drop_dev = dev_df.columns[dev_df.isnull().sum() > (len(dev_df) - threshold_non_null)]
# Identify the columns to drop from validation data
columns_to_drop_val = val_df.columns[val_df.isnull().sum() > (len(val_df) - threshold_non_null)]

# Drop Columns From development data
dev_df = dev_df.drop(columns=columns_to_drop_dev)
# Drop Columns From validation data
val_df = val_df.drop(columns=columns_to_drop_val)

# Impute remaining missing values
for column in dev_df.columns:
    if dev_df[column].isnull().any():
        median_value = dev_df[column].median()
        dev_df[column] = dev_df[column].fillna(median_value)

for column in val_df.columns:
    if val_df[column].isnull().any():
        median_value = val_df[column].median()
        val_df[column] = val_df[column].fillna(median_value)

# Separate features and target variable
X = dev_df.drop(columns=['bad_flag', 'account_number'])
y = dev_df['bad_flag']

# Apply SMOTE for oversampling
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Perform feature selection with Random Forest
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_selector = SelectFromModel(estimator=rf_model, threshold='median')  # using the median importance value as the threshold
rf_selector.fit(X_resampled, y_resampled)

# Get the selected feature names
selected_features = X.columns[rf_selector.get_support()]

# Print the selected features
print("Selected Features using Random Forest Feature Importance:")
print(selected_features)

# Print how many features were selected
print(f"\n Number of Selected Features: {len(selected_features)}")

# Apply the feature selection on our training and validation dataframes
X_resampled_selected = rf_selector.transform(X_resampled)
val_df_selected = val_df[selected_features]

# Display info on the shape of the data after transformation
print("\n Shape of the Oversampled Development Data after Feature Selection:", X_resampled_selected.shape)
print("Shape of the Validation Data After Feature Selection:", val_df_selected.shape)

# Print info of new dataframe
dev_df_resampled = pd.concat([pd.DataFrame(X_resampled_selected), pd.DataFrame(y_resampled, columns=['bad_flag'])], axis=1)
print("\n === Resampled Development Data Info After Feature Selection ===")
print(dev_df_resampled.info(verbose=True, show_counts=True))
print("\n === Validation Data Info After Feature Selection ===")
print(val_df_selected.info(verbose=True, show_counts=True))


Selected Features using Random Forest Feature Importance:
Index(['onus_attribute_1', 'transaction_attribute_10',
       'transaction_attribute_11', 'transaction_attribute_12',
       'transaction_attribute_13', 'transaction_attribute_14',
       'transaction_attribute_15', 'transaction_attribute_16',
       'transaction_attribute_17', 'transaction_attribute_18',
       ...
       'bureau_enquiry_40', 'bureau_enquiry_41', 'bureau_enquiry_42',
       'bureau_enquiry_43', 'bureau_enquiry_44', 'bureau_enquiry_45',
       'bureau_enquiry_46', 'bureau_enquiry_48', 'bureau_enquiry_49',
       'bureau_enquiry_50'],
      dtype='object', length=1005)

 Number of Selected Features: 1005

 Shape of the Oversampled Development Data after Feature Selection: (192636, 1005)
Shape of the Validation Data After Feature Selection: (41792, 1005)

 === Resampled Development Data Info After Feature Selection ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 192636 entries, 0 to 192635
Data columns (tota

In [2]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Specify the file paths for the development and validation data
dev_data_dir = '/teamspace/studios/this_studio/heckme/Dev_data_to_be_shared_3'
val_data_dir = '/teamspace/studios/this_studio/heckme/validation_data_to_be_shared_3'

# Load the data
try:
    dev_df = pd.read_csv(dev_data_path)
    val_df = pd.read_csv(val_data_path)
except FileNotFoundError:
    print("Error: Could not find the input files")
    exit()

# Determine the Threshold
threshold_non_null = 20000

# Identify the columns to drop from development data
columns_to_drop_dev = dev_df.columns[dev_df.isnull().sum() > (len(dev_df) - threshold_non_null)]
# Identify the columns to drop from validation data
columns_to_drop_val = val_df.columns[val_df.isnull().sum() > (len(val_df) - threshold_non_null)]

# Drop Columns From development data
dev_df = dev_df.drop(columns = columns_to_drop_dev)
# Drop Columns From validation data
val_df = val_df.drop(columns = columns_to_drop_val)

# Ensure there are no missing values in the target variable (y)
if dev_df['bad_flag'].isnull().sum() > 0:
    print(f"Warning: Missing values in target variable 'bad_flag'. Dropping rows with missing target values.")
    # Drop rows with missing target variable
    dev_df = dev_df.dropna(subset=['bad_flag'])

# Separate features and target variable
X = dev_df.drop(columns = ['bad_flag', 'account_number'])
y = dev_df['bad_flag']

# Impute remaining missing values in features (X)
for column in X.columns:
    if X[column].isnull().any():
        median_value = X[column].median()
        X[column] = X[column].fillna(median_value)

for column in val_df.columns:
    if val_df[column].isnull().any():
        median_value = val_df[column].median()
        val_df[column] = val_df[column].fillna(median_value)

# Apply SMOTE for oversampling
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)
X_resampled = pd.DataFrame(X_resampled, columns = X.columns)

# Perform feature selection with Random Forest
rf_model = RandomForestClassifier(random_state=42, n_estimators = 100)
rf_selector = SelectFromModel(estimator=rf_model, threshold='median')  # using the median importance value as the threshold
rf_selector.fit(X_resampled, y_resampled)

# Get the selected feature names
selected_features = X.columns[rf_selector.get_support()]

# Apply the feature selection on our training and validation dataframes
X_resampled_selected = rf_selector.transform(X_resampled)
val_df_selected = val_df[selected_features]

# Drop Empty Columns, after feature selection
empty_cols = ['bureau_447', 'bureau_436']
X_resampled_selected = pd.DataFrame(X_resampled_selected, columns = selected_features)
for col in empty_cols:
    if col in X_resampled_selected:
        X_resampled_selected = X_resampled_selected.drop(columns=col)
    if col in val_df_selected:
        val_df_selected = val_df_selected.drop(columns=col)

# Split Training Data into Train and Test
X_train, X_test, y_train, y_test = train_test_split(X_resampled_selected, y_resampled, test_size=0.2, random_state=42)

# Train a Logistic Regression model
logistic_model = LogisticRegression(random_state=42, solver = 'liblinear')
logistic_model.fit(X_train, y_train)

# Make Predictions on resampled testing set
y_pred_resampled = logistic_model.predict_proba(X_test)[:, 1]
# Make Predictions on our validation data
y_pred_validation = logistic_model.predict_proba(val_df_selected)[:, 1]

# Evaluate the model's performance using ROC AUC on test data
roc_auc_resampled = roc_auc_score(y_test, y_pred_resampled)

print("AUC-ROC on resampled development test data:", roc_auc_resampled)

# Create submission file
submission_df = pd.DataFrame({
    'account_number': val_df['account_number'],
    'predicted_probability': y_pred_validation
})

# Create output CSV
submission_df.to_csv('submission.csv', index=False)

print("\nSubmission file 'submission.csv' has been created")


NameError: name 'dev_data_path' is not defined

In [11]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier

# Paths to the data files
dev_data_path = r"/teamspace/studios/this_studio/heckme/Dev_data_to_be_shared_3/part_aa.csv"
val_data_path = r"/teamspace/studios/this_studio/heckme/validation_data_to_be_shared_3/part_aa.csv"

# Load the data
dev_df = pd.read_csv(dev_data_path)
val_df = pd.read_csv(val_data_path)

# Drop columns with excessive missing values
threshold_non_null = 20000
columns_to_drop_dev = dev_df.columns[dev_df.isnull().sum() > (len(dev_df) - threshold_non_null)]
columns_to_drop_val = val_df.columns[val_df.isnull().sum() > (len(val_df) - threshold_non_null)]
dev_df = dev_df.drop(columns=columns_to_drop_dev)
val_df = val_df.drop(columns=columns_to_drop_val)

# Separate features and target variable
X = dev_df.drop(columns=['bad_flag', 'account_number'])
y = dev_df['bad_flag']

# Impute remaining missing values for development and validation data
for column in X.columns:
    if X[column].isnull().any():
        median_value = X[column].median()
        X[column] = X[column].fillna(median_value)

for column in val_df.columns:
    if val_df[column].isnull().any():
        median_value = val_df[column].median()
        val_df[column] = val_df[column].fillna(median_value)

# Apply SMOTE for class imbalance
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Standardize the data
scaler = StandardScaler()
X_resampled_scaled = scaler.fit_transform(X_resampled)
val_df_scaled = scaler.transform(val_df[X.columns])

# Split the resampled data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled_scaled, y_resampled, test_size=0.2, random_state=42)

# Initialize the LightGBM model
lgbm_model = LGBMClassifier(random_state=42)

# Define hyperparameter grid for RandomizedSearchCV
param_grid = {
    'n_estimators': [100, 200, 300, 400],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'num_leaves': [20, 30, 40],
    'max_depth': [-1, 10, 20, 30],
    'min_data_in_leaf': [20, 40, 60]
}

# Perform RandomizedSearchCV to find the best hyperparameters
random_search = RandomizedSearchCV(lgbm_model, param_distributions=param_grid, n_iter=20, scoring='roc_auc', cv=3, random_state=42)
random_search.fit(X_train, y_train)

# Best model from RandomizedSearchCV
best_model = random_search.best_estimator_

# Make predictions on the test set
y_pred_proba_test = best_model.predict_proba(X_test)[:, 1]
y_pred_test = best_model.predict(X_test)

# Evaluate the model's performance
roc_auc = roc_auc_score(y_test, y_pred_proba_test)
precision = precision_score(y_test, y_pred_test)
recall = recall_score(y_test, y_pred_test)
f1 = f1_score(y_test, y_pred_test)

print("\n=== Model Performance on Resampled Test Data ===")
print(f"AUC-ROC: {roc_auc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

# Make predictions on the validation data
y_pred_proba_validation = best_model.predict_proba(val_df_scaled)[:, 1]

# Create the submission file
submission_df = pd.DataFrame({
    'account_number': val_df['account_number'],
    'predicted_probability': y_pred_proba_validation
})
submission_df.to_csv('submission.csv', index=False)

print("\nSubmission file 'submission.csv' has been created.")


[LightGBM] [Info] Number of positive: 18105, number of negative: 18185
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.108122 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 240746
[LightGBM] [Info] Number of data points in the train set: 36290, number of used features: 1071
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.498898 -> initscore=-0.004409
[LightGBM] [Info] Start training from score -0.004409
[LightGBM] [Info] Number of positive: 18105, number of negative: 18186
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.109759 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 240659
[LightGBM] [Info] Number of data points in the train set: 36291, number of used features: 1073
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.498884 -> initscore=-0.004464
[LightGBM] [Info] Start training from score -0.004464
[Lig

In [4]:
import pandas as pd
import numpy as np
import os
import glob
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score
from lightgbm import LGBMClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

# Specify the directories for development and validation data
dev_data_path = r"/teamspace/studios/this_studio/heckme/Dev_data_to_be_shared_3"
val_data_path = r"/teamspace/studios/this_studio/heckme/validation_data_to_be_shared_3"

# Function to load and concatenate all CSV files from a directory
def load_all_csvs_from_directory(directory_path):
    all_files = glob.glob(os.path.join(directory_path, "*.csv"))
    if not all_files:
        raise FileNotFoundError(f"No CSV files found in the directory: {directory_path}")
    df_list = [pd.read_csv(file) for file in all_files]
    combined_df = pd.concat(df_list, ignore_index=True)
    return combined_df

# Load the development data
try:
    dev_df = load_all_csvs_from_directory(dev_data_path)
    print("Development Data Loaded Successfully.")
except FileNotFoundError as e:
    print(e)
    exit()

# Load the validation data
try:
    val_df = load_all_csvs_from_directory(val_data_path)
    print("Validation Data Loaded Successfully.")
except FileNotFoundError as e:
    print(e)
    exit()

# Drop empty columns
empty_cols = ['bureau_447', 'bureau_436']
dev_df = dev_df.drop(columns = empty_cols)
val_df = val_df.drop(columns = empty_cols)

# Determine the Threshold
threshold_non_null = 20000

# Identify the columns to drop from development data
columns_to_drop_dev = dev_df.columns[dev_df.isnull().sum() > (len(dev_df) - threshold_non_null)]
# Identify the columns to drop from validation data
columns_to_drop_val = val_df.columns[val_df.isnull().sum() > (len(val_df) - threshold_non_null)]

# Drop Columns From development data
dev_df = dev_df.drop(columns = columns_to_drop_dev)
# Drop Columns From validation data
val_df = val_df.drop(columns = columns_to_drop_val)


# Impute remaining missing values
for column in dev_df.columns:
    if dev_df[column].isnull().any():
        median_value = dev_df[column].median()
        dev_df[column] = dev_df[column].fillna(median_value)

for column in val_df.columns:
    if val_df[column].isnull().any():
        median_value = val_df[column].median()
        val_df[column] = val_df[column].fillna(median_value)

# Separate features and target variable
X = dev_df.drop(columns = ['bad_flag', 'account_number'])
y = dev_df['bad_flag']


# Apply SMOTE for oversampling
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)
X_resampled = pd.DataFrame(X_resampled, columns=X.columns)

# Perform feature selection with Random Forest
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_selector = SelectFromModel(estimator=rf_model, threshold='median')
rf_selector.fit(X_resampled, y_resampled)

# Get the selected feature names
selected_features = X.columns[rf_selector.get_support()]

# Apply the feature selection on our training and validation dataframes
X_resampled_selected = rf_selector.transform(X_resampled)
val_df_selected = val_df[selected_features]
X_resampled_selected = pd.DataFrame(X_resampled_selected, columns=selected_features)


# Drop Empty Columns, after feature selection
empty_cols = ['bureau_447', 'bureau_436']
for col in empty_cols:
    if col in X_resampled_selected:
        X_resampled_selected = X_resampled_selected.drop(columns=col)
    if col in val_df_selected:
        val_df_selected = val_df_selected.drop(columns=col)

#Split Training Data into Train and Test
X_train, X_test, y_train, y_test = train_test_split(X_resampled_selected, y_resampled, test_size=0.2, random_state=42)


# Scale numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
val_df_scaled = scaler.transform(val_df_selected)

# Initialize the LightGBM model
lgbm_model = LGBMClassifier(random_state=42)

# Define hyperparameter grid for RandomizedSearchCV
param_grid = {
    'n_estimators': [200, 300, 400, 500, 600],
    'learning_rate': [0.01, 0.03, 0.05, 0.08, 0.1, 0.12],
    'num_leaves': [25, 30, 35, 40, 50, 60],
    'max_depth': [-1, 5, 10, 15, 20],
    'min_data_in_leaf': [20, 40, 60, 80],
    'feature_fraction': [0.6, 0.7, 0.8, 0.9],
    'bagging_fraction': [0.6, 0.7, 0.8, 0.9],
    'bagging_freq': [1, 3, 5]
}


# Perform RandomizedSearchCV to find the best hyperparameters
random_search = RandomizedSearchCV(lgbm_model, param_distributions=param_grid, n_iter=15, scoring='roc_auc', cv=3, random_state=42)
random_search.fit(X_train_scaled, y_train)

# Best model from RandomizedSearchCV
best_model = random_search.best_estimator_

# Make predictions on the test set
y_pred_proba_test = best_model.predict_proba(X_test_scaled)[:, 1]
y_pred_test = best_model.predict(X_test_scaled)


# Evaluate the model's performance
roc_auc = roc_auc_score(y_test, y_pred_proba_test)
precision = precision_score(y_test, y_pred_test)
recall = recall_score(y_test, y_pred_test)
f1 = f1_score(y_test, y_pred_test)

print("\n=== Model Performance on Resampled Test Data ===")
print(f"AUC-ROC: {roc_auc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

# Make predictions on the validation data
y_pred_proba_validation = best_model.predict_proba(val_df_scaled)[:, 1]


# Create the submission file
submission_df = pd.DataFrame({
    'account_number': val_df['account_number'],
    'predicted_probability': y_pred_proba_validation
})

submission_df.to_csv('submission.csv', index=False)

print("\nSubmission file 'submission.csv' has been created.")

Development Data Loaded Successfully.
Validation Data Loaded Successfully.
[LightGBM] [Info] Number of positive: 51344, number of negative: 51394
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.095267 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 249389
[LightGBM] [Info] Number of data points in the train set: 102738, number of used features: 1005
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499757 -> initscore=-0.000973
[LightGBM] [Info] Start training from score -0.000973
[LightGBM] [Info] Number of positive: 51345, number of negative: 51394
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.102285 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 249521
[LightGBM] [Info] Number of data points in the train set: 102739, number of used features: 1005
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499762 -> i

In [2]:
import pandas as pd
import numpy as np
import os
import glob
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, accuracy_score
from lightgbm import LGBMClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

# Specify the directories for development and validation data
dev_data_path = r"/teamspace/studios/this_studio/heckme/Dev_data_to_be_shared_3"
val_data_path = r"/teamspace/studios/this_studio/heckme/validation_data_to_be_shared_3"

# Function to load and concatenate all CSV files from a directory
def load_all_csvs_from_directory(directory_path):
    all_files = glob.glob(os.path.join(directory_path, "*.csv"))
    if not all_files:
        raise FileNotFoundError(f"No CSV files found in the directory: {directory_path}")
    df_list = [pd.read_csv(file) for file in all_files]
    combined_df = pd.concat(df_list, ignore_index=True)
    return combined_df

# Load the development data
try:
    dev_df = load_all_csvs_from_directory(dev_data_path)
    print("Development Data Loaded Successfully.")
except FileNotFoundError as e:
    print(e)
    exit()

# Load the validation data
try:
    val_df = load_all_csvs_from_directory(val_data_path)
    print("Validation Data Loaded Successfully.")
except FileNotFoundError as e:
    print(e)
    exit()

# Store initial dev_df bad_flag value
if 'bad_flag' in dev_df:
    y = dev_df['bad_flag'].copy() # keep a copy of the bad_flag for SMOTE.
else:
    print("Error: 'bad_flag' column not found in the development data.")
    exit()


# Combine all data
all_data = pd.concat([dev_df.drop(columns=['bad_flag']), val_df], ignore_index=True)

# Drop empty columns
empty_cols = ['bureau_447', 'bureau_436']
all_data.drop(columns=empty_cols, inplace = True, errors = 'ignore')

# Determine the threshold for high missing columns
threshold_non_null = 20000

# Identify high missing columns
columns_to_drop = all_data.columns[all_data.isnull().sum() > (len(all_data) - threshold_non_null)]

# Drop high missing columns
all_data.drop(columns=columns_to_drop, inplace=True)

# Impute remaining missing values in the concatenated data
for col in all_data.columns:
        median_value = all_data[col].median()
        all_data[col] = all_data[col].fillna(median_value)

# Separate Dataframes again
dev_df = all_data[:len(dev_df)]
val_df = all_data[len(dev_df):]
val_df = val_df.reset_index(drop=True)
dev_df['bad_flag'] = y

# Separate features and target variable
X = dev_df.drop(columns = ['bad_flag', 'account_number'])
y = dev_df['bad_flag']

# Impute target variable (bad_flag) missing values
if y.isnull().any():
    median_y = y.median()
    y = y.fillna(median_y)

# Apply SMOTE for oversampling
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)
X_resampled = pd.DataFrame(X_resampled, columns = X.columns)

# Perform feature selection with Random Forest
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_selector = SelectFromModel(estimator=rf_model, threshold='median')
rf_selector.fit(X_resampled, y_resampled)

# Get the selected feature names
selected_features = X.columns[rf_selector.get_support()]

# Apply the feature selection on our training and validation dataframes
X_resampled_selected = rf_selector.transform(X_resampled)
val_df_selected = val_df[selected_features]
X_resampled_selected = pd.DataFrame(X_resampled_selected, columns = selected_features)

# Drop Empty Columns, after feature selection
empty_cols = ['bureau_447', 'bureau_436']
for col in empty_cols:
    if col in X_resampled_selected:
        X_resampled_selected = X_resampled_selected.drop(columns=col)
    if col in val_df_selected:
        val_df_selected = val_df_selected.drop(columns=col)

#Split Training Data into Train and Test
X_train, X_test, y_train, y_test = train_test_split(X_resampled_selected, y_resampled, test_size=0.2, random_state=42)

# Scale numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
val_df_scaled = scaler.transform(val_df_selected)

# Initialize the LightGBM model
lgbm_model = LGBMClassifier(random_state=42)

# Define hyperparameter grid for RandomizedSearchCV
param_grid = {
    'n_estimators': [200, 300, 400, 500, 600],
    'learning_rate': [0.01, 0.03, 0.05, 0.08, 0.1, 0.12],
    'num_leaves': [25, 30, 35, 40, 50, 60],
    'max_depth': [-1, 5, 10, 15, 20],
    'min_data_in_leaf': [20, 40, 60, 80],
    'feature_fraction': [0.6, 0.7, 0.8, 0.9],
    'bagging_fraction': [0.6, 0.7, 0.8, 0.9],
    'bagging_freq': [1, 3, 5]
}

# Perform RandomizedSearchCV to find the best hyperparameters
random_search = RandomizedSearchCV(lgbm_model, param_distributions=param_grid, n_iter=15, scoring='roc_auc', cv=3, random_state=42)
random_search.fit(X_train_scaled, y_train)

# Best model from RandomizedSearchCV
best_model = random_search.best_estimator_

# Make predictions on the test set
y_pred_proba_test = best_model.predict_proba(X_test_scaled)[:, 1]
y_pred_test = best_model.predict(X_test_scaled)

# Evaluate the model's performance
roc_auc = roc_auc_score(y_test, y_pred_proba_test) * 100
precision = precision_score(y_test, y_pred_test) * 100
recall = recall_score(y_test, y_pred_test) * 100
f1 = f1_score(y_test, y_pred_test) * 100
accuracy = accuracy_score(y_test, y_pred_test) * 100


print("\n=== Model Performance on Resampled Test Data ===")
print(f"AUC-ROC: {roc_auc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"Accuracy: {accuracy:.4f}")

# Make predictions on the validation data
y_pred_proba_validation = best_model.predict_proba(val_df_scaled)[:, 1]

# Convert to percentage strings
y_pred_proba_validation_percent = [f"{prob * 100:.6f}%" for prob in y_pred_proba_validation]

# Create the submission file
submission_df = pd.DataFrame({
    'account_number': val_df['account_number'],
    'predicted_probability': y_pred_proba_validation_percent
})

submission_df.to_csv('submission.csv', index=False)

print("\nSubmission file 'submission.csv' has been created.")

#Show first 10 rows of submission
print("\nFirst 10 rows of submission.csv:")
print(pd.read_csv('submission.csv').head(10))

Development Data Loaded Successfully.
Validation Data Loaded Successfully.


  dev_df['bad_flag'] = y
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dev_df['bad_flag'] = y


[LightGBM] [Info] Number of positive: 51344, number of negative: 51394
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.203219 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 249534
[LightGBM] [Info] Number of data points in the train set: 102738, number of used features: 1007
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499757 -> initscore=-0.000973
[LightGBM] [Info] Start training from score -0.000973
[LightGBM] [Info] Number of positive: 51345, number of negative: 51394
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.156811 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 249663
[LightGBM] [Info] Number of data points in the train set: 102739, number of used features: 1007
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499762 -> initscore=-0.000954
[LightGBM] [Info] Start training from score -0.000954
[L

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

def perform_credit_eda(dev_df):
    """
    Perform comprehensive EDA on credit card data
    """
    # Create a directory for saving plots
    import os
    if not os.path.exists('eda_plots'):
        os.makedirs('eda_plots')
    
    print("\n=== Basic Data Analysis ===")
    print(f"Total number of records: {len(dev_df)}")
    print(f"\nDefault Rate: {(dev_df['bad_flag'].mean() * 100):.2f}%")
    
    # 1. Missing Value Analysis
    print("\n=== Missing Value Analysis ===")
    missing_vals = dev_df.isnull().sum()
    missing_vals_pct = (missing_vals / len(dev_df)) * 100
    missing_df = pd.DataFrame({
        'Missing Values': missing_vals,
        'Percentage': missing_vals_pct
    }).sort_values('Percentage', ascending=False)
    
    print("\nTop 10 columns with highest missing values:")
    print(missing_df[missing_df['Percentage'] > 0].head(10))
    
    # 2. Target Variable Distribution
    plt.figure(figsize=(10, 6))
    sns.countplot(data=dev_df, x='bad_flag')
    plt.title('Distribution of Default vs Non-Default')
    plt.savefig('eda_plots/target_distribution.png')
    plt.close()
    
    # 3. Feature Type Analysis
    feature_types = pd.DataFrame({
        'Type': dev_df.dtypes,
        'Non-Null Count': dev_df.count(),
        'Unique Values': dev_df.nunique()
    })
    print("\n=== Feature Type Analysis ===")
    print(feature_types)
    
    # 4. Transaction Attributes Analysis
    transaction_cols = [col for col in dev_df.columns if col.startswith('transaction_attribute')]
    if transaction_cols:
        trans_stats = dev_df[transaction_cols].describe()
        print("\n=== Transaction Attributes Summary ===")
        print(trans_stats)
        
        # Boxplot for top 10 transaction attributes
        plt.figure(figsize=(15, 6))
        dev_df[transaction_cols[:10]].boxplot()
        plt.xticks(rotation=90)
        plt.title('Top 10 Transaction Attributes Distribution')
        plt.tight_layout()
        plt.savefig('eda_plots/transaction_distribution.png')
        plt.close()
    
    # 5. Bureau Attributes Analysis
    bureau_cols = [col for col in dev_df.columns if col.startswith('bureau') and not col.startswith('bureau_enquiry')]
    if bureau_cols:
        bureau_stats = dev_df[bureau_cols].describe()
        print("\n=== Bureau Attributes Summary ===")
        print(bureau_stats)
    
    # 6. Correlation Analysis
    # Select numerical columns
    numerical_cols = dev_df.select_dtypes(include=[np.number]).columns
    correlation_matrix = dev_df[numerical_cols].corr()
    
    # Plot correlation heatmap
    plt.figure(figsize=(12, 8))
    sns.heatmap(correlation_matrix, 
                cmap='coolwarm', 
                center=0,
                annot=False)
    plt.title('Correlation Heatmap')
    plt.tight_layout()
    plt.savefig('eda_plots/correlation_heatmap.png')
    plt.close()
    
    # 7. Top Correlations with Target
    if 'bad_flag' in numerical_cols:
        correlations_with_target = correlation_matrix['bad_flag'].sort_values(ascending=False)
        print("\n=== Top 10 Features Correlated with Default ===")
        print(correlations_with_target[1:11])  # Excluding bad_flag itself
        
        # Plot top correlations
        plt.figure(figsize=(12, 6))
        correlations_with_target[1:11].plot(kind='bar')
        plt.title('Top 10 Features Correlated with Default')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.savefig('eda_plots/top_correlations.png')
        plt.close()
    
    # 8. Statistical Tests
    print("\n=== Statistical Analysis ===")
    # Perform t-tests for numerical features
    significant_features = []
    for col in numerical_cols:
        if col != 'bad_flag' and col != 'account_number':
            try:
                t_stat, p_val = stats.ttest_ind(
                    dev_df[dev_df['bad_flag'] == 1][col].dropna(),
                    dev_df[dev_df['bad_flag'] == 0][col].dropna()
                )
                if p_val < 0.05:
                    significant_features.append({
                        'Feature': col,
                        'T-statistic': t_stat,
                        'P-value': p_val
                    })
            except:
                continue
    
    if significant_features:
        sig_df = pd.DataFrame(significant_features)
        sig_df = sig_df.sort_values('P-value')
        print("\nTop 10 Statistically Significant Features:")
        print(sig_df.head(10))
    
    # Save key findings to a file
    with open('eda_plots/eda_summary.txt', 'w') as f:
        f.write("=== Credit Card Default Analysis Summary ===\n\n")
        f.write(f"Total Records: {len(dev_df)}\n")
        f.write(f"Default Rate: {(dev_df['bad_flag'].mean() * 100):.2f}%\n\n")
        f.write("Files Generated:\n")
        f.write("1. target_distribution.png - Shows the distribution of defaults\n")
        f.write("2. transaction_distribution.png - Distribution of transaction attributes\n")
        f.write("3. correlation_heatmap.png - Correlation matrix visualization\n")
        f.write("4. top_correlations.png - Top features correlated with default\n")

    print("\nEDA completed! Check the 'eda_plots' directory for generated visualizations and eda_summary.txt for key findings.")

# Run the EDA function
print("Starting Exploratory Data Analysis...")
perform_credit_eda(dev_df)

Starting Exploratory Data Analysis...

=== Basic Data Analysis ===
Total number of records: 96806

Default Rate: 1.41%

=== Missing Value Analysis ===

Top 10 columns with highest missing values:
          Missing Values  Percentage
bad_flag           62295    64.35035

=== Feature Type Analysis ===
                            Type  Non-Null Count  Unique Values
account_number           float64           96806          34512
onus_attribute_1         float64           96806            723
transaction_attribute_1  float64           96806             25
transaction_attribute_2  float64           96806              4
transaction_attribute_3  float64           96806             25
...                          ...             ...            ...
0.602                    float64           96806              1
0.603                    float64           96806             10
0.604                    float64           96806             13
4.11                     float64           96806           


=== Transaction Attributes Summary ===
       transaction_attribute_1  transaction_attribute_2  \
count             96806.000000             96806.000000   
mean                  0.478297                 0.000331   
std                  50.122702                 0.023616   
min                   0.000000                 0.000000   
25%                   0.000000                 0.000000   
50%                   0.000000                 0.000000   
75%                   0.000000                 0.000000   
max               12000.000000                 4.000000   

       transaction_attribute_3  transaction_attribute_4  \
count             96806.000000             96806.000000   
mean                  0.421059                16.993931   
std                  47.185859              1266.942932   
min                   0.000000           -109800.476600   
25%                   0.000000                 0.000000   
50%                   0.000000                 0.000000   
75%            

  res = hypotest_fun_out(*samples, **kwds)



Top 10 Statistically Significant Features:
              Feature  T-statistic        P-value
403        0.546873.1  -184.447901   0.000000e+00
402          0.546873  -184.447901   0.000000e+00
400             0.560  -184.447901   0.000000e+00
390       0.128329412  -184.447901   0.000000e+00
389       0.536842424    22.090082  2.189646e-107
397      Unnamed: 314    22.090082  2.189646e-107
394      Unnamed: 305   -21.482421  1.056148e-101
391        0.59095129    21.482421  1.056148e-101
401       0.476586043    21.482421  1.056148e-101
330  onus_attribute_2    21.366433  1.233339e-100

EDA completed! Check the 'eda_plots' directory for generated visualizations and eda_summary.txt for key findings.
