In [None]:
# Graph of Income Distribution
#
# Build the plotting frame in a single chain from df_e (df_e itself is not modified):
#   1. coerce AMT_INCOME_TOTAL to numeric (unparseable entries become NaN),
#   2. drop rows whose income is missing -- done AFTER the coercion so values that
#      only became NaN through coercion are removed as well,
#   3. cap income at 1 million so extreme values do not stretch the x-axis,
#   4. add the income expressed in hundreds of thousands for the x-axis.
# Using assign() on a fresh frame instead of column assignment on the result of
# dropna() also avoids pandas' SettingWithCopyWarning.
df_cleaned = (
    df_e
    .assign(AMT_INCOME_TOTAL=lambda frame: pd.to_numeric(frame['AMT_INCOME_TOTAL'], errors='coerce'))
    .dropna(subset=['AMT_INCOME_TOTAL'])
    .assign(AMT_INCOME_TOTAL=lambda frame: frame['AMT_INCOME_TOTAL'].clip(upper=1000000))
    .assign(AMT_INCOME_TOTAL_hundreds_thousands=lambda frame: frame['AMT_INCOME_TOTAL'] / 100000)
)

# Plot a histogram of the capped, rescaled income
plt.figure(figsize=(4, 3))
plt.hist(df_cleaned['AMT_INCOME_TOTAL_hundreds_thousands'], bins=30, color='skyblue', edgecolor='black', alpha=0.7)
# NOTE(review): the title says 'Income Explosion' while this cell's header says
# 'Income Distribution' -- confirm the intended wording.
plt.title('Income Explosion')
plt.xlabel('Income (Hundreds of Thousands)')
plt.ylabel('Frequency')
plt.grid(True, linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()


In [None]:
# Two side-by-side panels: positive correlations on the left, negative on the right
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 6))

# Left panel: horizontal bars for top_positive_correlations (index -> bar label, value -> bar length)
axes[0].barh(
    y=top_positive_correlations.index,
    width=top_positive_correlations.values,
    color='red',
)
axes[0].set(title='Top 30 Positive Correlations', xlabel='Correlation', ylabel='Features')

# Right panel: same layout for top_negative_correlations
axes[1].barh(
    y=top_negative_correlations.index,
    width=top_negative_correlations.values,
    color='gray',
)
axes[1].set(title='Top 30 Negative Correlations', xlabel='Correlation', ylabel='Features')

# Tighten the spacing so labels do not overlap, then render
plt.tight_layout()
plt.show()


In [None]:
# Graph of the top 20 columns with missing data

# Keep only the first 20 rows of missing_data
# (assumes missing_data is already ordered by share of missing values -- TODO confirm)
top_20_missing_data = missing_data.head(20)

# Create a larger figure so the column labels have room
plt.figure(figsize=(10, 8))

# Horizontal bar plot, drawn explicitly on the figure created above.
# Without ax=..., pandas opens its own figure when plotting a DataFrame, which
# leaves the (10, 8) figure empty and ignores the requested size.
top_20_missing_data.plot(kind='barh', color='red', ax=plt.gca())

# Title and axis labels
plt.title('Top 20 Columns with Missing Data', fontsize=14)
plt.xlabel('Percentage Missing', fontsize=12)
plt.ylabel('Columns', fontsize=12)

# Smaller tick labels so long column names fit
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)

# Extra padding around the axes so labels are not clipped at the figure edge
plt.tight_layout(pad=3.0)

plt.show()

In [None]:
# Define the training features (X_train) and target (y_train) from the downsampled frame
X_train = df_downsampled.drop(columns=['TARGET'])
y_train = df_downsampled['TARGET']

# Remove TARGET from the test frame in place. errors='ignore' keeps this cell
# re-runnable: once the column is gone (or if the test frame never had one),
# a plain drop would raise KeyError.
df_e_encoded.drop(columns=['TARGET'], inplace=True, errors='ignore')

# X_test is the same object as df_e_encoded (not a copy)
X_test = df_e_encoded
y_test = None  # You don't have the ground truth labels for the test set


# Check the shape of the resulting sets
print("Training set shape:", X_train.shape, y_train.shape)

In [None]:
# Drop the 'ORGANIZATION_TYPE', 'NAME_CASH_LOAN_PURPOSE' and 'NAME_GOODS_CATEGORY' columns from both X_train and X_val (currently disabled)
#X_train.drop(columns=['ORGANIZATION_TYPE'], inplace=True, errors='ignore')  
#X_train.drop(columns=['NAME_CASH_LOAN_PURPOSE'], inplace=True, errors='ignore') 
#X_train.drop(columns=['NAME_GOODS_CATEGORY'], inplace=True, errors='ignore') 
#X_val.drop(columns=['ORGANIZATION_TYPE'], inplace=True, errors='ignore')  
#X_val.drop(columns=['NAME_CASH_LOAN_PURPOSE'], inplace=True, errors='ignore')  
#X_val.drop(columns=['NAME_GOODS_CATEGORY'], inplace=True, errors='ignore') 
# Initialize logistic regression model
# model = LogisticRegression(random_state=42, max_iter=1000)

# Fit the model on training data
# model.fit(X_train, y_train)

# Make predictions on validation set
# y_pred = model.predict(X_val)
# y_proba = model.predict_proba(X_val)[:, 1]

# Calculate evaluation metrics
# accuracy = accuracy_score(y_val, y_pred)
# recall = recall_score(y_val, y_pred)
# f1 = f1_score(y_val, y_pred)
# roc_auc = roc_auc_score(y_val, y_proba)

# Print the evaluation metrics
# print("Performance Metrics:")
# print("Accuracy:", accuracy)
# print("Recall:", recall)
# print("F1-score:", f1)
# print("AUC-ROC Score:", roc_auc)

In [None]:
# Hold out a validation set and fit baseline models on the training part

# Features (X) and target (y) from the downsampled frame
X = df_downsampled.drop(columns=['TARGET'])
y = df_downsampled['TARGET']

# Split the data into training (80%) and validation (20%) sets.
# Note: this rebinds X_train / y_train to the training part of the split.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the shape of the resulting sets
print("Training set shape:", X_train.shape, y_train.shape)
print("Validation set shape:", X_val.shape, y_val.shape)

# Initialize models with library-default hyperparameters.
# (A tuned RandomForestClassifier used to be constructed here and was immediately
# overwritten by this default one before being fitted; that dead assignment has
# been removed so the code shows the configuration that is actually evaluated.)
random_forest = RandomForestClassifier(random_state=42)
gradient_boosting = GradientBoostingClassifier(random_state=42)

# Fit models
random_forest.fit(X_train, y_train)
gradient_boosting.fit(X_train, y_train)

# Predict class labels for the validation set
y_pred_rf = random_forest.predict(X_val)
y_pred_gb = gradient_boosting.predict(X_val)

# Metric functions applied to both models in the next cell
metrics_rf_gb = [accuracy_score, recall_score, f1_score]

# Filled in by the next cell: model label -> {metric name -> value}
results = {}

In [None]:
# Score each model's validation predictions with every function in metrics_rf_gb.
# Entries are stored as: model label -> {metric function name -> score}.
results.update({
    label: {scorer.__name__: scorer(y_val, predictions) for scorer in metrics_rf_gb}
    for label, predictions in (
        ('Random Forest', y_pred_rf),
        ('Gradient Boosting', y_pred_gb),
    )
})

# Report one section per model, separated by a blank line
for model, metrics in results.items():
    print(f"{model} Metrics:")
    for metric, value in metrics.items():
        print(f"{metric}: {value}")
    print()

In [None]:
# Build and fit both models on the current X_train / y_train
random_forest = RandomForestClassifier(
    n_estimators=800,
    max_depth=60,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=144,
).fit(X_train, y_train)

gradient_boosting = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)

# Positive-class probabilities for the test set (column 1 of predict_proba)
# NOTE(review): y_pred_rf_proba is not used in this cell; only the gradient
# boosting probabilities are exported below -- confirm that is intended.
y_pred_rf_proba = random_forest.predict_proba(X_test)[:, 1]
y_pred_gb_proba = gradient_boosting.predict_proba(X_test)[:, 1]

# Pair each SK_ID_CURR from df_e_encoded with its gradient boosting probability
predictions_df = df_e_encoded[['SK_ID_CURR']].assign(TARGET=y_pred_gb_proba)

# Write the result to CSV without the index column
predictions_df.to_csv('predictions.csv', index=False)


In [None]:
# Fit a fresh Gradient Boosting model on the current training data
gradient_boosting = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)

# Per-feature importances reported by the fitted model
feature_importances = gradient_boosting.feature_importances_

# One row per column of X_train, ordered from most to least important
importance_df = (
    pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)

# Show the table (optionally persist it instead:
#   importance_df.to_csv('feature_importance.csv', index=False))
print(importance_df)


In [None]:
import matplotlib.pyplot as plt

# Bar chart of the ten highest-ranked rows of importance_df
plt.figure(figsize=(10, 6))

# Rows 1-5 are drawn in red, rows 6-10 in gray
top_five = importance_df.iloc[:5]
next_five = importance_df.iloc[5:10]
plt.bar(top_five['Feature'], top_five['Importance'], color='red')
plt.bar(next_five['Feature'], next_five['Importance'], color='gray')

plt.xlabel('Feature')
plt.ylabel('Importance')
plt.title('Top 10 Feature Importances')
# Vertical tick labels so long feature names stay readable
plt.xticks(rotation=90)
plt.show()