## 5. Association Between Features

### Step 1: Import Libraries and Load Data

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Load dataset (replace 'dataset.csv' with your file path)
df = pd.read_csv('dataset.csv')

# Display dataset overview.
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
print("Dataset Overview:")
df.info()


### Step 2: Data Preprocessing

In [None]:
# Select Relevant Columns and Handle Missing Values

# Select numerical columns for correlation analysis
columns_of_interest = ['Rating', 'Size', 'Price', 'Minimum Installs', 'Maximum Installs']

# Take an explicit copy: the column assignments below must modify an
# independent frame, not a slice of `df` (avoids SettingWithCopyWarning and
# keeps the raw frame untouched).
df_subset = df[columns_of_interest].copy()

# Coerce 'Size' and 'Price' to numeric; values that cannot be parsed become NaN.
# NOTE(review): if these columns hold formatted strings (unit suffixes, currency
# symbols), every such value turns into NaN here -- confirm the raw format.
df_subset['Size'] = pd.to_numeric(df_subset['Size'], errors='coerce')
df_subset['Price'] = pd.to_numeric(df_subset['Price'], errors='coerce')

# Impute BEFORE dropping rows. With dropna() first, no NaN would be left for
# fillna() to act on and every row with a missing price/size would be lost.
df_subset['Price'] = df_subset['Price'].fillna(0)  # Replace missing price with 0
df_subset['Size'] = df_subset['Size'].fillna(df_subset['Size'].mean())  # Replace missing size with mean

# Drop rows that are still incomplete (missing values in the remaining columns)
df_subset = df_subset.dropna()


### Step 3: Visualize Correlations Using Heatmap

In [None]:
# Pairwise correlation of every column in the prepared subset
correlation_matrix = df_subset.corr()

# Draw the matrix as an annotated heatmap on an explicit figure/axes pair;
# the colour scale is pinned to the full [-1, 1] correlation range
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    ax=ax,
)
ax.set_title('Correlation Between Features', fontsize=16)
plt.show()

# Keep only coefficients whose magnitude exceeds 0.7; all other cells become NaN
is_strong = correlation_matrix.abs() > 0.7
strong_correlations = correlation_matrix.where(is_strong)
print("Strong correlations (> 0.7 or < -0.7):\n", strong_correlations)


### Step 4: Feedback Loop with User Interaction

In [None]:
# Suggest Feature Engineering Based on Correlations

print("\nFeedback Options:")
print("1. Create a new feature: Price per Install.")
print("2. Filter data by specific criteria (e.g., apps with ratings above 4.0).")
print("3. Remove highly correlated features and reanalyze.")
print("4. Proceed without changes.")

# User input. Anything that is not an integer is treated like an out-of-range
# number and falls through to the "Invalid choice" branch below instead of
# aborting the cell with a ValueError.
try:
    choice = int(input("Enter your choice (1, 2, 3, or 4): "))
except ValueError:
    choice = None

if choice == 1:
    # Create a new feature: Price per Install.
    # Division by zero yields +/-inf (x / 0) or NaN (0 / 0); both are mapped to 0.
    # The cleaned series is assigned back as a whole column: calling
    # replace(inplace=True) on a selected column is chained assignment and is
    # not guaranteed to update df_subset.
    price_per_install = df_subset['Price'] / df_subset['Maximum Installs']
    df_subset['Price per Install'] = (
        price_per_install.replace([np.inf, -np.inf], np.nan).fillna(0)
    )
    print("Feature 'Price per Install' added. Recalculating correlations...")
elif choice == 2:
    # Filter data based on criteria
    df_subset = df_subset[df_subset['Rating'] > 4.0]
    print("Filtered data for apps with ratings above 4.0. Recalculating correlations...")
elif choice == 3:
    # Remove one feature of each pair of highly correlated features.
    # The matrix is recomputed from the current df_subset so this cell does not
    # rely on a possibly stale matrix left behind by an earlier cell.
    corr_threshold = 0.7
    current_corr = df_subset.corr()
    feature_names = list(current_corr.columns)
    removed_any = False
    # Visit each unordered pair once (upper triangle only) so a pair is not
    # processed twice as (a, b) and (b, a).
    for position, first in enumerate(feature_names):
        for second in feature_names[position + 1:]:
            # Skip pairs already resolved by an earlier removal
            if first not in df_subset.columns or second not in df_subset.columns:
                continue
            if abs(current_corr.loc[first, second]) > corr_threshold:
                df_subset = df_subset.drop(columns=[first])
                removed_any = True
                print(f"Removed highly correlated feature: {first}")
    if not removed_any:
        print("No highly correlated features found. Proceeding without changes.")
elif choice == 4:
    print("Proceeding without changes.")
else:
    print("Invalid choice. Proceeding without changes.")


### Step 5: Recalculate and Visualize Updated Correlations

In [None]:
# Correlations of the subset as it stands after the feedback step
updated_correlation_matrix = df_subset.corr()

# Render the refreshed matrix on its own figure/axes pair, using the same
# fixed [-1, 1] colour scale as the first heatmap so the two are comparable
updated_fig, updated_ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    updated_correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    ax=updated_ax,
)
updated_ax.set_title('Updated Correlation Between Features', fontsize=16)
plt.show()

# Retain only coefficients with magnitude above 0.7; the rest are shown as NaN
updated_is_strong = updated_correlation_matrix.abs() > 0.7
updated_strong_correlations = updated_correlation_matrix.where(updated_is_strong)
print("Updated Strong correlations (> 0.7 or < -0.7):\n", updated_strong_correlations)
