## Overview of Data Distribution

### Step 1: Import Libraries and Load Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset. The path is relative to the notebook's working directory;
# replace 'Google-Playstore-Preprocessed.csv' with your own file if needed.
df = pd.read_csv('Google-Playstore-Preprocessed.csv')

# Display a summary of the dataset
print("Dataset Overview:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print("\nMissing Values:")
print(df.isnull().sum())

### Step 2: Visualize Data Distributions

In [None]:
# Numerical columns whose distributions are plotted (one histogram each).
numerical_features = ['Rating', 'Price', 'Rating Count']

for feature in numerical_features:
    # Draw on an explicit Axes instead of relying on pyplot's "current figure".
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.histplot(df[feature], kde=True, bins=30, color='skyblue', ax=ax)
    ax.set_title(f'Distribution of {feature}')
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')
    plt.show()

In [None]:
# Categorical column visualization: one horizontal bar per 'Category' value,
# ordered by how often each category occurs (value_counts() order).
# NOTE(review): newer seaborn releases reportedly warn when `palette` is passed
# without `hue` -- confirm against the installed seaborn version before changing.
category_order = df['Category'].value_counts().index

fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, y='Category', order=category_order, palette='viridis', ax=ax)
ax.set_title('Distribution of Apps by Category')
ax.set_xlabel('Number of Apps')
ax.set_ylabel('Category')
plt.show()


### Step 3: Feedback Loop for Data Transformation

In [None]:
# Handle Missing Values and Outliers
#
# Interactive step: the user picks one transformation, which is applied to `df`
# so that the following cells see the transformed columns.

# Provide feedback to the user
print("\nFeedback Options:")
print("1. Handle missing values (impute missing ratings).")
print("2. Cap outliers in 'Price' or 'Rating Count'.")
print("3. Proceed without transformation.")

# User input. Anything that is not a whole number is mapped to None so that it
# reaches the "Invalid choice" branch below instead of raising ValueError.
raw_choice = input("Enter your choice (1, 2, or 3): ")
try:
    choice = int(raw_choice)
except ValueError:
    choice = None

if choice == 1:
    # Handle missing values by imputing with the mean
    if df['Rating'].isnull().sum() > 0:
        mean_rating = df['Rating'].mean()
        # Assign the filled column back rather than calling
        # fillna(inplace=True) on the df['Rating'] selection: the chained
        # in-place form is deprecated in pandas 2.x and does not update `df`
        # when Copy-on-Write is enabled.
        df['Rating'] = df['Rating'].fillna(mean_rating)
        print(f"Missing 'Rating' values filled with mean: {mean_rating:.2f}")
    else:
        print("No missing values in 'Rating' to handle.")
elif choice == 2:
    # Handle outliers by capping at the 99th percentile
    price_cap = df['Price'].quantile(0.99)
    rating_count_cap = df['Rating Count'].quantile(0.99)
    # clip(upper=...) replaces values above the cap with the cap and leaves
    # everything else (including missing values) untouched.
    df['Price'] = df['Price'].clip(upper=price_cap)
    df['Rating Count'] = df['Rating Count'].clip(upper=rating_count_cap)
    print(f"'Price' and 'Rating Count' capped at 99th percentiles: {price_cap:.2f}, {rating_count_cap:.0f}")
elif choice == 3:
    print("Proceeding without transformation.")
else:
    print("Invalid choice. No transformations applied.")

### Step 4: Re-Evaluate Distributions Post-Transformation

In [None]:
# Plot every numerical feature again so these histograms can be compared with
# the ones drawn before the transformation step.
print("\nRe-evaluating distributions after transformations...")

for feature in numerical_features:
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.histplot(df[feature], kde=True, bins=30, color='lightcoral', ax=ax)
    ax.set_title(f'Revised Distribution of {feature}')
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')
    plt.show()

# Write the current state of `df` to disk (without the index column) for
# further use.
output_path = 'transformed_dataset.csv'
df.to_csv(output_path, index=False)
print("\nTransformed dataset saved as 'transformed_dataset.csv'.")
