## 7. Price Analysis of Free vs. Paid Apps

### Step 1: Import Libraries and Load Data

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Load dataset (replace 'dataset.csv' with your file path)
df = pd.read_csv('dataset.csv')

# Overview of dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
print("Dataset Overview:")
df.info()


### Step 2: Data Preprocessing

In [None]:
# Coerce the three analysis columns to numeric. Values that cannot be parsed
# become NaN (errors='coerce') and are then imputed:
#   - Price / Minimum Installs: missing or unparseable -> 0
#     (so such apps are counted as Free / zero installs downstream)
#   - Rating: missing or unparseable -> mean of the valid ratings
df['Price'] = pd.to_numeric(df['Price'], errors='coerce').fillna(0)
# Rating gets the same numeric coercion as the other columns so that .mean()
# is always computed on a numeric Series, even if the CSV column was read as text.
rating_numeric = pd.to_numeric(df['Rating'], errors='coerce')
df['Rating'] = rating_numeric.fillna(rating_numeric.mean())
df['Minimum Installs'] = pd.to_numeric(df['Minimum Installs'], errors='coerce').fillna(0)

# Create a new column to classify apps as Free (0) or Paid (1).
# Vectorized comparison instead of a per-row Python lambda; Price has no NaN
# at this point because of the fillna(0) above.
df['Paid'] = df['Price'].gt(0).astype('int64')

# Verify preprocessing
print(df[['Price', 'Paid']].head())


### Step 3: Visualization

In [None]:
# Compare Free vs. Paid app prices using box plots.
# The same figure is drawn twice: first with outliers hidden, then with
# outliers shown, so only the showfliers flag and the title differ.
for show_outliers, plot_title in [
    (False, 'Price Distribution: Free vs. Paid Apps'),
    (True, 'Price Distribution (With Outliers): Free vs. Paid Apps'),
]:
    plt.figure(figsize=(10, 6))
    # order=[0, 1] pins Free to x-position 0 and Paid to x-position 1 even if
    # one of the two categories is absent from the data; without it the
    # hard-coded tick labels below could be attached to the wrong box.
    sns.boxplot(data=df, x='Paid', y='Price', order=[0, 1], showfliers=show_outliers)
    plt.xticks([0, 1], ['Free', 'Paid'])
    plt.title(plot_title)
    plt.xlabel('App Type')
    plt.ylabel('Price')
    plt.show()


### Step 4: Feedback Loop for Deeper Insights

In [None]:
# User interaction to focus on a specific install range.
# The result is stored in `filtered_df`, which the plots that follow read.

# Prompt user for interaction
print("\nFeedback Options:")
print("1. Focus on apps with installs over 1 million.")
# Option 2 uses <= below, so the menu text states "under or equal" to match
# the filter and the confirmation message.
print("2. Focus on apps with installs under or equal to 1 million.")
print("3. Proceed with the entire dataset.")

# int() raises ValueError for non-numeric or empty input; map that to an
# invalid choice so the fallback branch below handles it instead of crashing.
try:
    choice = int(input("Enter your choice (1, 2, or 3): "))
except ValueError:
    choice = None

if choice == 1:
    filtered_df = df[df['Minimum Installs'] > 1_000_000]
    print(f"Filtered dataset contains {len(filtered_df)} apps with installs over 1 million.")
elif choice == 2:
    filtered_df = df[df['Minimum Installs'] <= 1_000_000]
    print(f"Filtered dataset contains {len(filtered_df)} apps with installs under or equal to 1 million.")
elif choice == 3:
    filtered_df = df
    print("Using the entire dataset.")
else:
    print("Invalid choice. Proceeding with the entire dataset.")
    filtered_df = df

# Visualization for filtered dataset: price by app type, outliers hidden.
plt.figure(figsize=(10, 6))
# The filtered subset may contain only Free or only Paid apps. order=[0, 1]
# keeps Free at x-position 0 and Paid at x-position 1 in that case, so the
# hard-coded tick labels below always sit under the right box.
sns.boxplot(data=filtered_df, x='Paid', y='Price', order=[0, 1], showfliers=False)
plt.xticks([0, 1], ['Free', 'Paid'])
plt.title('Price Distribution: Free vs. Paid Apps (Filtered)')
plt.xlabel('App Type')
plt.ylabel('Price')
plt.show()


### Step 5: Analysis of Ratings for Free vs. Paid Apps

In [None]:
# Visualize the rating distribution by app type for the filtered dataset
# (outliers hidden).

plt.figure(figsize=(10, 6))
# order=[0, 1] fixes Free at x-position 0 and Paid at x-position 1 even when
# the filtered data contains only one of the two categories, so the tick
# labels set below cannot end up on the wrong box.
sns.boxplot(data=filtered_df, x='Paid', y='Rating', order=[0, 1], showfliers=False)
plt.xticks([0, 1], ['Free', 'Paid'])
plt.title('Rating Distribution: Free vs. Paid Apps (Filtered)')
plt.xlabel('App Type')
plt.ylabel('Rating')
plt.show()
