In [3]:
import os
import webbrowser
from pathlib import Path

import nltk
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.io as pio  # Fixed typo: poi -> pio
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Download the VADER lexicon used for sentiment scoring
nltk.download('vader_lexicon', quiet=True)

# Load data
apps_df = pd.read_csv('Downloads/Play Store Data.csv')
reviews_df = pd.read_csv('Downloads/User Reviews.csv')

# Step 2: Data Cleaning
# Rows without a rating are dropped outright.
apps_df = apps_df.dropna(subset=['Rating'])

# Fill the remaining gaps with each column's most frequent value. The result
# is assigned back to the column: `apps_df[col].fillna(..., inplace=True)`
# acts on a temporary object, raises a chained-assignment warning, and does
# nothing under pandas copy-on-write.
for column in apps_df.columns:
    modes = apps_df[column].mode()
    if modes.empty:
        # The column has no non-null value, so there is nothing to fill with.
        continue
    apps_df[column] = apps_df[column].fillna(modes.iloc[0])

apps_df = apps_df.drop_duplicates()
# Keep only ratings up to 5. `.copy()` makes the filtered result an
# independent frame so the column assignments below do not write to a view.
apps_df = apps_df[apps_df['Rating'] <= 5].copy()
reviews_df = reviews_df.dropna(subset=['Translated_Review'])

# Convert Installs column to numeric by removing commas and +
# regex=False keeps ',' and '+' literal regardless of the pandas default.
apps_df['Installs'] = (
    apps_df['Installs']
    .astype(str)
    .str.replace(',', '', regex=False)
    .str.replace('+', '', regex=False)
    .astype(int)
)

# Convert Price column to numeric after removing $ ('$' is a regex anchor,
# so it must be matched literally).
apps_df['Price'] = (
    apps_df['Price']
    .astype(str)
    .str.replace('$', '', regex=False)
    .astype(float)
)

# Merge dataframes: one row per (app, review) pair for apps present in both
merged_df = pd.merge(apps_df, reviews_df, on='App', how='inner')

# Convert Size column
def convert_size(size):
    """Convert a size label such as ``'19M'`` or ``'512k'`` to megabytes.

    Args:
        size: Raw size value. Strings containing ``'M'`` are read as
            megabytes; strings containing ``'k'`` are read as kilobytes and
            divided by 1024. Surrounding whitespace and thousands separators
            (``','``) are ignored.

    Returns:
        The size in megabytes as a float, or ``np.nan`` when ``size`` is not
        a string, carries no recognised unit (e.g. ``'Varies with device'``),
        or its numeric part cannot be parsed.
    """
    if not isinstance(size, str):
        return np.nan

    text = size.strip().replace(',', '')
    if 'M' in text:
        number, divisor = text.replace('M', ''), 1
    elif 'k' in text:
        number, divisor = text.replace('k', ''), 1024
    else:
        # No unit marker: covers 'Varies with device' and any other label.
        return np.nan

    try:
        return float(number) / divisor
    except ValueError:
        # A malformed label must not abort a whole-column .apply().
        return np.nan

# Convert Size labels to megabytes (NaN where convert_size cannot parse them)
apps_df['Size'] = apps_df['Size'].apply(convert_size)

# Create derived features
apps_df['Reviews'] = apps_df['Reviews'].astype(int)
# Both log features add 1 before taking the log so that a count of 0 maps to
# 0.0 instead of -inf (np.log(0) also emits a RuntimeWarning).
apps_df['LogInstalls'] = np.log(apps_df['Installs'] + 1)
apps_df['LogReviews'] = np.log(apps_df['Reviews'] + 1)

def rating_group(rating):
    """Map a numeric rating to its descriptive rating bucket.

    Thresholds are checked from highest to lowest and the first one that
    ``rating`` meets or exceeds decides the label. Anything that meets none of
    them (including NaN, which fails every comparison) lands in the lowest
    bucket.
    """
    thresholds = (
        (4, 'Top rated app'),
        (3, 'Good rated app'),
        (2, 'Average rated app'),
    )
    for minimum, label in thresholds:
        if rating >= minimum:
            return label
    return 'Below average app'

# Numeric rating (unparseable values become NaN) and its descriptive bucket
apps_df['Rating_Original'] = pd.to_numeric(apps_df['Rating'], errors='coerce')
apps_df['Rating_Group'] = apps_df['Rating_Original'].map(rating_group)

# Revenue per app: installs multiplied by price
apps_df['Revenue'] = apps_df['Installs'] * apps_df['Price']

# Last-update timestamp (unparseable values become NaT) and its calendar year
apps_df['Last Updated'] = pd.to_datetime(apps_df['Last Updated'], errors='coerce')
apps_df['Year'] = apps_df['Last Updated'].dt.year

# VADER compound sentiment score for every translated review
sia = SentimentIntensityAnalyzer()
reviews_df["sentiment_Score"] = reviews_df["Translated_Review"].map(
    lambda review: sia.polarity_scores(str(review))['compound']
)

# Output directory for the generated HTML; created only when it is missing
html_files_path = "./"
if not os.path.exists(html_files_path):
    os.makedirs(html_files_path)

# Accumulated markup of the dashboard's plot containers
plot_containers = ""

# Presentation settings referenced by the figures and the dashboard
color_sequence = px.colors.sequential.Viridis  # palette for plots that use color_sequence
title_font_size, axis_font_size = 16, 12
plot_width, plot_height = 400, 300
bg_color, font_color = 'black', 'white'

def save_plot_as_html(fig, filename, insight):
    """Register ``fig`` on the dashboard and write it to its own HTML file.

    Appends a ``plot_container`` block (the figure's HTML plus the ``insight``
    text) to the module-level ``plot_containers`` string, then writes the
    figure to ``filename`` inside ``html_files_path``.

    Args:
        fig: Figure to export; it is passed to ``pio.to_html`` and must
            provide ``write_html``.
        filename: Output file name. Also used as the container's element id
            and as the target opened by the container's ``onclick`` handler.
        insight: Text placed in the container's ``insights`` div. It is
            inserted into the markup verbatim (no HTML escaping).
    """
    global plot_containers

    file_path = os.path.join(html_files_path, filename)

    # plotly.js is a multi-megabyte bundle. Embed it only with the first
    # container: all containers end up in the same page, so later ones reuse
    # the copy that is already loaded instead of repeating it per plot.
    plotly_js = False if plot_containers else 'inline'
    html_content = pio.to_html(fig, full_html=False, include_plotlyjs=plotly_js)

    # Add plot container block
    plot_containers += f"""
    <div class="plot_container" id="{filename}" onclick="openPlot('{filename}')">
        <div class="plot">{html_content}</div>
        <div class="insights">{insight}</div>
    </div>
    """

    # Write separate HTML file. It is opened on its own in a new tab, so it is
    # written as a complete document with its own copy of plotly.js.
    fig.write_html(file_path, full_html=True, include_plotlyjs="inline")

def _apply_dashboard_theme(fig, *, has_axes=True, tick_angle=None, bottom_margin=10):
    """Apply the layout shared by every dashboard figure and return ``fig``.

    Sets the plot/paper background, font colour, title font size and margins
    from the module-level ``bg_color``, ``font_color`` and ``title_font_size``.

    Args:
        fig: Figure whose ``update_layout`` is called.
        has_axes: When true, also set the x/y axis title font size from
            ``axis_font_size``. Pass ``False`` for figures that were styled
            without axis settings (the pie chart).
        tick_angle: Optional x-axis tick label angle; left unset when ``None``.
        bottom_margin: Bottom margin in pixels; rotated tick labels need more.
    """
    layout = dict(
        plot_bgcolor=bg_color,
        paper_bgcolor=bg_color,
        font_color=font_color,
        title_font={'size': title_font_size},
        margin=dict(l=10, r=10, t=50, b=bottom_margin),
    )
    if has_axes:
        xaxis = dict(title_font={'size': axis_font_size})
        if tick_angle is not None:
            xaxis['tickangle'] = tick_angle
        layout['xaxis'] = xaxis
        layout['yaxis'] = dict(title_font={'size': axis_font_size})
    fig.update_layout(**layout)
    return fig


# Figure 1: Top Categories
category_counts = apps_df['Category'].value_counts().nlargest(10)
fig1 = px.bar(
    x=category_counts.index,
    y=category_counts.values,
    labels={'x': 'Category', 'y': 'Count'},
    title='Top Categories on Play Store',
    color=category_counts.index,
    color_discrete_sequence=color_sequence,
    width=plot_width,
    height=plot_height
)
# Rotated category labels need the larger bottom margin.
_apply_dashboard_theme(fig1, tick_angle=45, bottom_margin=80)
save_plot_as_html(fig1, 'Category_Graph_1.html', 
                  "The top categories on Play Store are dominated by Tools, Entertainment, and Productivity apps")

# Figure 2: App Type Distribution
type_counts = apps_df['Type'].value_counts()
fig2 = px.pie(
    values=type_counts.values,
    names=type_counts.index,
    title='App Type Distribution',
    color_discrete_sequence=px.colors.sequential.RdBu,
    width=plot_width,
    height=plot_height
)
# The pie chart is styled without axis settings.
_apply_dashboard_theme(fig2, has_axes=False)
save_plot_as_html(fig2, "Type_Graph_2.html", 
                  "Most apps on Play Store are free, indicating a strategy to attract users first and monetize through ads or in-app purchases")

# Figure 3: Rating Distribution (using original numeric ratings)
fig3 = px.histogram(
    apps_df.dropna(subset=['Rating_Original']),
    x='Rating_Original',
    nbins=20,
    title='Rating Distribution',
    color_discrete_sequence=['#636EFA'],
    width=plot_width,
    height=plot_height
)
_apply_dashboard_theme(fig3)
save_plot_as_html(fig3, 'Rating_Graph_3.html', 
                  "Most apps have ratings between 4.0 and 4.5, indicating generally positive user satisfaction")

# Figure 4: Sentiment Distribution (create Sentiment column first)
def get_sentiment_label(score):
    """Classify a sentiment score as 'Positive', 'Negative' or 'Neutral'.

    Scores above 0.05 are positive, scores below -0.05 are negative, and
    everything else (including exactly +/-0.05) is neutral.
    """
    if score > 0.05:
        return 'Positive'
    elif score < -0.05:
        return 'Negative'
    else:
        return 'Neutral'

reviews_df['Sentiment'] = reviews_df['sentiment_Score'].apply(get_sentiment_label)
sentiment_counts = reviews_df['Sentiment'].value_counts()
fig4 = px.bar(
    x=sentiment_counts.index,
    y=sentiment_counts.values,
    labels={'x': 'Sentiment', 'y': 'Count'},
    title='Review Sentiment Distribution',
    color=sentiment_counts.index,
    color_discrete_sequence=px.colors.sequential.RdPu,
    width=plot_width,
    height=plot_height
)
_apply_dashboard_theme(fig4)
save_plot_as_html(fig4, 'Sentiment_Graph_4.html', 
                  "Majority of reviews are positive, followed by neutral and negative sentiments")

# Figure 5: Installs By Category
installs_by_category = apps_df.groupby('Category')['Installs'].sum().nlargest(10)
fig5 = px.bar(
    x=installs_by_category.values,
    y=installs_by_category.index,
    orientation='h',
    labels={'x': 'Installs', 'y': 'Category'},
    title='Top Categories by Installs',
    color=installs_by_category.values,
    color_continuous_scale=px.colors.sequential.Blues,
    width=plot_width,
    height=plot_height
)
_apply_dashboard_theme(fig5)
save_plot_as_html(fig5, 'Installs_Graph_5.html', 
                  "Communication and Social categories have the highest number of installs")

# Figure 6: App Updates Per Year
update_per_year = apps_df['Year'].value_counts().sort_index()
fig6 = px.line(
    x=update_per_year.index,
    y=update_per_year.values,
    labels={'x': 'Year', 'y': 'Number of Updates'},
    title='App Updates Per Year',
    width=plot_width,
    height=plot_height,
    color_discrete_sequence=[color_sequence[0]]
)
_apply_dashboard_theme(fig6)
save_plot_as_html(fig6, 'Updates_Graph_6.html', 
                  "App updates peaked in recent years, showing increased developer activity")

# Figure 7: Revenue By Category
revenue_by_category = apps_df.groupby('Category')['Revenue'].sum().nlargest(10)
fig7 = px.bar(
    x=revenue_by_category.values,
    y=revenue_by_category.index,
    orientation='h',
    labels={'x': 'Revenue', 'y': 'Category'},
    title='Top Categories by Revenue',
    color=revenue_by_category.values,
    color_continuous_scale=px.colors.sequential.Greens,
    width=plot_width,
    height=plot_height
)
_apply_dashboard_theme(fig7)
save_plot_as_html(fig7, 'Revenue_Graph_7.html', 
                  "Lifestyle and Productivity categories generate the highest revenue")

# Figure 8: Top Genres
# A ';'-separated Genres value counts once for each genre it lists.
genre_counts = apps_df['Genres'].str.split(';').explode().value_counts().nlargest(10)
fig8 = px.bar(
    x=genre_counts.index,
    y=genre_counts.values,
    labels={'x': 'Genre', 'y': 'Count'},
    title='Top 10 App Genres',
    color=genre_counts.index,
    color_discrete_sequence=px.colors.sequential.OrRd,
    width=plot_width,
    height=plot_height
)
_apply_dashboard_theme(fig8, tick_angle=45, bottom_margin=80)
save_plot_as_html(fig8, 'Genres_Graph_8.html', 
                  "Tools, Entertainment, and Education are the most common genres")

# Figure 9: Impact of Last Update on Ratings (using scatter plot)
# NOTE(review): head(500) keeps the first 500 rows in frame order, not a
# random sample - confirm that subset is representative enough for this plot.
fig9 = px.scatter(
    apps_df.dropna(subset=['Last Updated', 'Rating_Original']).head(500),  # Limit to 500 points for performance
    x='Last Updated',
    y='Rating_Original',
    color='Type',
    title='Impact of Last Update on Ratings',
    color_discrete_sequence=px.colors.qualitative.Vivid,
    width=plot_width,
    height=plot_height
)
_apply_dashboard_theme(fig9)
save_plot_as_html(fig9, 'Updates_Ratings_Graph_9.html', 
                  "Recently updated apps tend to have higher ratings")

# Figure 10: Rating for Paid vs Free Apps
fig10 = px.box(
    apps_df.dropna(subset=['Rating_Original']),
    x='Type',
    y='Rating_Original',
    color='Type',
    title='Ratings: Free vs Paid Apps',
    color_discrete_sequence=px.colors.qualitative.Pastel,
    width=plot_width,
    height=plot_height
)
_apply_dashboard_theme(fig10)
save_plot_as_html(fig10, 'Free_Paid_Graph_10.html', 
                  "Paid apps generally have higher ratings than free apps")

# Create dashboard HTML
# The template is rendered with str.format(): literal CSS/JS braces are doubled
# ({{ }}), while {plots}, {plot_width} and {plot_height} are substituted.
dashboard_html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Google Play Store Analytics Dashboard</title>
    <style>
        body {{
            font-family: Arial, sans-serif;
            background-color: #333;
            color: #fff;
            margin: 0;
            padding: 0;
        }}
        .header {{
            display: flex;
            align-items: center;
            justify-content: center;
            padding: 20px;
            background-color: #444;
        }}
        .header img {{
            margin: 0 20px;
            height: 60px;
        }}
        .header h1 {{
            margin: 0;
            text-align: center;
        }}
        .container {{
            display: flex;
            flex-wrap: wrap;
            justify-content: center;
            padding: 20px;
            gap: 20px;
        }}
        .plot_container {{
            border: 2px solid #555;
            margin: 10px;
            padding: 15px;
            width: {plot_width}px;
            height: {plot_height}px;
            overflow: hidden;
            position: relative;
            cursor: pointer;
            background-color: #222;
            border-radius: 10px;
            transition: transform 0.3s;
        }}
        .plot_container:hover {{
            transform: scale(1.02);
            border-color: #00bcd4;
        }}
        .insights {{
            display: none;
            position: absolute;
            right: 10px;
            top: 10px;
            background-color: rgba(0,0,0,0.8);
            padding: 10px;
            border-radius: 5px;
            color: #fff;
            font-size: 12px;
            max-width: 200px;
            border: 1px solid #555;
        }}
        .plot_container:hover .insights {{
            display: block;
        }}
        .plot {{
            width: 100%;
            height: 100%;
        }}
    </style>
    <script>
        function openPlot(filename) {{
            window.open(filename, '_blank');
        }}
    </script>
</head>
<body>
    <div class="header">
        <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/2/2f/Google_2015_logo.svg/320px-Google_2015_logo.svg.png" alt="Google Logo">
        <h1>Google Play Store Analytics Dashboard</h1>
        <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/7/78/Google_Play_Store_badge_EN.svg/320px-Google_Play_Store_badge_EN.svg.png" alt="Play Store Logo">
    </div>
    <div class="container">
        {plots}
    </div>
</body>
</html>
"""

# Set consistent plot dimensions for dashboard
# These values size the .plot_container boxes in the template. They are
# assigned after the figures were built, so they do not resize the figures.
plot_width = 420
plot_height = 350

final_html = dashboard_html.format(
    plots=plot_containers,
    plot_width=plot_width,
    plot_height=plot_height
)

# Save and open dashboard
dashboard_path = os.path.join(html_files_path, "playstore_dashboard.html")
with open(dashboard_path, "w", encoding="utf-8") as f:
    f.write(final_html)

print(f"Dashboard created: {dashboard_path}")
# Build the file:// URL with pathlib: prefixing 'file://' to an OS path gives
# a malformed URI on Windows (drive letter, backslashes) and leaves spaces and
# other special characters unencoded.
webbrowser.open(Path(dashboard_path).resolve().as_uri())


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





Dashboard created: ./playstore_dashboard.html


True