2. Use a grouped bar chart to compare the average rating and total review count for the top 10 app categories
by number of installs. Filter out any categories where the average rating is below 4.0 and size below 10 M and last update should be Jan month . 
this graph should work only between 3PM IST to 5 PM IST apart from that time we should not show this graph in dashboard itself.


In [55]:
from datetime import datetime
import pytz
import pandas as pd

# Load the new dataset
apps_data = pd.read_csv('Play Store Data.csv')

# Convert data types and clean up
apps_data['Last Updated'] = pd.to_datetime(apps_data['Last Updated'], errors='coerce')
apps_data['Size'] = apps_data['Size'].replace('Varies with device', None)

In [3]:
#import pandas as pd

# Example data to demonstrate
#data = {'Size': ['1,000+', '500k', '1.5M', '20', '100', None, '']}
#apps_data = pd.DataFrame(data)

# Step 1: Clean the data in the 'Size' column
def clean_size_column(size):
    if pd.isnull(size) or size == '':  # Check for None, NaN, or empty strings
        return None
    try:
        # Remove commas and '+' symbols
        size = size.replace(',', '').replace('+', '')
        # Handle 'k' and 'M' (e.g., 500k -> 500000, 1.5M -> 1500000)
        if 'k' in size:
            size = float(size.replace('k', '')) * 1e3
        elif 'M' in size:
            size = float(size.replace('M', '')) * 1e6
        return float(size)
    except ValueError:
        return None  # Handle invalid cases gracefully

# Apply the cleaning function to the 'Size' column
apps_data['Size'] = apps_data['Size'].apply(clean_size_column)

# Check the result
print(apps_data)


                                                     App             Category  \
0         Photo Editor & Candy Camera & Grid & ScrapBook       ART_AND_DESIGN   
1                                    Coloring book moana       ART_AND_DESIGN   
2      U Launcher Lite – FREE Live Cool Themes, Hide ...       ART_AND_DESIGN   
3                                  Sketch - Draw & Paint       ART_AND_DESIGN   
4                  Pixel Draw - Number Art Coloring Book       ART_AND_DESIGN   
...                                                  ...                  ...   
10836                                   Sya9a Maroc - FR               FAMILY   
10837                   Fr. Mike Schmitz Audio Teachings               FAMILY   
10838                             Parkinson Exercices FR              MEDICAL   
10839                      The SCP Foundation DB fr nn5n  BOOKS_AND_REFERENCE   
10840      iHoroscope - 2018 Daily Horoscope & Astrology            LIFESTYLE   

       Rating Reviews      

In [5]:
apps_data.dropna(subset=['Size'], inplace=True)

In [7]:
apps_data['Size'].fillna(0, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  apps_data['Size'].fillna(0, inplace=True)


In [9]:
apps_data['Size'] = apps_data['Size'].replace(r'[kM]+$', '', regex=True).astype(float)

In [11]:
# Step 1: Convert to string, replace NaN with empty strings
apps_data['Size'] = apps_data['Size'].astype(str)

# Step 2: Extract size multiplier (k or M), replace with corresponding numeric values, and handle missing cases
apps_data['Multiplier'] = apps_data['Size'].str.extract(r'[\d\.]+([kM])', expand=False)\
    .replace({'k': 1 / 1024, 'M': 1}).fillna(1).astype(float)

# Step 3: Remove non-numeric characters, convert to numeric, and multiply by the multiplier
apps_data['Size'] = (
    apps_data['Size'].str.replace(r'[^\d\.]', '', regex=True)  # Remove non-numeric characters
    .replace('', '0')  # Handle empty strings
    .astype(float) * apps_data['Multiplier']
)

# Drop the 'Multiplier' column if no longer needed
apps_data.drop(columns=['Multiplier'], inplace=True)

# Check the result
print(apps_data)


                                                     App        Category  \
0         Photo Editor & Candy Camera & Grid & ScrapBook  ART_AND_DESIGN   
1                                    Coloring book moana  ART_AND_DESIGN   
2      U Launcher Lite – FREE Live Cool Themes, Hide ...  ART_AND_DESIGN   
3                                  Sketch - Draw & Paint  ART_AND_DESIGN   
4                  Pixel Draw - Number Art Coloring Book  ART_AND_DESIGN   
...                                                  ...             ...   
10835                                           FR Forms        BUSINESS   
10836                                   Sya9a Maroc - FR          FAMILY   
10837                   Fr. Mike Schmitz Audio Teachings          FAMILY   
10838                             Parkinson Exercices FR         MEDICAL   
10840      iHoroscope - 2018 Daily Horoscope & Astrology       LIFESTYLE   

       Rating Reviews        Size     Installs  Type Price Content Rating  \
0         

  .replace({'k': 1 / 1024, 'M': 1}).fillna(1).astype(float)


In [13]:
apps_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9146 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   App             9146 non-null   object        
 1   Category        9146 non-null   object        
 2   Rating          7730 non-null   float64       
 3   Reviews         9146 non-null   object        
 4   Size            9146 non-null   float64       
 5   Installs        9146 non-null   object        
 6   Type            9146 non-null   object        
 7   Price           9146 non-null   object        
 8   Content Rating  9145 non-null   object        
 9   Genres          9146 non-null   object        
 10  Last Updated    9145 non-null   datetime64[ns]
 11  Current Ver     9138 non-null   object        
 12  Android Ver     9143 non-null   object        
dtypes: datetime64[ns](1), float64(2), object(10)
memory usage: 1000.3+ KB


In [15]:
# Step 1: Replace non-numeric values with '0' and convert to string for safe regex operations
apps_data['Installs'] = apps_data['Installs'].fillna('0').astype(str)

# Step 2: Remove non-numeric characters (commas, plus signs)
apps_data['Installs'] = apps_data['Installs'].replace(r'[\+,]', '', regex=True)

# Step 3: Convert to integers
apps_data['Installs'] = pd.to_numeric(apps_data['Installs'], errors='coerce').fillna(0).astype(int)

# Check the result
print(apps_data)


                                                     App        Category  \
0         Photo Editor & Candy Camera & Grid & ScrapBook  ART_AND_DESIGN   
1                                    Coloring book moana  ART_AND_DESIGN   
2      U Launcher Lite – FREE Live Cool Themes, Hide ...  ART_AND_DESIGN   
3                                  Sketch - Draw & Paint  ART_AND_DESIGN   
4                  Pixel Draw - Number Art Coloring Book  ART_AND_DESIGN   
...                                                  ...             ...   
10835                                           FR Forms        BUSINESS   
10836                                   Sya9a Maroc - FR          FAMILY   
10837                   Fr. Mike Schmitz Audio Teachings          FAMILY   
10838                             Parkinson Exercices FR         MEDICAL   
10840      iHoroscope - 2018 Daily Horoscope & Astrology       LIFESTYLE   

       Rating Reviews        Size  Installs  Type Price Content Rating  \
0         4.1

In [27]:
apps_data['Reviews'] = apps_data['Reviews'].fillna('0').astype(str)
#apps_data['Reviews'] = apps_data['Reviews'].astype(int)
apps_data['Reviews'] = apps_data['Reviews'].str.replace(
    r'(\d+\.?\d*)([kM]+)', 
    lambda m: str(float(m.group(1)) * (1000 if m.group(2) == 'k' else 1_000_000)),
    regex=True
)


In [31]:
apps_data['Reviews'] = apps_data['Reviews'].astype(float)
print(apps_data)
apps_data.info()

                                                     App        Category  \
0         Photo Editor & Candy Camera & Grid & ScrapBook  ART_AND_DESIGN   
1                                    Coloring book moana  ART_AND_DESIGN   
2      U Launcher Lite – FREE Live Cool Themes, Hide ...  ART_AND_DESIGN   
3                                  Sketch - Draw & Paint  ART_AND_DESIGN   
4                  Pixel Draw - Number Art Coloring Book  ART_AND_DESIGN   
...                                                  ...             ...   
10835                                           FR Forms        BUSINESS   
10836                                   Sya9a Maroc - FR          FAMILY   
10837                   Fr. Mike Schmitz Audio Teachings          FAMILY   
10838                             Parkinson Exercices FR         MEDICAL   
10840      iHoroscope - 2018 Daily Horoscope & Astrology       LIFESTYLE   

       Rating   Reviews        Size  Installs  Type Price Content Rating  \
0         4

In [53]:
#apps_data['Installs'] = apps_data['Installs'].replace('[\+,]', '', regex=True).astype(float)
#apps_data['Reviews'] = apps_data['Reviews'].replace('[\+,]', '', regex=True).astype(float)

# Filter data based on conditions
filtered_data = apps_data[
    (apps_data['Rating'] >= 4.0) &
    (apps_data['Size'] >= 10) &
    (apps_data['Last Updated'].dt.month == 1)
]

# Aggregate data for the top 10 categories by number of installs
top_categories = (
    filtered_data.groupby('Category', as_index=False)
    .agg({'Rating': 'mean', 'Reviews': 'sum', 'Installs': 'sum'})
    .nlargest(10, 'Installs')
)

# Check current time and restrict to 3 PM to 5 PM IST
ist_now = datetime.now(pytz.timezone('Asia/Kolkata'))
if ist_now.hour >= 15 and ist_now.hour < 17:
    import plotly.graph_objects as go

    # Create grouped bar chart
    fig = go.Figure()

    # Add traces for average ratings and total reviews
    fig.add_trace(go.Bar(
        x=top_categories['Category'],
        y=top_categories['Rating'],
        name='Average Rating',
        marker_color='blue'
    ))
    fig.add_trace(go.Bar(
        x=top_categories['Category'],
        y=top_categories['Reviews'],
        name='Total Reviews',
        marker_color='green'
    ))

    # Update layout
    fig.update_layout(
        title='Comparison of Average Rating and Total Reviews for Top 10 Categories',
        xaxis_title='Category',
        yaxis_title='Value',
        barmode='group',
        plot_bgcolor='black',
        paper_bgcolor='black',
        font_color='white',
        width=800,
        height=600
    )

    # Show the plot
    fig.show()
else:
    print("This graph is only available between 3 PM and 5 PM IST.")




This graph is only available between 3 PM and 5 PM IST.
