**Google Play Store Apps: Data Cleaning, Exploratory Analysis, and Rating Prediction**


Import Libraries

In [52]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

Load Dataset

In [53]:
# Load the raw CSV into a DataFrame; the path is resolved relative to the
# notebook's working directory.
DATA_PATH = "googleplaystore.csv"
df = pd.read_csv(DATA_PATH)

1. Data Cleaning and Preparation

# Handling missing values


In [54]:
# Discard every row that has a missing value in any column.
# Done in place so that later cells keep operating on the same `df` object.
df.dropna(axis=0, how='any', inplace=True)

# Removing duplicates

In [55]:
# Remove rows that are exact duplicates across all columns, keeping the
# first occurrence of each.
df.drop_duplicates(keep='first', inplace=True)

# Convert 'Reviews' to numerical format

In [56]:
# Cast the 'Reviews' column to integers; all other columns are left untouched.
df = df.astype({'Reviews': int})

# Convert 'Size' to numerical format

In [57]:
# Convert the textual 'Size' column to a float expressed in megabytes.
#
# Values carry an 'M' (megabytes) or 'k' (kilobytes) suffix, or are the
# placeholder 'Varies with device'. Stripping the suffix alone would make a
# '201k' app look like 201 MB, so kilobyte values are rescaled to megabytes.
KB_PER_MB = 1024  # NOTE(review): assumes binary kilobytes; use 1000 if the source reports decimal units.

size_text = df['Size'].replace('Varies with device', np.nan).astype(str)

# Remember which rows were given in kilobytes before the suffix is removed.
is_kilobytes = size_text.str.endswith('k')

size_mb = (
    size_text
    .str.replace('M', '', regex=False)
    .str.replace('k', '', regex=False)
    .astype(float)  # the placeholder became the string 'nan', which parses to NaN
)
size_mb = size_mb.where(~is_kilobytes, size_mb / KB_PER_MB)

# Impute sizes that were 'Varies with device' with the mean of the known sizes.
df['Size'] = size_mb.fillna(size_mb.mean())

# Remove non-numeric characters from 'Installs' column / Convert 'Installs' to numerical format

In [58]:
# Convert the textual 'Installs' column to integers by dropping every
# non-digit character (presumably thousands separators and a trailing '+').
#
# The pattern is written as a raw string so the backslash reaches the regex
# engine untouched; in a plain literal '\d' is an invalid escape sequence that
# newer Python versions warn about.
installs_text = df['Installs'].astype(str)
installs_digits = installs_text.str.replace(r'[^\d]', '', regex=True)
df['Installs'] = installs_digits.astype(int)




# Convert 'Price' to numerical format

In [59]:
# Convert the textual 'Price' column to floats by removing the dollar sign.
#
# regex=False is explicit because '$' is a regex end-of-string anchor and the
# default value of `regex` differs between pandas versions; a literal
# replacement behaves the same everywhere and avoids the FutureWarning.
price_text = df['Price'].astype(str)
df['Price'] = price_text.str.replace('$', '', regex=False).astype(float)


2. Exploratory Data Analysis (EDA)


# Distribution of app ratings

In [None]:
# Histogram of app ratings (20 bins) with a kernel density estimate overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['Rating'], bins=20, kde=True, ax=ax)
ax.set_title('Distribution of App Ratings')
ax.set_xlabel('Rating')
ax.set_ylabel('Frequency')
plt.show()


# Proportion of free vs. paid apps

In [None]:
# Pie chart of how many apps fall under each value of 'Type', labelled with
# percentages to one decimal place.
fig, ax = plt.subplots(figsize=(6, 6))
type_counts = df['Type'].value_counts()
type_counts.plot.pie(autopct='%1.1f%%', ax=ax)
ax.set_title('Proportion of Free vs. Paid Apps')
ax.set_ylabel('')  # hide the default y-label that pandas adds to pie charts
plt.show()

# Categories with the most apps

In [None]:
# Bar chart of the number of apps per category, most common category first.
fig, ax = plt.subplots(figsize=(12, 6))
category_counts = df['Category'].value_counts()
category_counts.plot.bar(ax=ax)
ax.set_title('Number of Apps in Each Category')
ax.set_xlabel('Category')
ax.set_ylabel('Number of Apps')
# Slant the category names so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.show()

 3. Category Analysis

# Highest average rating by category

In [63]:
# Mean rating per category, ordered from best to worst.
rating_means = df.groupby('Category')['Rating'].mean()
avg_rating_by_category = rating_means.sort_values(ascending=False)

# Report the category whose mean rating is the largest.
top_rated_category = avg_rating_by_category.idxmax()
print("Category with the Highest Average Rating:", top_rated_category)


Category with the Highest Average Rating: EVENTS


# Category with the most reviews

In [64]:
# Total number of reviews per category, ordered from most to fewest.
review_totals = df.groupby('Category')['Reviews'].sum()
most_reviews_by_category = review_totals.sort_values(ascending=False)

# Report the category with the largest review total.
most_reviewed_category = most_reviews_by_category.idxmax()
print("Category with the Most Reviews:", most_reviewed_category)


Category with the Most Reviews: GAME


# Category with the most apps

In [65]:
# Count apps per category and report the category with the highest count.
apps_per_category = df['Category'].value_counts()
most_apps_by_category = apps_per_category.idxmax()
print("Category with the Most Apps:", most_apps_by_category)

Category with the Most Apps: FAMILY


# Relationship between category and app size


In [None]:
# One box per category showing the spread of the 'Size' column.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df, x='Category', y='Size', ax=ax)
ax.set_title('App Size Distribution by Category')
ax.set_xlabel('Category')
ax.set_ylabel('Size (in MB)')
# Slant the category names so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.show()

# Relationship between category and app price

In [None]:
# One box per category showing the spread of the 'Price' column.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df, x='Category', y='Price', ax=ax)
ax.set_title('App Price Distribution by Category')
ax.set_xlabel('Category')
ax.set_ylabel('Price ($)')
# Slant the category names so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.show()


4. Rating Prediction Model

# Drop rows with missing values in relevant columns

In [68]:
# Columns the rating model reads (features plus the 'Rating' target); any row
# with a missing value in one of them is removed in place.
model_columns = ['Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Rating']
df.dropna(subset=model_columns, inplace=True)


# Split features and target variable

In [69]:
# Feature matrix: four numeric columns plus two categorical ones
# ('Type', 'Content Rating'). Target vector: the app rating.
feature_columns = ['Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating']
target_column = 'Rating'

X = df[feature_columns]
y = df[target_column]

# Split data into training and testing sets

In [70]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Preprocess categorical features

In [71]:
# One-hot encode the categorical features and pass the numeric ones through.
#
# handle_unknown='ignore' matters here: with the default ('error'), `transform`
# raises a ValueError if the test split contains a category that never appears
# in the training split, which can happen for rare category levels. With
# 'ignore', such rows are encoded as all zeros for that feature.
#
# NOTE: this cell overwrites X_train / X_test with the encoded arrays, so the
# train/test split cell must be re-run before running this cell a second time.
categorical_features = ['Type', 'Content Rating']
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ],
    remainder='passthrough',
)
X_train = preprocessor.fit_transform(X_train)  # learn the categories from training data only
X_test = preprocessor.transform(X_test)        # reuse the fitted encoding on the held-out data

# Build and train the model

In [None]:
# Ordinary least-squares regression fitted on the encoded training features.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

# Evaluate the model

In [None]:
# Predict ratings for the held-out rows and report the mean squared error
# against the true ratings.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Mean Squared Error:", mse)

5. Trend Analysis


# Convert 'Last Updated' column to datetime

In [None]:
# Parse the 'Last Updated' column into pandas datetimes so the `.dt`
# accessors can be used on it afterwards.
df['Last Updated'] = df['Last Updated'].pipe(pd.to_datetime)

# Extract year from 'Last Updated' column

In [75]:
# Derive the calendar year of each app's last update; requires 'Last Updated'
# to already be a datetime column.
last_updated = df['Last Updated']
df['Year'] = last_updated.dt.year


# Plotting trend of top apps characteristics over time

In [None]:
# Yearly trend of rating, review count and install count.
#
# seaborn's lineplot aggregates repeated x values (here: all apps updated in
# the same year) to their mean and draws a confidence band around it.
# The three metrics differ by many orders of magnitude (ratings are single
# digits, installs can reach the billions), so they are drawn on a log-scaled
# y-axis; on a linear axis the smaller series collapse onto the x-axis.
fig, ax = plt.subplots(figsize=(12, 6))
for metric in ('Rating', 'Reviews', 'Installs'):
    sns.lineplot(data=df, x='Year', y=metric, label=metric, ax=ax)

ax.set_yscale('log')
ax.set_title('Trend Analysis of Top Apps Characteristics Over Time')
ax.set_xlabel('Year')
# The lines show per-year means, not counts, so the axis is labelled accordingly.
ax.set_ylabel('Mean value per year (log scale)')
ax.legend()
plt.show()

 6. Impact of Reviews on Ratings

# Calculate correlation between 'Reviews' and 'Rating'

In [None]:
# Pearson correlation between the number of reviews and the rating.
#
# The previous version called `pearsonr`, which is never imported in this
# notebook and therefore raised a NameError on a fresh kernel. pandas'
# Series.corr computes the same Pearson coefficient (its default method)
# without an extra dependency.
corr = df['Reviews'].corr(df['Rating'])
print("Correlation between Reviews and Rating:", corr)

# Plot hexbin plot of 'Reviews' vs 'Rating'

In [None]:
# Hexagonal 2-D histogram of review count against rating; the colour of each
# cell encodes how many apps fall into it.
fig, ax = plt.subplots(figsize=(8, 6))
hexes = ax.hexbin(df['Reviews'], df['Rating'], gridsize=30, cmap='viridis')
fig.colorbar(hexes, ax=ax, label='count')
ax.set_title('Hexbin Plot of Reviews vs Rating')
ax.set_xlabel('Reviews')
ax.set_ylabel('Rating')
plt.show()