In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
color = sns.color_palette()
import warnings
warnings.filterwarnings("ignore")

In [1]:
# Load the Play Store dataset from a CSV file in the working directory.
# NOTE(review): this cell needs `pd` from the imports cell; run the imports first
# (running it before them produces the NameError shown below).
data = pd.read_csv('playstore_analysis.csv')

NameError: name 'pd' is not defined

In [None]:
# Preview the first rows of the raw frame.
data.head()

In [None]:
# Column names, non-null counts and dtypes of the raw frame.
data.info()

In [None]:
# Summary statistics for the raw frame.
data.describe()

# 1. Data Cleaning – Missing value treatment
a. Drop records where rating is missing since rating is our target/study variable

In [None]:
# Per-column count of missing entries in the raw frame.
print("Missing Values :")
data.isna().sum()

In [None]:
# Rating is the study variable, so keep only the records that have one.
has_rating = data['Rating'].notna()
data1 = data[has_rating]

In [None]:
# Per-column count of missing entries after dropping unrated records.
print("Missing Values :")
data1.isna().sum()

b. Check the null values for the Android Ver column.

i. Are all 3 records having the same problem?

Yes

In [None]:
# Number of records with no 'Android Ver' value.
android_ver_missing = data1['Android Ver'].isna().sum()
print(f"Missing Values : {android_ver_missing}")

In [None]:
# Show every record that still has at least one missing field.
row_has_null = data1.isnull().any(axis='columns')
data1.loc[row_has_null]

ii. Drop the 3rd record i.e. record for “Life Made WIFI …”

In [None]:
# Drop the malformed record by app name instead of a hard-coded row label, so the
# cell does not depend on which index label that row happens to carry.
BAD_APP = 'Life Made WI-Fi Touchscreen Photo Frame'
data2 = data1[data1["App"] != BAD_APP]
# Verification: this should print an empty frame.
verify = data2[data2["App"] == BAD_APP]
print(verify)

iii. Replace remaining missing values with the mode.

In [None]:
# Most frequent Android version among the remaining records.
android_ver_mode = data2['Android Ver'].mode()[0]

# Fill only the 'Android Ver' column. A frame-wide fillna would also write this
# value into every other column that has missing entries (e.g. 'Current Ver').
data3 = data2.copy()
data3['Android Ver'] = data3['Android Ver'].fillna(android_ver_mode)

# Verification: show the rows whose 'Android Ver' was missing before the fill.
was_missing = data2['Android Ver'].isna()
print(data3.loc[was_missing])

c. Current ver – replace with most common value

In [None]:
# Count the missing 'Current Ver' entries, replace them with the column's most
# common value, then confirm that none remain.
print(f"Missing Values :{data3['Current Ver'].isnull().sum()}")
current_ver_mode = data3['Current Ver'].mode()[0]
data3['Current Ver'] = data3['Current Ver'].fillna(current_ver_mode)
print(f"Missing Values after fill :{data3['Current Ver'].isnull().sum()}")

# 2. Data clean up – correcting the data types
a. Which all variables need to be brought to numeric types?

In [None]:
# Cast the count-like columns to integers.
for int_col in ('Reviews', 'Size'):
    data3[int_col] = data3[int_col].astype(int)

b. Price variable – remove $ sign and convert to float

In [None]:
# Strip the literal '$'. regex=False is required for a stable result: as a regular
# expression '$' is the end-of-string anchor, so depending on the pandas version
# the default could leave the sign in place and break the float conversion.
data3['Price'] = data3['Price'].str.replace('$', '', regex=False)

In [None]:
# Spot-check the first two rows after removing the '$' sign.
data3.head(2)

In [None]:
# Price is now a plain numeric string; store it as a float column.
price_as_float = data3['Price'].astype(float)
data3['Price'] = price_as_float

In [None]:
# Check the dtypes after the Price conversion.
data3.info()

c. Installs – remove ‘,’ and ‘+’ sign, convert to integer

In [None]:
# Strip the literal '+'. It is a regex metacharacter, so match it as plain text
# (regex=False) instead of relying on the pandas version's default.
data3['Installs'] = data3['Installs'].str.replace('+', '', regex=False)

In [None]:
# Remove the thousands separators from the install counts.
installs_no_commas = data3['Installs'].str.replace(',', '', regex=False)
data3['Installs'] = installs_no_commas

In [None]:
# Installs has had '+' and ',' removed by the previous cells; store it as an integer column.
data3['Installs'] = data3['Installs'].astype(int)

In [None]:
# Spot-check the first two rows after cleaning Installs.
data3.head(2)

In [None]:
# Check the dtypes after the Installs conversion.
data3.info()

# 3. Sanity checks – check for the following and handle accordingly
a.  Avg. rating should be between 1 and 5, as only these values are allowed on the play store.

In [None]:
# List the distinct rating values as a 1-D array to inspect their range.
data3['Rating'].unique()

Yes, all Avg. rating values are in between 1 to 5

i. Are there any such records? Drop if so.

No such record has been observed.

b. Reviews should not be more than installs as only those who installed can review the app.

In [None]:
# Keep only apps whose review count does not exceed their install count.
# .copy() makes df an independent frame, so the later in-place drops and column
# assignments on df do not act on a slice of data3.
df = data3[data3['Reviews'] <= data3['Installs']].copy()
df.head(5)

In [None]:
# Rows and columns remaining after the Reviews <= Installs filter.
df.shape

# 4. Identify and handle outliers –
a. Price column

i. Make suitable plot to identify outliers in price

In [None]:
# Box plot of Price to expose outliers. The series is passed as x= so the box is
# drawn horizontally and matches the x-axis label; a positional Series is read as
# wide-form `data` by newer seaborn releases.
fig, ax = plt.subplots(figsize=(20, 8))
sns.boxplot(x=df['Price'], ax=ax)

ax.set_xlabel('Unit Price')
ax.set_title('Unit Price')
ax.grid()
plt.show()

ii. Do you expect apps on the play store to cost $200? Check out these cases

In [None]:
# Inspect the apps priced at $200 or more. An exact `== 200` match misses every
# app priced above that, and the check belongs on the sanity-filtered frame df.
df1 = df[df['Price'] >= 200]
print(df1)

iii. After dropping the useless records, make the suitable plot again to identify
outliers

In [None]:
# Drop the implausible $200+ records (not the free apps) and re-plot Price.
df2 = df[df['Price'] < 200]

fig, ax = plt.subplots(figsize=(20, 8))
# x= keeps the box horizontal so it matches the x-axis label.
sns.boxplot(x=df2['Price'], ax=ax)

ax.set_xlabel('Unit Price')
ax.set_title('Unit Price')
ax.grid()
plt.show()

iv. Limit data to records with price < $30

In [None]:
# Restrict to apps that cost less than $30.
under_30 = df['Price'].lt(30)
limit_data = df.loc[under_30]
limit_data.head()

In [None]:
# Rows and columns remaining after the price limit.
limit_data.shape

b. Reviews column

i. Make suitable plot

In [None]:
# Total reviews per category, ascending so the largest bar ends up on top.
reviews_by_category = df.groupby('Category')['Reviews'].sum()
total = reviews_by_category.sort_values()

fig, ax = plt.subplots(figsize=(15, 8))
total.plot(kind='barh', fontsize=14, ax=ax)
# Same totals as text, largest first.
print(total.sort_values(ascending=False))
plt.show()

ii. Limit data to apps with < 1 Million reviews

In [None]:
# Apply the review limit on top of the price limit already held in limit_data.
# Filtering from df again would silently discard the Price < 30 restriction.
limit_data = limit_data[limit_data['Reviews'] < 1000000]
limit_data.head(5)

In [None]:
# Rows and columns remaining after the review limit.
limit_data.shape

c. Installs

i. What is the 95th percentile of the installs?

In [None]:
# 95th percentile of the install counts.
val = df['Installs'].quantile(q=0.95)
print(val)

ii. Drop records having a value more than the 95th percentile

In [None]:
# Drop the records whose install count exceeds the 95th percentile (`val`).
# Comparing the median to `val`, as before, only printed a boolean and removed nothing.
df = df[df['Installs'] <= val].copy()

# Verification: no remaining record may exceed the threshold (expected: False).
x = (df['Installs'] > val).any()
print(x)

# Data analysis to answer business questions

5. What is the distribution of ratings like? (use Seaborn) More skewed towards higher/lower values?

In [None]:
# Histogram plus density estimate of the ratings.
# NOTE(review): distplot is deprecated in recent seaborn releases; confirm the
# installed version before upgrading.
fig, ax = plt.subplots()
sns.distplot(df['Rating'], ax=ax)
plt.show()

A left-skewed distribution is frequently referred to as a negatively skewed distribution, because its long tail falls on the negative (left) side and the mean lies to the left of the peak.

The three measures on the plot signify 

Mean less than mode

Median less than mode

Mean less than median

We can therefore identify most of the app rating belongs to the group of 4 to 5.

b. What is the implication of this on your analysis?

Real-life distributions are usually skewed. If there is too much skewness in the data, many statistical models do not work well.
In skewed data, the tail region may act as an outlier for the statistical model, and outliers adversely affect a model's performance, especially for regression-based models.
So it is necessary to transform the skewed data until it is close enough to a Gaussian (normal) distribution. This allows us to try a larger number of statistical models.

Conclusion:
If we have skewed data, it may harm our results. So, in order to use skewed data, we have to apply a log transformation over the whole set of values to discover patterns in the data and make it usable for the statistical model.

6.  What are the top Content Rating values?

In [None]:
# Number of records per content rating, most frequent first.
df['Content Rating'].value_counts()

a. Are there any values with very few records?  Yes

The 'Unrated' content rating has only 1 record, and 'Adults only 18+' also has very few records.

b. If yes, drop those as they won’t help in the analysis

In [None]:
# Collect the row labels of the two sparsely populated content ratings.
is_adult = df['Content Rating'].eq('Adults only 18+')
is_unrated = df['Content Rating'].eq('Unrated')
Adult_rating = df.index[is_adult].to_list()
unrated = df.index[is_unrated].to_list()

# Remove both groups from df in a single in-place drop, then re-count.
df.drop(index=Adult_rating + unrated, inplace=True)
df['Content Rating'].value_counts()

7. Effect of size on rating

a. Make a jointplot to understand the effect of size on rating

In [None]:
# Hexbin joint plot of app size against rating.
sns.jointplot(data=df, x='Size', y='Rating', kind='hex')
plt.show()

b. Do you see any patterns? c. How do you explain the pattern?

There is no clear pattern, although the plot shows that the densest region lies at ratings of 4.0 to 4.5 and a size of about 40,960.

Apps with a size of 40,960 and above tend to get good ratings.

8.  Effect of price on rating

a. Make a jointplot (with regression line)

In [None]:
# Joint plot of price (x) against rating (y) with a regression line, as the task
# asks. Previously the axes were swapped and no regression line was drawn.
sns.jointplot(x="Price", y="Rating", data=df, kind="reg")
plt.show()

b. What pattern do you see?

c. How do you explain the pattern?

As the price increases, the rating increases.

d. Replot the data, this time with only records with price > 0

In [None]:
# Repeat the regression joint plot for paid apps only.
is_paid = df['Price'].gt(0)
Price_greaterthan_zero = df.loc[is_paid]
sns.jointplot(data=Price_greaterthan_zero, x="Price", y="Rating", kind="reg")
plt.show()

In [None]:
# One Price-vs-Rating regression line per content rating.
lm_kwargs = dict(x='Price', y='Rating', hue='Content Rating')
sns.lmplot(data=df, **lm_kwargs)
plt.show()

f. What is your overall inference on the effect of price on the rating

For the Mature and Teen content ratings, price and rating are more positively correlated than for the 10+ and Everyone content ratings.


9. Look at all the numeric interactions together –

a. Make a pairplot with the columns - 'Reviews', 'Size', 'Rating', 'Price'

In [None]:
# Pairwise scatter plots and marginal distributions of the numeric columns.
numeric_cols = ['Rating', 'Size', 'Reviews', 'Price']
sns.pairplot(df, vars=numeric_cols)
plt.show()

10. Rating vs. content rating

a. Make a bar plot displaying the rating for each content rating

In [None]:
# Median rating per content rating. The labels now name the statistic that is
# actually plotted (the median was previously labelled "Average").
median_by_content = df.groupby('Content Rating')['Rating'].median()
a = median_by_content.plot(kind='bar')
a.set(xlabel='Content Rating', ylabel='Median rating')
plt.show()

b. Which metric would you use? Mean? Median? Some other quantile?

I would prefer the median over the others.

c. Choose the right metric and plot

In [None]:
# Plot the chosen metric (the median, per the answer above) for each content
# rating. count() only showed how many records each group has, not its rating.
df.groupby(['Content Rating'])['Rating'].median().plot.bar(color="b")
plt.ylabel('Median Rating')
plt.show()

11. Content rating vs. size vs. rating – 3 variables at a time

a. Create 5 buckets (20% records in each) based on Size

In [None]:
# Check the skewness of Size with a five-bin histogram and density estimate.
fig, ax = plt.subplots()
sns.distplot(df["Size"], bins=5, ax=ax)
plt.show()

In [None]:
# Split Size into five quantile-based intervals and store the interval of each record in 'sb'.
df['sb'] = pd.qcut(df['Size'], q=5)

In [None]:
# Number of records that fall into each Size interval.
df.sb.value_counts()

In [None]:
# Derive the five equal-frequency buckets from the data instead of hand-copied
# bin edges, so every record gets a bucket even when df changes upstream.
size_labels = ['VERY LOW', 'LOW', 'MED', 'HIGH', 'VERY HIGH']
df['Size_Buckets'] = pd.qcut(df['Size'], q=5, labels=size_labels)
# Mean rating (the pivot_table default) per size bucket and content rating.
pd.pivot_table(df, values='Rating', index='Size_Buckets', columns='Content Rating')

b. By Content Rating vs. Size buckets, get the rating (20th percentile) for each combination

In [None]:
# 20th, 40th, 60th and 80th percentiles of Size.
df.Size.quantile([0.2, 0.4,0.6,0.8])

In [None]:
# 20th, 40th, 60th and 80th percentiles of Rating.
df.Rating.quantile([0.2, 0.4,0.6,0.8])

c. Make a heatmap of this

In [None]:
def _p20(ratings):
    """Return the 20th percentile of the given ratings."""
    return np.quantile(ratings, 0.2)


# 20th-percentile rating for every (size bucket, content rating) combination.
Size_Buckets = df.pivot_table(
    values='Rating',
    index='Size_Buckets',
    columns='Content Rating',
    aggfunc=_p20,
)
Size_Buckets

In [None]:
# 20th-percentile rating, one row per content rating and one column per size
# bucket. The aggregated value must be the numeric 'Rating'; taking a quantile
# of the categorical 'Size_Buckets' labels is not meaningful and left the
# heatmap cells below without usable input.
Size_Buckets = pd.pivot_table(
    df,
    values='Rating',
    index='Content Rating',
    columns='Size_Buckets',
    aggfunc=lambda ratings: np.quantile(ratings, 0.2),
)
Size_Buckets

i. Annotated

In [None]:
# Annotated heatmap of the pivoted 20th-percentile ratings.
fig, ax = plt.subplots()
sns.heatmap(Size_Buckets, annot=True, ax=ax)
plt.show()

ii. Greens color map

In [None]:
# Same annotated heatmap, drawn with the 'Greens' colour map.
fig, ax = plt.subplots()
sns.heatmap(Size_Buckets, cmap='Greens', annot=True, ax=ax)
plt.show()

d. What’s your inference? Are lighter apps preferred in all categories? Heavier? Some?

On the other hand, Mature 17+ apps with a small storage size have weaker ratings, i.e. they are less preferred.

Apps with a larger storage size are chosen across all content ratings, although the margin is modest rather than large.
On the other hand, Mature 17+ apps with a small storage size have weaker ratings, i.e. they are less preferred.