In [138]:
import pandas as pd
import numpy as np

In [139]:
# Load the raw Google Play Store export from the working directory.
df = pd.read_csv(filepath_or_buffer="Google-Playstore.csv")

In [140]:
# Drop every row that has a null value in at least one column.
# A single dropna() pass removes the same rows as dropping column by column,
# without rebuilding the frame once per column. inplace=True keeps `df`
# bound to the same object, as the other cells mutate it in place too.
df.dropna(inplace=True)

print(df.shape)

(1287191, 24)


In [141]:
# Drop rows whose "Size" or "Minimum Android" value is the literal string
# "Varies with device". Both conditions are combined into one mask so the
# frame is filtered in a single drop.
df.drop(
    labels=df.index[
        (df["Size"] == "Varies with device")
        | (df["Minimum Android"] == "Varies with device")
    ],
    inplace=True,
)

print(df.shape)

(1249667, 24)


In [142]:
# Convert all sizes to the same units
def normalize_size(size):
    """Convert a size string with a one-character unit suffix to a float.

    The last character of ``size`` is read as the unit and everything before
    it (with thousands-separator commas removed) as the number.

    Parameters
    ----------
    size : str
        Text such as ``"1.5G"``, ``"12M"`` or ``"1,024k"``.

    Returns
    -------
    float
        The number multiplied by 1024 for a ``"G"`` suffix, divided by 1024
        for a ``"k"`` suffix, and unchanged for any other suffix.

    Raises
    ------
    ValueError
        If the text before the final character is not a valid float.
    """
    # Parse the numeric part first so malformed input fails on the float()
    # conversion before the unit is inspected.
    value = float(size[:-1].replace(",", ""))
    unit = size[-1]

    if unit == "G":
        return value * 1024
    if unit == "k":
        return value / 1024
    # Any other suffix is passed through unscaled.
    return value

# Replace each textual size with its numeric value from normalize_size.
df["Size"] = df["Size"].apply(normalize_size)

In [145]:
# Remove android version specifications with no. of apps < 1000 (outliers)
unique_versions = set(df["Minimum Android"])

# Count rows per exact "Minimum Android" value in one pass.
# value_counts() compares whole values, whereas Series.str.count(s) would
# interpret each version string as a regular expression ("." matches any
# character) and count substring hits, inflating the totals of versions that
# occur inside longer version strings. It also avoids rescanning the column
# once per unique version.
version_cnts = df["Minimum Android"].value_counts().to_dict()

# Helper column holding, for every row, how many rows share its version.
df["vcount"] = df["Minimum Android"].apply(lambda x: version_cnts[x])

df.drop(labels=df.index[df["vcount"] < 1000], inplace=True)

print(df.shape)

In [148]:
# Remove the columns that are not kept in the preprocessed data set,
# together with the "vcount" column.
drop_columns = [
    "Developer Website",
    "Developer Email",
    "Scraped Time",
    "Privacy Policy",
    "vcount",
]
df.drop(columns=drop_columns, inplace=True)

print(df.shape)

(1248021, 20)


In [162]:
# Write the preprocessed frame to disk.
# NOTE(review): with the default index=True the row index is written as an
# extra leading column; confirm downstream readers expect that.
df.to_csv(path_or_buf="Google-Playstore-Preprocessed.csv")