In [13]:
import pandas as pd
import numpy as np

In [14]:
# Load the dataset from "Google-Playstore.csv"; the path is relative, so the file
# must be in the notebook's working directory.
df = pd.read_csv("Google-Playstore.csv")

In [15]:
# Drop rows with null values
# dropna() removes every row that has a missing value in any column, which is
# what looping over the columns and dropping their NaN rows one by one achieved.
df.dropna(inplace=True)

print(df.shape)

(1287191, 24)


In [16]:
# Discard every row whose "Size" or "Minimum Android" is the placeholder
# "Varies with device": such rows carry no usable value for that field.
varies_mask = df[["Size", "Minimum Android"]].eq("Varies with device").any(axis=1)
df.drop(index=df.index[varies_mask], inplace=True)

print(df.shape)

(1249667, 24)


In [17]:
# Convert all sizes to the same units
def normalize_size(size):
    """Convert a size string such as "2.9M", "1,018k" or "1.5G" to a float.

    The result is expressed in the unit of the "M"-suffixed values: a "G"
    value is multiplied by 1024, a "k" value is divided by 1024, and any other
    suffix leaves the number unchanged.

    Parameters
    ----------
    size : str
        Number with optional thousands separators (",") and an optional
        one-character unit suffix. Surrounding whitespace is ignored.

    Returns
    -------
    float
        The size converted to the common unit.

    Raises
    ------
    ValueError
        If the string is empty or its numeric part cannot be parsed.
    """
    text = size.strip().replace(",", "")
    if not text:
        raise ValueError("empty size string")

    unit = text[-1]
    if unit.isdigit():
        # No unit suffix: the whole string is the number. Slicing off the last
        # character here would silently drop a digit ("123" -> 12.0).
        return float(text)

    magnitude = float(text[:-1])
    if unit == "G":
        return magnitude * 1024
    if unit == "k":
        return magnitude / 1024
    return magnitude

# Replace each size string in the column with its numeric value from normalize_size.
df['Size'] = df['Size'].map(normalize_size)

In [18]:
# Remove android version specifications with no. of apps < 1000 (outliers)
# Count rows per exact version label. value_counts() compares labels literally;
# str.count() would interpret each label as a regular expression (so "." matches
# any character) and would also count substring hits inside longer labels.
version_cnts = df["Minimum Android"].value_counts().to_dict()
unique_versions = set(version_cnts)

# Attach each row's label frequency as a helper column.
df["vcount"] = df["Minimum Android"].map(version_cnts)

df.drop(labels=df.index[df["vcount"] < 1000],inplace=True)

print(df.shape)

(1248021, 25)


In [19]:
# Remove the columns that are not needed further, including the helper column "vcount".
drop_columns = [
    "Developer Website",
    "Developer Email",
    "Scraped Time",
    "Privacy Policy",
    "vcount",
    "Developer Id",
]
df.drop(columns=drop_columns, inplace=True)

print(df.shape)

(1248021, 19)


In [20]:
# Save the cleaned frame to disk. The row index is written as the first column
# because to_csv's index option is left at its default.
df.to_csv(path_or_buf="Google-Playstore-Preprocessed.csv")

In [21]:
# Show the frame as the cell output to inspect the result of the steps above.
df

Unnamed: 0,App Name,App Id,Category,Rating,Rating Count,Installs,Minimum Installs,Maximum Installs,Free,Price,Currency,Size,Minimum Android,Released,Last Updated,Content Rating,Ad Supported,In App Purchases,Editors Choice
0,Gakondo,com.ishakwe.gakondo,Adventure,0.0,0.0,10+,10.0,15,True,0.0,USD,10.0,7.1 and up,"Feb 26, 2020","Feb 26, 2020",Everyone,False,False,False
1,Ampere Battery Info,com.webserveis.batteryinfo,Tools,4.4,64.0,"5,000+",5000.0,7662,True,0.0,USD,2.9,5.0 and up,"May 21, 2020","May 06, 2021",Everyone,True,False,False
4,GROW.me,com.horodyski.grower,Tools,0.0,0.0,100+,100.0,478,True,0.0,USD,6.2,4.1 and up,"Feb 21, 2020","Nov 12, 2018",Everyone,False,False,False
5,IMOCCI,com.imocci,Social,0.0,0.0,50+,50.0,89,True,0.0,USD,46.0,6.0 and up,"Dec 24, 2018","Dec 20, 2019",Teen,False,True,False
9,Neon 3d Iron Tech Keyboard Theme,com.ikeyboard.theme.neon_3d.iron.tech,Personalization,4.7,820.0,"50,000+",50000.0,62433,True,0.0,USD,3.5,4.1 and up,"Sep 22, 2019","Oct 07, 2020",Everyone,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2312933,Caustic Editor for VolcaSample,com.singlecellsoftware.kvsampler,Music & Audio,4.0,344.0,"500,000+",500000.0,814548,True,0.0,USD,4.1,2.2 and up,"Dec 11, 2014","Dec 11, 2014",Everyone,False,False,False
2312934,Vietnamese - English Translator,com.eliminatesapps.vietnamesetranslator,Education,0.0,0.0,5+,5.0,6,True,0.0,USD,3.6,4.0 and up,"Jun 15, 2020","Aug 31, 2020",Everyone,True,False,False
2312938,Lero TOEFL Recorder + Timer,com.toefltimer,Education,3.4,17.0,"1,000+",1000.0,1980,True,0.0,USD,10.0,4.1 and up,"May 22, 2018","Dec 14, 2018",Everyone,True,False,False
2312940,ORU Online,com.threedream.oruonline,Education,0.0,0.0,100+,100.0,430,True,0.0,USD,44.0,4.1 and up,"Jan 17, 2018","Feb 02, 2018",Everyone,False,False,False


In [22]:
# Inspect the 'Last Updated' column; the values are stored with dtype object
# (text such as "Feb 26, 2020"), not as datetimes.
df['Last Updated']

0          Feb 26, 2020
1          May 06, 2021
4          Nov 12, 2018
5          Dec 20, 2019
9          Oct 07, 2020
               ...     
2312933    Dec 11, 2014
2312934    Aug 31, 2020
2312938    Dec 14, 2018
2312940    Feb 02, 2018
2312942    May 05, 2021
Name: Last Updated, Length: 1248021, dtype: object

In [26]:
# Check whether 'Installs' (text such as "5,000+") says anything that
# 'Minimum Installs' does not: keep only the digits of each 'Installs' string
# and compare both columns as 64-bit integers, for the whole column at once.
installs_digits = df['Installs'].str.replace(r'\D', '', regex=True).astype('int64')
minimum_installs = df['Minimum Installs'].astype('int64')
mismatch = minimum_installs != installs_digits

# Print every disagreeing pair; no output means the columns agree on every row.
for col1, col2 in zip(df.loc[mismatch, 'Minimum Installs'], df.loc[mismatch, 'Installs']):
    print(col1, col2)

# If nothing is printed, 'Installs' and 'Minimum Installs' convey the same information, so the 'Installs' column can be dropped.

In [27]:
# Remove the 'Installs' column in place, then display the resulting frame.
df.drop(columns='Installs', inplace=True)
df

Unnamed: 0,App Name,App Id,Category,Rating,Rating Count,Minimum Installs,Maximum Installs,Free,Price,Currency,Size,Minimum Android,Released,Last Updated,Content Rating,Ad Supported,In App Purchases,Editors Choice
0,Gakondo,com.ishakwe.gakondo,Adventure,0.0,0.0,10.0,15,True,0.0,USD,10.0,7.1 and up,"Feb 26, 2020","Feb 26, 2020",Everyone,False,False,False
1,Ampere Battery Info,com.webserveis.batteryinfo,Tools,4.4,64.0,5000.0,7662,True,0.0,USD,2.9,5.0 and up,"May 21, 2020","May 06, 2021",Everyone,True,False,False
4,GROW.me,com.horodyski.grower,Tools,0.0,0.0,100.0,478,True,0.0,USD,6.2,4.1 and up,"Feb 21, 2020","Nov 12, 2018",Everyone,False,False,False
5,IMOCCI,com.imocci,Social,0.0,0.0,50.0,89,True,0.0,USD,46.0,6.0 and up,"Dec 24, 2018","Dec 20, 2019",Teen,False,True,False
9,Neon 3d Iron Tech Keyboard Theme,com.ikeyboard.theme.neon_3d.iron.tech,Personalization,4.7,820.0,50000.0,62433,True,0.0,USD,3.5,4.1 and up,"Sep 22, 2019","Oct 07, 2020",Everyone,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2312933,Caustic Editor for VolcaSample,com.singlecellsoftware.kvsampler,Music & Audio,4.0,344.0,500000.0,814548,True,0.0,USD,4.1,2.2 and up,"Dec 11, 2014","Dec 11, 2014",Everyone,False,False,False
2312934,Vietnamese - English Translator,com.eliminatesapps.vietnamesetranslator,Education,0.0,0.0,5.0,6,True,0.0,USD,3.6,4.0 and up,"Jun 15, 2020","Aug 31, 2020",Everyone,True,False,False
2312938,Lero TOEFL Recorder + Timer,com.toefltimer,Education,3.4,17.0,1000.0,1980,True,0.0,USD,10.0,4.1 and up,"May 22, 2018","Dec 14, 2018",Everyone,True,False,False
2312940,ORU Online,com.threedream.oruonline,Education,0.0,0.0,100.0,430,True,0.0,USD,44.0,4.1 and up,"Jan 17, 2018","Feb 02, 2018",Everyone,False,False,False
