In [210]:
import pandas as pd
from pandas import option_context
import numpy as np

import re
import string

import seaborn as sns
import matplotlib.pyplot as plt

### This is the main notebook for cleaning the app dataframe

In [211]:
df = pd.read_csv('04-data/scraped_app_data.csv')
df.info()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46031 entries, 0 to 46030
Data columns (total 50 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   title                     46031 non-null  object 
 1   description               46025 non-null  object 
 2   descriptionHTML           46025 non-null  object 
 3   summary                   46009 non-null  object 
 4   summaryHTML               46009 non-null  object 
 5   installs                  46021 non-null  object 
 6   minInstalls               46021 non-null  float64
 7   score                     45932 non-null  float64
 8   ratings                   45877 non-null  float64
 9   reviews                   45932 non-null  float64
 10  histogram                 46021 non-null  object 
 11  price                     46021 non-null  float64
 12  free                      46021 non-null  object 
 13  currency                  46021 non-null  object 
 14  sale  

## Clean dataframe

In [212]:
df = df.drop(columns = ['descriptionHTML',
                       'summaryHTML',
                       'recentChangesHTML',
                       'developerEmail',
                       'developerWebsite',
                       'developerAddress',
                       'saleTime',
                       'originalPrice',
                       'saleText'])

In [213]:
# Drop duplicates
df = df.drop_duplicates(subset=['appId', 'title'], keep='last')

In [214]:
# Remove rows that do not have a value for ratings / and date released
df = df[df['ratings'].notna()]

df = df[df['released'].notna()]

In [215]:
# Remove non-english apps

# By appID
df = df[df['appId'] != 'com.syc.librototal.El_Libro_Total']

# By app title
app_titles = ['Frases de Maloka', 'Dzienniczek VULCAN',
             'FinalCam', 'COCOA - COVID-19 Contact App', 
             'Home Quarantine - Poland (Kwarantanna domowa)'
             'Smartschool', 'Apa Doanya: Doa & Dzikir',
             'Afore Coppel','VTC Now - Tin nhanh & sự kiện trực tiếp',
             'Webmotors - comprar e vender veículos ou motos',
             'Me Salva! ENEM 2020: Aulas Offline e Exercícios',
             'facilisimo','bau cua 2021', 'Hoy','e-Okul Öğretmen',
             'Audio cuentos infantiles cortos','Comedy Radio','TV3',
             'BFV','SovietCar: Simulator','KICASignPlus',
             'Tu dien Trung Viet','GollerCepte 1967','Edenred Wallet',
             'Anadolu Mobil','Help Steps','Old Pregnancy Gestogram',
             'TV24', 'Le Figaro.fr: Actu en direct', 'my MEO',
             'Santa Biblia Gratis', 'UBEAT', 'Bci', 'Onedio',
             'Apteka.RU','TRT Kare','Kral','Maastokartat','iPasen',
             'BetterWare','ebebek','Merhaba Umut','Anadolu Mobil',
             'islam sobhi quran offline', 'TRT Hayri Uzayda', 'Mein ELBA-App',
             'MEO Drive', 'TRT Kolay Gelsin','Fahrschulcard',
             'Km de Vantagens', 'Namaz Rehberi',
             'Grammaire Cours et Exercices (sans internet)', 
             'Gol Caracol', 'TRT Haber','Merhaba Umut','App SKY',
             't-online - Nachrichten','Querida Ansiedade','Kanal D', 
             'Querida Ansiedade']
        
df = df[~df['title'].isin(app_titles)]

# by developer name
developer_names = ['Boursorama','Dawat-e-Islami','Aplicaciones Cristianas','Nakagosoft, Bangladesh']

df = df[~df['developer'].isin(developer_names)]

In [216]:
# Year of release
df['year'] = pd.DatetimeIndex(df['released']).year

In [217]:
# Clean description column
df['description'] = df['description'].astype(str)

def clean_text(text):
    '''Make text lowercase, remove punctuation, remove emojis, etc'''
    text = text.lower()
    text = re.sub(r"<[^>]*>", "", text)
    text = re.sub(r"(?:\@|https?\://)\S+", "", text)
    text = re.sub("[^a-zA-Z0-9 -]","",text)
    
    return text

df['description_clean'] = df.description.map(clean_text)

In [218]:
# Clean star historgram and separate into new columns
df['hist_clean'] = df['histogram'].apply(lambda x: x[1:-1].split(','))
df['hist_clean'] = df['hist_clean'].apply(lambda y: [int(x) for x in y])

In [219]:
df[['star_1','star_2','star_3','star_4','star_5']] = pd.DataFrame(df.hist_clean.tolist(), index= df.index)

In [220]:
# Change data type to bool for analysis 
df['editorsChoice'] = df['editorsChoice'].astype(str)
df['editorsChoice'] = np.where(df['editorsChoice'] == 'True',1,0)

df['free'] = df['free'].astype(str)
df['free'] = np.where(df['free'] == 'True',1,0)

df['containsAds'] = df['containsAds'].astype(str)
df['containsAds'] = np.where(df['containsAds'] == 'True',1,0)

In [221]:
# Reassign a few apps
df.loc[(df['contentRating'] == 'Adults only 18+') |(df['contentRating'] == 'Unrated'),
      ['contentRating']] = 'Mature 17+'

df.groupby('contentRating').count()

Unnamed: 0_level_0,title,description,summary,installs,minInstalls,score,ratings,reviews,histogram,price,...,appId,url,year,description_clean,hist_clean,star_1,star_2,star_3,star_4,star_5
contentRating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Everyone,35044,35044,35029,35044,35044,35044,35044,35044,35044,35044,...,35042,35042,35044,35044,35044,35044,35044,35044,35044,35044
Everyone 10+,2008,2008,2008,2008,2008,2008,2008,2008,2008,2008,...,2008,2008,2008,2008,2008,2008,2008,2008,2008,2008
Mature 17+,1730,1730,1729,1730,1730,1730,1730,1730,1730,1730,...,1730,1730,1730,1730,1730,1730,1730,1730,1730,1730
Teen,6754,6754,6754,6754,6754,6754,6754,6754,6754,6754,...,6753,6753,6754,6754,6754,6754,6754,6754,6754,6754


## Feature engineering

In [222]:
# Create aggregate view to find top developer by avg score per app
df_agg = df[['developer','minInstalls','score']]

df_agg['count'] = 1

df_agg = df_agg.groupby(['developer'])['minInstalls','count','score'].sum()
df_agg = pd.DataFrame(df_agg).reset_index()

df_agg['avg_installs'] = df_agg['minInstalls'] / df_agg['count']
df_agg['avg_score'] = df_agg['score'] / df_agg['count']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_agg['count'] = 1
  df_agg = df_agg.groupby(['developer'])['minInstalls','count','score'].sum()


In [223]:
df_top = df_agg.sort_values('avg_score',ascending=False).head(1000)
unique_ids = df_top['developer'].tolist()

df['top_developer'] = np.where(df['developer'].isin(unique_ids),1,0)

In [224]:
# Create a column for number of days since release
df['current_date'] = '2021-03-03'

df['days'] = (pd.DatetimeIndex(df['current_date']) - pd.DatetimeIndex(df['released'])).days

# Using minInstalls and days since release, create variable for average installs per day
df['installs_day'] = df['minInstalls']/ df['days']

In [225]:
# Does having a video to showcase the app make it more attractive? Create feature to capture video (yes / no)
df['has_video'] = np.where(df['video'].isna(),0,1)

df.groupby('has_video')['score'].mean()

has_video
0    4.025834
1    4.118348
Name: score, dtype: float64

In [226]:
df.to_csv('04-data/preprocessed_app_data.csv',index=False)

In [227]:
df.sort_values('score',ascending=True).head(60)

Unnamed: 0,title,description,summary,installs,minInstalls,score,ratings,reviews,histogram,price,...,star_1,star_2,star_3,star_4,star_5,top_developer,current_date,days,installs_day,has_video
42841,Face Symmetry,Check the symmetry of the face.\r\n\r\n - Rig...,Check the symmetry of the face.,"500,000+",500000.0,1.100917,1092.0,334.0,"[1052, 10, 0, 20, 10]",0.0,...,1052,10,0,20,10,0,2021-03-03,2639,189.465707,0
39131,Thingiverse,MakerBot's Thingiverse is a thriving design co...,Thingiverse: a thriving community for discover...,"500,000+",500000.0,1.108673,9435.0,5922.0,"[9063, 98, 78, 19, 177]",0.0,...,9063,98,78,19,177,0,2021-03-03,2596,192.604006,0
33210,Owl - Predictor Mania,Our application will reveal all the secrets an...,How will you look like at 18 ?,"500,000+",500000.0,1.297052,4370.0,1880.0,"[3936, 89, 69, 39, 237]",0.0,...,3936,89,69,39,237,0,2021-03-03,832,600.961538,0
6137,Ikariam Mobile,"As ruler of an island kingdom, you build up gr...","Construct, fight and research: let your island...","1,000,000+",1000000.0,1.324779,57642.0,18459.0,"[50837, 1859, 1149, 629, 3168]",0.0,...,50837,1859,1149,629,3168,0,2021-03-03,2794,357.909807,1
39155,Wink - Smart Home,Wink is the quick and simple way to connect yo...,The app that connects you and the products you...,"500,000+",500000.0,1.343005,9436.0,5642.0,"[8323, 264, 117, 195, 537]",0.0,...,8323,264,117,195,537,0,2021-03-03,2697,185.391175,1
27958,DroiHealth,DroiHealth is a health category application th...,A health category application that record the ...,"500,000+",500000.0,1.345238,3318.0,2218.0,"[2944, 69, 49, 49, 207]",0.0,...,2944,69,49,49,207,0,2021-03-03,770,649.350649,0
31120,Smartschool,De Smartschool App is speciaal ontwikkeld voor...,Always along with what is happening at your sc...,"1,000,000+",1000000.0,1.349434,13155.0,3901.0,"[11787, 109, 168, 218, 873]",0.0,...,11787,109,168,218,873,0,2021-03-03,2011,497.265042,0
383,"TV Guide: Best Shows & Movies, Streaming & Liv...",TV Guide is the place to go for finding what t...,"Discover new series & films to stream, see cha...","1,000,000+",1000000.0,1.362267,43648.0,21819.0,"[36554, 3168, 1148, 768, 2010]",0.0,...,36554,3168,1148,768,2010,0,2021-03-03,2858,349.895031,1
30373,Home Quarantine - Poland (Kwarantanna domowa),<b>Home Quarantine – Poland (Kwarantanna domow...,Polish Government’s app. It confirms location ...,"1,000,000+",1000000.0,1.402204,14224.0,10558.0,"[12188, 450, 450, 176, 960]",0.0,...,12188,450,450,176,960,0,2021-03-03,349,2865.329513,0
7583,CBS All Access,Stream full episodes of your favorite CBS show...,"New hit series just added, like SpongeBob Squa...","1,000,000+",1000000.0,1.44,117.0,109.0,"[89, 16, 7, 2, 3]",0.0,...,89,16,7,2,3,0,2021-03-03,1997,500.751127,0
