# App Data
The following features will be added to the appdata dataset:

| # | name            | definition                                       | dtype (pandas) | dtype (mysql) |
|---|-----------------|--------------------------------------------------|----------------|---------------|
| 1 | extracted       | Date the data were extracted from the appvoc   | datetime       | VARCHAR(32)   |
| 2 | free            | Indicates whether the app is free                | bool           | TINYINT       |
| 3 | avail           | Number of days between release and extract dates | np.int64       | BIGINT        |
| 4 | ratings_per_day | Number of ratings divided by the avail           | np.float64     | FLOAT         |


In [1]:
import os
import numpy as np
from datetime import datetime
from appvoc.infrastructure.file.io import IOService


In [2]:
filepath = "data/raw/appdata_2023-08-28T132303.pkl"
df = IOService.read(filepath=filepath)

In [3]:
df['extracted'] = datetime(2023,7,31,5,00,00)
df.loc[df['price'] == 0, 'free'] = True
df.loc[df['price'] != 0, 'free'] = False
df['released'] = pd.to_datetime(df['released'])
df['months_avail'] = (df['extracted'] - df['released']) / np.timedelta64(1,'M')
df['ratings_per_month'] = df['ratings'] / df['months_avail']
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 475132 entries, 0 to 475131
Data columns (total 15 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   id                 475132 non-null  string        
 1   name               475132 non-null  string        
 2   description        475132 non-null  string        
 3   category_id        475132 non-null  category      
 4   category           475132 non-null  category      
 5   price              475132 non-null  float64       
 6   developer_id       475132 non-null  string        
 7   developer          475132 non-null  string        
 8   rating             475132 non-null  float64       
 9   ratings            475132 non-null  int64         
 10  released           440837 non-null  datetime64[ns]
 11  extracted          475132 non-null  datetime64[ns]
 12  free               475132 non-null  object        
 13  months_avail       440837 non-null  float64 

In [4]:
df.head()

Unnamed: 0,id,name,description,category_id,category,price,developer_id,developer,rating,ratings,released,extracted,free,months_avail,ratings_per_month
0,6446790238,Mood Tracker :,Understanding and managing your emotions is cr...,6013,Health & Fitness,0.0,1436021751,CARECLINIC SOFTWARE INC.,5.0,6,2023-04-18 07:00:00,2023-07-31 05:00:00,True,3.35,1.79
1,6447916914,Empire App Virtual,"With the Empire App, you can: -join our gym -...",6013,Health & Fitness,0.0,1603858924,Body Evolution Warfighter LLC,0.0,0,2023-04-21 07:00:00,2023-07-31 05:00:00,True,3.26,0.0
2,6448082493,CrossFit 926,For members of CrossFit 926 to reserve their p...,6013,Health & Fitness,0.0,688595778,PushPress,0.0,0,2023-04-21 07:00:00,2023-07-31 05:00:00,True,3.26,0.0
3,6447812886,Shoreline CrossFit,For members of Shoreline CrossFit to reserve t...,6013,Health & Fitness,0.0,688595778,PushPress,0.0,0,2023-04-20 07:00:00,2023-07-31 05:00:00,True,3.29,0.0
4,1631374974,myAxonics,Find relief from your bladder and bowel contro...,6013,Health & Fitness,0.0,1631374976,Axonics Modulation Technologies,0.0,0,2023-04-21 07:00:00,2023-07-31 05:00:00,True,3.26,0.0


In [5]:
filename = "appdata_processed_" + datetime.now().strftime("%Y-%m-%d") + ".pkl"
directory = "data/processed"
filepath = os.path.join(directory, filename)
IOService.write(filepath=filepath, data=df)