In [2]:
import pandas as pd # type: ignore
import json
import re
import numpy as np
import warnings
from datetime import datetime

### Import the data and make it workable

In [3]:
# Disabled merge step: reads the per-page dumps
# ../data/raw/by_pages/page_<n>_apps.json for n in 0..2999, concatenates them
# into a single list and writes it to workable_apps.json (the file loaded in
# the next cell).
# NOTE(review): presumably run once to produce that file and then commented
# out so "Run All" does not repeat it — confirm before deleting.
# append_paths = [f'../data/raw/by_pages/page_{page_num}_apps.json' for page_num in range(0,3000)]
# apps = []
# for page_path in append_paths:
#     page = json.loads(open(page_path, 'r').read())
#     apps.extend(page)
# with open('workable_apps.json','w')as f:
#     json.dump(apps, f, indent=4)


In [4]:
# Load the merged app records from workable_apps.json in the working directory.
with open('workable_apps.json', 'r') as f:
    apps = json.load(f)


In [5]:
# Drop falsy records (e.g. empty dicts or None) before building the frame,
# then show its (rows, columns) shape.
df = pd.DataFrame(list(filter(None, apps)))
df.shape

(16463, 13)

In [6]:
# Number of missing values in each column. These are raw counts, not
# percentages, so no rounding is applied.
n_na = df.isna().sum()
n_na

name                   0
category               0
developer           1064
release              138
setup_size             5
setup_type             7
compatibility        103
operating_system     228
ram_required         228
hdd_space            228
cpu                  228
desc                   0
features               0
dtype: int64

In [7]:
# Peek at the first two rows to see the raw column layout and value formats.
df.head(2)

Unnamed: 0,name,category,developer,release,setup_size,setup_type,compatibility,operating_system,ram_required,hdd_space,cpu,desc,features
0,EssentialPIM Pro Business 2024,Management,EssentialPIM Pro,22th Jun 2024,32 MB,Offline Installer / Full Standalone Setup,32 Bit (x86) / 64 Bit (x64),Windows 7/8/10,1 GB,200 MB,Intel Dual Core or higher processor,EssentialPIM Pro Business 2024 is a profession...,"Allows you to create a wide range of tasks, no..."
1,Gillmeister Rename Expert 2024,Utilities,Gillmeister,22th Jun 2024,15 MB,Offline Installer / Full Standalone Setup,32 Bit (x86) / 64 Bit (x64),Windows 7/8/10,1 GB,100 MB,Intel Dual Core or higher processor,Gillmeister Rename Expert 2024 is a profession...,Allows you to edit the names of files and fold...


### Clean the columns that carry units (e.g. `hdd_space`, `ram_required`, `setup_size`), and also `cpu`

#### These are some cleaning functions

In [8]:
def parse_memory(x: str):
    """Convert a human-readable size such as ``"1.5 GB"`` into megabytes.

    The value used is the first number that is directly followed (optionally
    after whitespace) by a KB/MB/GB/TB unit, so the number and its unit always
    come from the same token. Units are matched case-insensitively and the
    conversion is decimal (1 GB = 1000 MB, 1 KB = 1/1000 MB).

    Parameters
    ----------
    x : str
        Free-form text containing a size, e.g. ``"512 MB of RAM required."``.
        Non-string values (``None``, ``NaN``) are accepted.

    Returns
    -------
    float
        The size in MB, or ``np.nan`` when ``x`` is not a string or contains
        no "<number> <unit>" pair.
    """
    try:
        match = re.search(r'(\d+(?:\.\d+)?)\s*(KB|MB|GB|TB)', x, re.IGNORECASE)
    except TypeError:
        # Missing values arrive as None / float NaN rather than str.
        return np.nan
    if match is None:
        return np.nan

    number = float(match.group(1))
    unit = match.group(2).upper()
    if unit == 'KB':
        return number / 1000
    # Multipliers that scale the remaining units to MB.
    mb_per_unit = {'MB': 1, 'GB': 1000, 'TB': 1000 * 1000}
    return number * mb_per_unit[unit]

def parse_compatibility(x: str):
    """Return the smallest OS bit-width mentioned in a compatibility string.

    The tokens ``32``, ``64`` and ``86`` are searched for anywhere in ``x``;
    ``86`` (as in "x86") is counted as 32-bit. The result is the minimum of
    the widths found, or ``np.nan`` when ``x`` is not a string or contains
    none of the tokens.
    """
    try:
        tokens = set(re.findall(r'(32|64|86)', x))
    except TypeError:
        return np.nan

    if not tokens:
        return np.nan

    # "x86" denotes a 32-bit architecture, so fold it into 32.
    widths = {32 if token == '86' else int(token) for token in tokens}
    return min(widths)
   
def parse_date(date):
    """Normalise a date like ``"22th Jun 2024"`` to the string ``"22-06-2024"``.

    An ordinal suffix (st/nd/rd/th) is removed only when it directly follows
    a digit, so month names that contain those letters (e.g. "August") stay
    intact. Both abbreviated ("Jun") and full ("June") month names are
    accepted.

    Parameters
    ----------
    date : str
        Text in "<day><suffix> <month> <year>" form. Non-string values
        (``None``, ``NaN``) are accepted.

    Returns
    -------
    str or float
        The date formatted as ``DD-MM-YYYY``, or ``np.nan`` when ``date`` is
        not a string or does not match either supported layout.
    """
    try:
        cleaned_date_str = re.sub(
            r'(?<=\d)(st|nd|rd|th)', '', date.strip(), flags=re.IGNORECASE
        )
    except (AttributeError, TypeError):
        # Missing values arrive as None / float NaN rather than str.
        return np.nan

    # Try the abbreviated month first (the format seen in the data), then the
    # full month name.
    for fmt in ("%d %b %Y", "%d %B %Y"):
        try:
            return datetime.strptime(cleaned_date_str, fmt).strftime('%d-%m-%Y')
        except ValueError:
            continue
    return np.nan

In [9]:
# Build every derived column from the untouched `df` in one expression so the
# cell is idempotent. Mutating `newdf` in place broke on re-run: parsing the
# already-normalised `release` strings again yields NaN, and dropping
# `setup_type` a second time raises KeyError.
newdf = (
    df
    .assign(
        # Normalise the release date to DD-MM-YYYY (replaces the raw column).
        release=df['release'].map(parse_date),
        # Sizes converted to MB; the raw text columns are kept alongside.
        setup_size_MB=df['setup_size'].map(parse_memory),
        hdd_space_MB=df['hdd_space'].map(parse_memory),
        ram_required_MB=df['ram_required'].map(parse_memory),
        # Smallest bit-width mentioned in the compatibility text.
        minimum_os_bits=df['compatibility'].map(parse_compatibility),
    )
    .drop(columns='setup_type')
)

In [16]:

# Inspect the cleaned frame (pandas shows the head and tail).
# NOTE(review): the last rows in the displayed output look column-shifted
# (e.g. a date under hdd_space/compatibility, an OS list under name), so some
# scraped records may be misaligned — verify before relying on them.
newdf


Unnamed: 0,name,category,developer,release,setup_size,compatibility,operating_system,ram_required,hdd_space,cpu,desc,features,setup_size_MB,hdd_space_MB,ram_required_MB,minimum_os_bits
0,EssentialPIM Pro Business 2024,Management,EssentialPIM Pro,22-06-2024,32 MB,32 Bit (x86) / 64 Bit (x64),Windows 7/8/10,1 GB,200 MB,Intel Dual Core or higher processor,EssentialPIM Pro Business 2024 is a profession...,"Allows you to create a wide range of tasks, no...",32.0,200.0,1000.0,32.0
1,Gillmeister Rename Expert 2024,Utilities,Gillmeister,22-06-2024,15 MB,32 Bit (x86) / 64 Bit (x64),Windows 7/8/10,1 GB,100 MB,Intel Dual Core or higher processor,Gillmeister Rename Expert 2024 is a profession...,Allows you to edit the names of files and fold...,15.0,100.0,1000.0,32.0
2,iToolab WatsG 2024 Free Download,File Sharing,iToolab,22-06-2024,72 MB,32 Bit (x86) / 64 Bit (x64),Windows 7/8/8.1/10/11.,512 MB of RAM required.,250 MB of free space required.,Intel Dual Core or higher processor.,iToolab WatsG 2024 is a fantastic file sharing...,Easily transfer WhatsApp data from Android and...,72.0,250.0,512.0,32.0
3,ProPresenter 2024,Multimedia,ProPresenter,22-06-2024,80 MB,32 Bit (x86) / 64 Bit (x64),Windows 7/8/10,1 GB,100 MB,Intel Dual Core or higher processor,ProPresenter 2024 is a powerful application wh...,Allows you to view and watch live events in ex...,80.0,100.0,1000.0,32.0
4,DbVisualizer Pro 2024,"Database, Development",DbVisualizer Pro,22-06-2024,209 MB,32 Bit (x86) / 64 Bit (x64),Windows 7/8/10,2 GB,1 GB,Intel Dual Core or higher processor,DbVisualizer Pro 2024 is a professional databa...,"Allows database developers to create, edit, up...",209.0,1000.0,2000.0,32.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16458,Windows 8.1 / Win 8 / Win 7 / Vista / XP,Office Tools,,,200 MB,,Nitro PDF Pro 9.0.5.9,"55 MB ( 32 bit), 64 MB (64 bit)",3rd Feb 2014,NitroPDF,Ever wondered how quality professional level P...,PDF Creation Support. High End Security in Dig...,200.0,,55.0,
16459,FL Studio 11 Producer Edition,Music,,,293 MB,ImageLine FL Studio,Windows 8.1 / Win 8 / Win 7 / XP / Vista,1 GB,1 GB,Intel Pentium IV 2 GHz,Ever wondered which softwares are used for pro...,Plugin Picker Multi-Track Audio Recording Mult...,293.0,1000.0,1000.0,
16460,Fraps 3.5.99,Utilities,,,2.5 MB,2nd Feb 2014,,,,,Realtime video capturing in games was never so...,Operating Systems: Windows 8.1 / Windows 8 / W...,2.5,,,
16461,SolidWorks Premium Edition 2014 SP 1.0 (64/32 ...,3D CAD,,,"7 GB (32 bit), 7.7 GB (64 bit)",SolidWorks,Windows 8.1 / Win 8 / Win 7 / Vista,2 GB,6.3 GB,Dual Core 1.6 Ghz,There are a lot of enterprise level 3D CAD des...,Latest 3D CAD Designing techniques. Powerful S...,7000.0,6300.0,2000.0,


In [18]:
# Preview only the rows that have no missing value in any column. The result
# is displayed but not assigned, so `newdf` itself is left unchanged.
newdf.dropna()

Unnamed: 0,name,category,developer,release,setup_size,compatibility,operating_system,ram_required,hdd_space,cpu,desc,features,setup_size_MB,hdd_space_MB,ram_required_MB,minimum_os_bits
0,EssentialPIM Pro Business 2024,Management,EssentialPIM Pro,22-06-2024,32 MB,32 Bit (x86) / 64 Bit (x64),Windows 7/8/10,1 GB,200 MB,Intel Dual Core or higher processor,EssentialPIM Pro Business 2024 is a profession...,"Allows you to create a wide range of tasks, no...",32.00,200.0,1000.0,32.0
1,Gillmeister Rename Expert 2024,Utilities,Gillmeister,22-06-2024,15 MB,32 Bit (x86) / 64 Bit (x64),Windows 7/8/10,1 GB,100 MB,Intel Dual Core or higher processor,Gillmeister Rename Expert 2024 is a profession...,Allows you to edit the names of files and fold...,15.00,100.0,1000.0,32.0
2,iToolab WatsG 2024 Free Download,File Sharing,iToolab,22-06-2024,72 MB,32 Bit (x86) / 64 Bit (x64),Windows 7/8/8.1/10/11.,512 MB of RAM required.,250 MB of free space required.,Intel Dual Core or higher processor.,iToolab WatsG 2024 is a fantastic file sharing...,Easily transfer WhatsApp data from Android and...,72.00,250.0,512.0,32.0
3,ProPresenter 2024,Multimedia,ProPresenter,22-06-2024,80 MB,32 Bit (x86) / 64 Bit (x64),Windows 7/8/10,1 GB,100 MB,Intel Dual Core or higher processor,ProPresenter 2024 is a powerful application wh...,Allows you to view and watch live events in ex...,80.00,100.0,1000.0,32.0
4,DbVisualizer Pro 2024,"Database, Development",DbVisualizer Pro,22-06-2024,209 MB,32 Bit (x86) / 64 Bit (x64),Windows 7/8/10,2 GB,1 GB,Intel Dual Core or higher processor,DbVisualizer Pro 2024 is a professional databa...,"Allows database developers to create, edit, up...",209.00,1000.0,2000.0,32.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16452,CentOS 6.5,Operating Systems,Open Source,07-03-2014,649 MB,32 Bit (x86) / 64 Bit (x64),HP-UX 11i with the latest codes and upgrades,1GB of RAM required.,2GB space required.,Pentium III,CentOS is a Linux based open source program fr...,Open source. Reliable. Simple installation. Ea...,649.00,2000.0,1000.0,32.0
16453,Storm Codec 7.01.19,Multimedia,Freeware,06-03-2014,24 MB,32 Bit (x86) / 64 Bit (x64),Windows XP/Vista/Windows 7 and 8,256MB of RAM required,100MB of space needed.,Pentium III,While watching movies and other videos on your...,Contains loads of codecs. Easy to use. Quick i...,24.00,100.0,256.0,32.0
16454,Adobe InDesign CC 9.2 MultiLingual,Graphic Design,Adobe InDesign,05-03-2014,1.44 GB,32 Bit (x86) / 64 Bit (x64),Windows 2000/XP/Vista/7 and 8,2GB RAM Required,2GB space required,Intel P-IV,"With Adobe InDesign, interactive documents can...","E Books, Brochures can be made easily. Easy to...",1440.00,2000.0,2000.0,32.0
16455,AVG PC TuneUp 2014,System Tuning,AVG,02-03-2014,74.72 MB,32 Bit (x86) / 64 Bit (x64),Win 8.1 / Win 8 / Win 7 / Vista / XP,512 MB,100 MB,Intel Pentium IV,Well there are couple of other software alread...,Cleaning of Junk Files. Boost Speed like a Fre...,74.72,100.0,512.0,32.0
