In [250]:
import pandas as pd # type: ignore
import json
import re
import numpy as np
import warnings

### Import the data and make it workable

In [175]:
append_paths = [f'../data/raw/by_pages/page_{page_num}_apps.json' for page_num in range(0,3000)]
apps = []
for page_path in append_paths:
    page = json.loads(open(page_path, 'r').read())
    apps.extend(page)
with open('workable_apps.json','w')as f:
    json.dump(apps, f, indent=4)


#### Now import the workable json file

In [220]:
df = pd.DataFrame([app for app in apps if app])
df.shape

(16463, 13)

In [221]:
n_na = df.isna().sum(axis=0)
n_na.round(2)  # Percentage of missing values

name                   0
category               0
developer           1064
release              138
setup_size             5
setup_type             7
compatibility        103
operating_system     228
ram_required         228
hdd_space            228
cpu                  228
desc                   0
features               0
dtype: int64

In [222]:
df = df.dropna()

In [223]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15388 entries, 0 to 16456
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   name              15388 non-null  object
 1   category          15388 non-null  object
 2   developer         15388 non-null  object
 3   release           15388 non-null  object
 4   setup_size        15388 non-null  object
 5   setup_type        15388 non-null  object
 6   compatibility     15388 non-null  object
 7   operating_system  15388 non-null  object
 8   ram_required      15388 non-null  object
 9   hdd_space         15388 non-null  object
 10  cpu               15388 non-null  object
 11  desc              15388 non-null  object
 12  features          15388 non-null  object
dtypes: object(13)
memory usage: 1.6+ MB


In [224]:
df.head(2)

Unnamed: 0,name,category,developer,release,setup_size,setup_type,compatibility,operating_system,ram_required,hdd_space,cpu,desc,features
0,EssentialPIM Pro Business 2024,Management,EssentialPIM Pro,22th Jun 2024,32 MB,Offline Installer / Full Standalone Setup,32 Bit (x86) / 64 Bit (x64),Windows 7/8/10,1 GB,200 MB,Intel Dual Core or higher processor,EssentialPIM Pro Business 2024 is a profession...,"Allows you to create a wide range of tasks, no..."
1,Gillmeister Rename Expert 2024,Utilities,Gillmeister,22th Jun 2024,15 MB,Offline Installer / Full Standalone Setup,32 Bit (x86) / 64 Bit (x64),Windows 7/8/10,1 GB,100 MB,Intel Dual Core or higher processor,Gillmeister Rename Expert 2024 is a profession...,Allows you to edit the names of files and fold...


### We can clean the columns with units example: hdd_space, ram, setup size, and also cpu

In [290]:
def parse_memory(x:str):
   try:
      number = re.search(r'\d+(\.\d+)?', x).group()
   except AttributeError:
      return np.nan
   try:
      number = int(number)
   except ValueError:
      number = float(number)
   if re.search(r'(?:MB)',x, flags=re.IGNORECASE):
      return number
   elif re.search(r'(?:KB)',x, flags=re.IGNORECASE):
      return number /1000
   elif re.search(r'(?:GB)',x, flags=re.IGNORECASE):
      return number * 1000
   return np.nan
def parse_compatibility(x: str):
    matches =  re.findall(r'(32|64|86)', x)
    if matches:
        if '86' in matches:
            matches.remove('86')
            matches.insert(0, '32')
        matches = list(set(matches))
        return ", ".join(matches)
    else:
        return np.nan

In [246]:
filter = df['setup_size'].str.match('\d+(\.\d+)?') # Makes sure setup_size contains numbers

In [247]:
df = df.loc[filter]

In [249]:
df['setup_size MB'] = df['setup_size'].map(parse_memory)#str.split().str[-1].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['setup_size MB'] = df['setup_size'].map(parse_memory)#str.split().str[-1].value_counts()


In [253]:
df = df.drop(['setup_size'],axis=1)

In [279]:
df['os_architecture bits']  = df['compatibility'].map(parse_compatibility)

In [293]:
df['hdd_space'].map(parse_memory)

0          200.0
1          100.0
2          250.0
3          100.0
4         1000.0
          ...   
16452     2000.0
16453      100.0
16454     2000.0
16455      100.0
16456    10000.0
Name: hdd_space, Length: 15246, dtype: float64