In [150]:
import pandas as pd
import re
import math

In [24]:
# Load the phone records from JSON into the working DataFrame `df`.
# NOTE: a later cell writes `df` back to this same path with `df.to_json(...)`,
# so re-running the notebook from the top afterwards starts from
# already-processed data rather than the original file.
df = pd.read_json('./phones_all_data.json')

In [9]:
# Define a function to extract the brand
def extract_brand(name):
    """Return the canonical brand name mentioned in a product name.

    The check is a case-insensitive substring test against a fixed list of
    known brands, tried in a fixed order; the first brand found wins.

    Parameters
    ----------
    name : str or None
        Product name to inspect.

    Returns
    -------
    str or None
        The canonical brand spelling, or None when ``name`` is not a string
        (for example a missing value) or mentions none of the known brands.
    """
    # Missing names show up as None/NaN; there is nothing to search in them.
    if not isinstance(name, str):
        return None

    # Order matters: it decides which brand wins when a name mentions several.
    known_brands = (
        "Apple", "Samsung", "Motorola", "Nokia",
        "Kyocera", "Google", "TCL", "Sonim",
    )
    lowered = name.lower()  # lower-case once instead of once per brand
    for brand in known_brands:
        if brand.lower() in lowered:
            return brand
    return None

In [10]:
# Derive the brand of every phone from its product name.
df['brand'] = df['name'].map(extract_brand)

In [65]:
def convert_storage(storage_str):
    """Parse a storage description into a list of integer capacities.

    The text is split on commas and/or pipes, and every piece is reduced to
    the digits it contains, e.g. ``"128GB, 256GB"`` -> ``[128, 256]``.

    Parameters
    ----------
    storage_str : str or None
        Raw storage text listing one or more storage options.

    Returns
    -------
    list of int or None
        One integer per option that contains digits, in input order.
        None when the input is not a string (missing value) or when no
        option contains any digit.

    Notes
    -----
    Units are discarded: only the digits of each option are kept, so
    ``"1TB"`` yields ``1``.
    """
    # Missing values arrive as None/NaN rather than text.
    if not isinstance(storage_str, str):
        return None

    capacities = []
    # Split on both separators at once so a string that mixes "," and "|"
    # does not fuse two options into one number.
    for option in re.split(r"[,|]", storage_str):
        digits = re.sub("[^0-9]", "", option)
        # Skip empty pieces (e.g. after a trailing separator) instead of
        # letting int("") raise ValueError.
        if digits:
            capacities.append(int(digits))

    # Return None rather than [] so that callers which iterate over the
    # options never silently drop a record that has no usable storage value.
    return capacities or None

In [66]:
# Replace the raw storage text with the parsed list of capacities.
df['storage'] = df['storage'].map(convert_storage)

In [69]:
# Flag a listing as used when its name contains "owned" (case-insensitive).
df['used'] = df['name'].map(lambda title: "owned" in title.lower())

In [74]:
# Drop the "(Certified Pre-Owned)" marker from the name and trim what is left.
# Keep this after the `used` flag is computed: that flag searches the name for
# "owned", and this marker contains that text.
df['name'] = df['name'].map(
    lambda title: re.sub(r"\(Certified Pre-Owned\)", "", title).strip()
)

In [84]:
def _parse_inches(measurement):
    """Return the numeric value of one dimension, or None if it has none.

    Strings are searched for the first number; when the text contains an
    "Unfolded" label, the first number after that label is used. Plain
    numbers are passed through, and NaN counts as missing.
    """
    if isinstance(measurement, bool):
        return None
    if isinstance(measurement, (int, float)):
        return None if math.isnan(measurement) else float(measurement)
    if not isinstance(measurement, str):
        return None

    # Entries that list both a folded and an unfolded value: use the unfolded
    # one, wherever it appears in the text.
    unfolded_at = measurement.find("Unfolded")
    if unfolded_at != -1:
        measurement = measurement[unfolded_at + len("Unfolded"):]

    # Pull the number out with a regex instead of chained str.strip() calls:
    # strip() removes a *set of characters*, not a suffix, and breaks as soon
    # as anything other than the unit surrounds the number.
    found = re.search(r"\d*\.\d+|\d+", measurement)
    return float(found.group()) if found else None


def compute_screen_size(width, height):
    """Return the diagonal of a ``width`` x ``height`` rectangle.

    Parameters
    ----------
    width, height : str, number or None
        Dimension text such as ``"2.8 inches"``, ``"2.8 in."`` or
        ``"Folded: 2.6 in. Unfolded: 5.1 in."`` (the unfolded value is used),
        or an already numeric value.

    Returns
    -------
    float or None
        ``sqrt(width**2 + height**2)`` rounded to two decimals, or None when
        either dimension is missing or contains no number.
    """
    width_value = _parse_inches(width)
    height_value = _parse_inches(height)
    if width_value is None or height_value is None:
        return None
    return round((width_value**2 + height_value**2)**0.5, 2)
# Row-wise: combine each phone's width and height into a single diagonal value.
df['screen_size'] = df.apply(
    lambda phone: compute_screen_size(phone['width'], phone['height']),
    axis='columns',
)

Normalize camera categories

In [139]:
# Category keys that every normalized camera dict is given (missing ones are
# filled with None by normalize_camera_categories).
updated_camera_categories = {
    'general',
    'modes',
    'front',
    'rear',
    'video',
}

In [141]:
def normalize_camera_categories(camera_dict):
    """Rename the camera-spec keys of one phone to the canonical categories.

    The dict is modified in place and also returned. Known source keys are
    moved to their canonical name; every name in
    ``updated_camera_categories`` that is still absent afterwards is added
    with a value of None. Keys that are not in the rename table are left
    untouched.

    Parameters
    ----------
    camera_dict : dict or None
        Camera specification keyed by category name.

    Returns
    -------
    dict or None
        The same dict object after renaming, or None if None was passed in.
    """
    if camera_dict is None:
        return None

    # (source key, canonical key), applied top to bottom. Several source keys
    # share a canonical key; when more than one is present, the one listed
    # later overwrites the earlier value.
    renames = (
        ('Camera', 'general'),
        ('Camera Modes', 'modes'),
        ('Camera Mode', 'modes'),
        ('camera', 'general'),
        ('Main Lens', 'general'),
        ('Front Camera', 'front'),
        ('Rear Camera', 'rear'),
        ('Video', 'video'),
    )
    for source_key, canonical_key in renames:
        if source_key in camera_dict:
            camera_dict[canonical_key] = camera_dict.pop(source_key)

    # Guarantee that every canonical category exists, even if empty.
    for category in updated_camera_categories:
        camera_dict.setdefault(category, None)
    return camera_dict

In [142]:
# Bring every phone's camera dict onto the canonical category keys.
df['camera'] = df['camera'].map(normalize_camera_categories)

In [33]:
# Write the processed frame out as a JSON list of row records.
# NOTE: this overwrites the same file `df` was loaded from at the top of the
# notebook, so a later run from the top reads already-processed data.
df.to_json('./phones_all_data.json', orient='records')

Separating colors/storages into separate phones

In [34]:
# Reload the file written by the previous step; the expansion cells below
# work on `df` as read back from JSON.
df = pd.read_json('./phones_all_data.json')

In [74]:
# Column-wise accumulator for the per-colour records: one empty list per
# output column, filled in step by the colour expansion loop.
expanded = {
    column: []
    for column in (
        'name', 'color', 'images', 'storage', 'used', 'brand', 'screen_size',
        'camera', 'id', 'description', 'url', 'price', 'battery',
    )
}

In [75]:
# Expand every phone into one record per colour, appending to `expanded`.
for index, row in df.iterrows():
    colors = row['colors']
    images = row['images']

    # A phone with no colour information still yields exactly one record,
    # with its colour set to None. (Previously this branch reused whatever
    # `color` was left over from an earlier phone, or raised NameError when
    # the very first phone had no colours.) An empty colour list is treated
    # the same way so the phone is not silently dropped.
    if colors is None or len(colors) == 0:
        color_options = [None]
    else:
        color_options = colors

    for color in color_options:
        expanded['name'].append(row['name'])
        expanded['color'].append(color)
        if images is not None:
            # `images` is looked up by colour; a colour without an entry
            # (including the None placeholder) gets an empty list.
            expanded['images'].append(images.get(color, []))
        else:
            expanded['images'].append(None)
        # Everything else is copied from the source row unchanged.
        for column in ('storage', 'used', 'brand', 'screen_size', 'camera',
                       'id', 'description', 'url', 'price', 'battery'):
            expanded[column].append(row[column])

In [76]:
# Materialize the per-colour records as a DataFrame (one column per key).
df_expanded = pd.DataFrame(data=expanded)

In [77]:
# Column-wise accumulator for the per-storage records: one empty list per
# output column, filled in step by the storage expansion loop.
expanded2 = {
    column: []
    for column in (
        'name', 'color', 'images', 'storage', 'used', 'brand', 'screen_size',
        'camera', 'id', 'description', 'url', 'price', 'battery',
    )
}

In [78]:
# Expand every per-colour record into one record per storage option,
# appending to `expanded2`.
for index, row in df_expanded.iterrows():
    storages = row['storage']

    # No storage list: emit a single record whose storage stays None.
    # Otherwise emit one record per listed storage value.
    storage_values = [None] if storages is None else storages

    for storage in storage_values:
        for column in ('name', 'color', 'images'):
            expanded2[column].append(row[column])
        expanded2['storage'].append(storage)
        # The remaining columns are carried over from the row unchanged.
        for column in ('used', 'brand', 'screen_size', 'camera', 'id',
                       'description', 'url', 'price', 'battery'):
            expanded2[column].append(row[column])

In [79]:
# Materialize the per-colour, per-storage records as a DataFrame.
df_expanded2 = pd.DataFrame(data=expanded2)

In [80]:
# create id function for apply
def create_ids(row):
    """Build a slug-style identifier for one expanded phone record.

    The id has the shape ``<name>[-<color>][-<storage>]-<used|new>``. Name
    and colour are stripped, lower-cased and have spaces replaced by
    hyphens; storage is rendered as an integer.

    Parameters
    ----------
    row : mapping
        Record providing ``'name'`` (str), ``'color'`` (str or missing),
        ``'storage'`` (number or missing) and ``'used'`` (bool).

    Returns
    -------
    str
        The identifier; missing colour/storage parts are simply left out.
    """
    name_slug = row['name'].strip().lower().replace(" ", "-")

    # pd.isna covers both None and NaN, so a missing value is handled the
    # same way whether the column ended up object- or float-typed
    # (math.isnan(None) would raise TypeError).
    color = row['color']
    if pd.isna(color):
        color_part = ""
    else:
        color_part = "-" + color.strip().lower().replace(" ", "-")

    storage = row['storage']
    if pd.isna(storage):
        storage_part = ""
    else:
        storage_part = "-" + str(int(storage))

    # Explicit comparison on purpose: anything that does not equal True
    # (including a missing value) is labelled "new", as before.
    condition_part = "-used" if row['used'] == True else "-new"
    return name_slug + color_part + storage_part + condition_part
# Replace the id column with a per-record id built row by row from
# name, colour, storage and condition.
df_expanded2['id'] = df_expanded2.apply(create_ids, axis='columns')

In [85]:
# Write the fully expanded records (one per colour/storage combination)
# as a JSON list of row records.
df_expanded2.to_json('./phones_expanded.json', orient='records')

Getting rid of unnecessary columns

In [151]:
# Start from the expanded records written above.
df = pd.read_json('./phones_expanded.json')

# Drop the 'images' and 'url' columns before writing the compressed file.
df = df.drop(columns=['images', 'url'])

In [152]:
# Write the slimmed-down frame as a JSON list of row records.
df.to_json('./phones_compressed.json', orient='records')

In [159]:
# Reload the compressed records; the camera metric below is computed on `df`
# as read back from this file.
df = pd.read_json('./phones_compressed.json')

In [168]:
# Megapixel figures such as "12MP", "12 MP" or "12.2 MP". Only the whole-number
# part is captured; an optional fractional part is consumed but not captured,
# so "12.2 MP" is read as 12 instead of matching only the trailing "2 MP".
pattern = re.compile(r'(\d+)(?:\.\d+)? ?MP')

def new_camera_metric(row):
    """Return the largest megapixel figure mentioned in a phone's camera specs.

    Parameters
    ----------
    row : mapping
        Record whose ``'camera'`` entry is a dict of category -> spec text
        (values may be None), or is missing altogether.

    Returns
    -------
    int
        The highest whole-number "<n> MP" value found across all spec
        strings; 0 when there is no camera dict or no figure is found.
    """
    camera = row['camera']
    # Phones without a camera dict carry None here; report "nothing found"
    # instead of failing on the missing .items()/.values().
    if not isinstance(camera, dict):
        return 0

    max_mp = 0
    for spec in camera.values():
        # Categories that are absent are stored as None; only text is searched.
        if not isinstance(spec, str):
            continue
        for megapixels in pattern.findall(spec):
            max_mp = max(max_mp, int(megapixels))
    return max_mp

In [170]:
# Replace each phone's camera dict with its single megapixel metric.
df['camera'] = df.apply(new_camera_metric, axis='columns')

In [172]:
# Write the result back as a JSON list of row records.
# NOTE: this overwrites the file read a few cells above; from here on its
# `camera` column holds the numeric metric instead of the per-category dict.
df.to_json('./phones_compressed.json', orient='records')