In [12]:
import pandas as pd
import plotly.express as px
import numpy as np
import re

In [13]:
# Load the extracted phone specifications; read_csv already infers numeric dtypes.
df = pd.read_csv('extracted_phones.csv')

# Probe each column for numeric convertibility.
# NOTE(review): the result of pd.to_numeric is discarded, so this loop never
# changes `df`. It is deliberately left as a no-op here because assigning the
# result could alter dtypes that later cells (and the exported CSV) rely on;
# either assign it (`df[col] = pd.to_numeric(df[col])`) after checking the
# downstream effect, or delete the loop.
for col in df.columns:
    try:
        pd.to_numeric(df[col])
    except (ValueError, TypeError):
        # Column holds non-numeric values; nothing to do. Catching only the
        # conversion errors keeps KeyboardInterrupt/SystemExit from being
        # swallowed, which the previous bare `except:` did.
        continue

In [14]:
# Hand-entered price overrides as (row label in df, corrected price) pairs,
# applied in the listed order.
for row_label, corrected_price in ((9065, 238.43), (7339, 1100)):
    df.loc[row_label, 'price'] = corrected_price

In [15]:
# Fill a missing release year from the announce year; whatever is still
# missing is forward-filled from the preceding row.
df['release_year'] = df['release_year'].fillna(df['announce_year']).ffill()

# Keep an independent copy of the devices announced or released in 2020 or later.
new_phones = df.loc[(df['release_year'] >= 2020) | (df['announce_year'] >= 2020)].copy()

In [16]:
# Screen sizes at or above this value are always labelled as tablets.
_TABLET_MIN_SCREEN_SIZE = 10.36

# Brand -> regular expression searched in the model name; a hit means 'tablet'.
# Brand names are matched exactly (case-sensitive), e.g. 'alcatel' is lowercase.
_TABLET_MODEL_PATTERNS = {
    'HTC': r'A10',
    'Lenovo': r'Legion Y|M10',
    # Word-initial "T" followed by a digit (T10, T20, ...). A plain "contains T"
    # test also matched phones such as "C2 Tava" and "C2 Tennen".
    'Nokia': r'\bT\d',
    'Doogee': r'T\d{2}|U\d+|R\d+',
    'Amazon': r'Fire',
    'Oukitel': r'RT\d+|OT\d+',
    'BLU': r'M\d+',
    'Allview': r'Viva',
    'Oscal': r'Elite|Spider',
    'alcatel': r'3T\d*',
    'Kyocera': r'DuraSlate',
}

# TCL tablets are identified by their exact model name rather than a pattern.
_TCL_TABLET_MODELS = frozenset(
    ['NxtPaper 14', 'NxtPaper 12 Pro', 'NxtPaper 11', 'NxtPaper 10s']
)


def label_device(row):
    """Classify one device row as 'tablet', 'watch' or 'phone'.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys 'model', 'brand' and 'screen_size'.

    Returns
    -------
    str
        'tablet' if the model name contains "pad" or "tab" (case-insensitive),
        the screen size is at least ``_TABLET_MIN_SCREEN_SIZE``, or a
        brand-specific rule matches; otherwise 'watch' if the model name
        contains "watch" (case-insensitive); otherwise 'phone'.

    Notes
    -----
    A missing model name (NaN/None) is treated as an empty string, so such rows
    can only be labelled by screen size and otherwise fall back to 'phone'.
    A missing screen size never triggers the size rule.
    """
    brand = row['brand']
    screen_size = row['screen_size']

    # Normalise the model to a string so the substring and regex checks below
    # cannot raise TypeError on NaN.
    raw_model = row['model']
    model = str(raw_model) if pd.notna(raw_model) else ''
    model_lower = model.lower()

    # Generic tablet rules.
    if 'pad' in model_lower or 'tab' in model_lower:
        return 'tablet'
    if pd.notna(screen_size) and screen_size >= _TABLET_MIN_SCREEN_SIZE:
        return 'tablet'

    # Brand-specific tablet rules.
    if brand == 'TCL' and model in _TCL_TABLET_MODELS:
        return 'tablet'
    brand_pattern = _TABLET_MODEL_PATTERNS.get(brand)
    if brand_pattern is not None and re.search(brand_pattern, model):
        return 'tablet'

    # Watches are only checked after every tablet rule has failed.
    if 'watch' in model_lower:
        return 'watch'

    return 'phone'  # Default to phone

In [17]:
# Run label_device on every row (axis=1) and store the resulting
# 'tablet' / 'watch' / 'phone' label in a new 'type' column.
new_phones['type'] = new_phones.apply(label_device, axis=1)

In [18]:
# A missing daily view count is taken to mean the phone received too few
# visits that day to register (i.e. close to 0), so it is filled with 0.
new_phones['popularity_views_today'] = new_phones['popularity_views_today'].fillna(0)

# Mean price per (brand, first word of the model name).
avg_model_price = (
    new_phones
    .groupby(['brand', new_phones['model'].str.split().str[0]])[['price']]
    .mean()
    .reset_index()
)

# True when the network technology text mentions "5g" (case-insensitive);
# rows with no network technology listed count as False.
new_phones['5g'] = new_phones['network_technology'].str.contains(r'5g', case=False, na=False)

# Devices without a listed IP rating get the placeholder 'IPXX'.
new_phones['ip_rating'] = new_phones['ip_rating'].fillna('IPXX')


In [19]:
# Multipliers that convert an amount expressed in the given unit to gigabytes.
# 'Mb' is handled exactly like 'MB', as the original per-field converters did.
_GB_PER_UNIT = {
    'TB': 1024,
    'GB': 1,
    'MB': 1 / 1024,
    'Mb': 1 / 1024,
}


def convert_to_gb(row, field='internal_ram'):
    """Return the memory amount stored in ``row[field]`` expressed in GB.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``field`` (the amount) and ``f'{field}_unit'`` (its unit).
    field : str, default 'internal_ram'
        Column holding the amount, e.g. 'internal_rom' or 'internal_ram'.
        The default matches the single-argument behaviour this name had after
        the cell ran when it was defined twice (the RAM version won).

    Returns
    -------
    number or None
        The amount in gigabytes, or None when the unit is missing or not one
        of the keys of ``_GB_PER_UNIT``.
    """
    factor = _GB_PER_UNIT.get(row[f'{field}_unit'])
    if factor is None:
        return None  # Unknown or missing unit
    return row[field] * factor


# Convert on new_phones itself (not the full df) so only the rows that are
# kept get processed; `field` is forwarded to convert_to_gb by DataFrame.apply.
new_phones['internal_rom_gb'] = new_phones.apply(convert_to_gb, axis=1, field='internal_rom')
new_phones['internal_ram_gb'] = new_phones.apply(convert_to_gb, axis=1, field='internal_ram')


In [20]:
# Display the column labels currently present in new_phones.
new_phones.columns

Index(['brand', 'model', 'photo_link', 'phone_link', 'popularity_become_fan',
       'popularity_views', 'popularity_views_today', 'price',
       'network_technology', 'eSIM', 'announce_year', 'available',
       'release_year', 'cancelled', 'height_mm', 'length_mm', 'width_mm',
       'weight_g', 'ip_rating', 'screen_type', 'screen_hz', 'screen_size',
       'screen_to_body', 'screen_resolution_x', 'screen_resolution_y',
       'os_version', 'chipset_nm', 'chipset_cores', 'gpu_model',
       'memory_card_slot', 'internal_rom', 'internal_rom_unit', 'internal_ram',
       'internal_ram_unit', 'camera_mp', 'camera_f', 'camera_video_resolution',
       'camera_video_fps', 'selfie_camera_mp', 'selfie_camera_f',
       'selfie_camera_video_resolution', 'selfie_camera_video_fps',
       'loudspeaker', '35mm_jack', 'wifi_model', 'bluetooth_version', 'gps',
       'nfc', 'radio', 'usb_type', 'usb_version', 'biometric_auth',
       'has_black_color', 'foldable', 'battery_type', 'battery_capaci

In [21]:
# Columns kept for the exported feature table, in output order.
# Defined once so the selection cannot drift out of sync with itself.
RELEVANT_COLUMNS = [
    'brand', 'model', 'photo_link', 'phone_link', 'popularity_become_fan',
    'popularity_views', 'popularity_views_today', 'price', 'eSIM', 'announce_year', 'available',
    'release_year', 'cancelled', 'height_mm', 'length_mm', 'width_mm',
    'weight_g', 'ip_rating', 'screen_type', 'screen_hz', 'screen_size',
    'screen_to_body', 'screen_resolution_x', 'screen_resolution_y', 'chipset_nm', 'chipset_cores',
    'memory_card_slot', 'internal_rom_gb', 'internal_ram_gb', 'camera_mp', 'camera_f', 'camera_video_resolution',
    'camera_video_fps', 'selfie_camera_mp', 'selfie_camera_f',
    'selfie_camera_video_resolution', 'selfie_camera_video_fps', '35mm_jack', 'wifi_model', 'bluetooth_version',
    'nfc', 'radio', 'usb_type', 'usb_version', 'biometric_auth', 'has_black_color', 'foldable', 'battery_type', 'battery_capacity',
    'type', '5g',
]

# Independent copy of the selected columns; raises KeyError if any is missing.
relevant_features = new_phones[RELEVANT_COLUMNS].copy()

In [22]:
# Write the selected features to 'relevant_features.csv' in the working
# directory, without the DataFrame index as a column.
relevant_features.to_csv('relevant_features.csv', index=False)