In [26]:
import pandas as pd

In [27]:
# Load the feature table and keep only the rows whose `cancelled` flag is false.
df = (
    pd.read_csv('relevant_features.csv')
    .loc[lambda frame: ~frame['cancelled']]
    .copy()
)

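Fill missing (NaN) numeric values in stages. First use the mean of the attribute over phones with the same brand and the same first two words of the model name; if that is unavailable, use the mean over phones with the same brand and the same first word of the model name; then the brand mean; and finally the mean over the whole dataset.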

Do the same for categorical attributes, using the mode instead of the mean.

In [28]:
def get_first_mode(series):
    """Return the first mode of `series`, or `pd.NA` when it has no mode.

    `Series.mode()` yields every tied most-frequent value; only the first
    entry of that result is kept. An empty result (for example, a series
    with no non-null values) is reported as `pd.NA`.
    """
    candidates = series.mode()
    return pd.NA if candidates.empty else candidates.iloc[0]

In [29]:
# Stage 1: impute from phones that share the brand and the first two words
# of the model name.
two_word_key = ['brand', 'model_first_two_words']
df['model_first_two_words'] = df['model'].str.split().str[:2].str.join(' ')

# Numeric attributes (imputed with a group mean) and the names of the
# temporary columns that hold those means after the join.
features = ['price', 'height_mm', 'length_mm', 'width_mm', 'weight_g', 'screen_to_body', 'chipset_nm', 'internal_rom_gb', 'internal_ram_gb', 
            'camera_mp', 'camera_f', 'camera_video_fps', 'selfie_camera_mp', 'selfie_camera_f', 'selfie_camera_video_fps', 'battery_capacity', 
            'screen_resolution_x', 'screen_resolution_y']
features_avg = [f'{name}_avg' for name in features]

# Categorical attributes (imputed with a group mode) and their temporary columns.
cat_features = ['chipset_cores', 'camera_video_resolution', 'selfie_camera_video_resolution', 'wifi_model', 'bluetooth_version', 'usb_type', 
                'usb_version', 'battery_type']
cat_features_mode = [f'{name}_mode' for name in cat_features]

# Per-group statistics.
by_model = df.groupby(two_word_key)
avg_model_feature = by_model[features].mean()
mode_model_feature = by_model[cat_features].agg(get_first_mode)

# Attach the statistics to every row; the suffixes keep them apart from the
# original columns of the same name.
df = (
    df
    .join(avg_model_feature, on=two_word_key, rsuffix='_avg', how='left')
    .join(mode_model_feature, on=two_word_key, rsuffix='_mode', how='left')
)

# Fill each attribute's gaps from its matching statistic column.
for col, helper in zip(features, features_avg):
    df[col] = df[col].fillna(df[helper])
for col, helper in zip(cat_features, cat_features_mode):
    df[col] = df[col].fillna(df[helper])

# Remove the grouping key and the temporary statistic columns.
df = df.drop(columns=['model_first_two_words', *features_avg, *cat_features_mode])

In [30]:
# Stage 2: impute from phones that share the brand and the first word of the
# model name.
one_word_key = ['brand', 'model_first_word']
df['model_first_word'] = df['model'].str.split().str[0]

# Per-group mean of every numeric attribute and first mode of every
# categorical attribute.
by_first_word = df.groupby(one_word_key)
avg_model_price = by_first_word[features].mean()
mode_model_feature = by_first_word[cat_features].agg(get_first_mode)

# Attach the statistics to every row under the `_avg` / `_mode` suffixes.
df = (
    df
    .join(avg_model_price, on=one_word_key, rsuffix='_avg', how='left')
    .join(mode_model_feature, on=one_word_key, rsuffix='_mode', how='left')
)

# Fill each attribute's gaps from its matching statistic column.
for col, helper in zip(features, features_avg):
    df[col] = df[col].fillna(df[helper])
for col, helper in zip(cat_features, cat_features_mode):
    df[col] = df[col].fillna(df[helper])

# Remove the grouping key and the temporary statistic columns.
df = df.drop(columns=['model_first_word', *features_avg, *cat_features_mode])

In [31]:
# Stage 3: impute from phones that share the brand.
by_brand = df.groupby(['brand'])
avg_brand_price = by_brand[features].mean()
mode_brand_feature = by_brand[cat_features].agg(get_first_mode)

# Attach the brand-level statistics to every row under the `_avg` / `_mode` suffixes.
df = (
    df
    .join(avg_brand_price, on='brand', rsuffix='_avg', how='left')
    .join(mode_brand_feature, on='brand', rsuffix='_mode', how='left')
)

# Fill each attribute's gaps from its matching statistic column.
for col, helper in zip(features, features_avg):
    df[col] = df[col].fillna(df[helper])
for col, helper in zip(cat_features, cat_features_mode):
    df[col] = df[col].fillna(df[helper])

# Remove the temporary statistic columns.
df = df.drop(columns=[*features_avg, *cat_features_mode])

  df[cat_features[i]] = df[cat_features[i]].fillna(df[cat_features_mode[i]])


In [32]:
# Last-resort fallback: fill whatever is still missing with a dataset-wide
# statistic (mean for numeric attributes, first mode for categorical ones).
for col in features:
    df[col] = df[col].fillna(df[col].mean())
for col in cat_features:
    # Series.mode() returns a Series indexed 0..k-1, and fillna(Series) aligns
    # on index labels, so passing it directly would only fill rows labelled
    # 0..k-1. Use the first mode as a scalar so every missing row is filled.
    modes = df[col].mode()
    if not modes.empty:  # a column with no non-null values has no mode
        df[col] = df[col].fillna(modes.iloc[0])

In [33]:
# A missing eSIM flag is treated as "no eSIM". The dtype is pinned to bool
# explicitly so the result does not depend on fillna's implicit downcasting
# of object columns, which pandas has deprecated.
# NOTE(review): assumes the column only holds True/False/NaN; confirm.
df['eSIM'] = df['eSIM'].fillna(False).astype(bool)

  df['eSIM'] = df['eSIM'].fillna(False)


In [34]:
# Rows with no announce year take their release year instead.
release_year = df['release_year']
df['announce_year'] = df['announce_year'].fillna(value=release_year)

In [35]:
# Rows whose battery type is still missing, with the columns needed to look them up.
battery_unknown = df['battery_type'].isna()
missing = df.loc[battery_unknown, ['brand', 'phone_link', 'battery_type']]

In [36]:
# Print the per-column non-null counts and dtypes to check the result of the fills above.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3604 entries, 0 to 3640
Data columns (total 51 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   brand                           3604 non-null   object 
 1   model                           3604 non-null   object 
 2   photo_link                      3604 non-null   object 
 3   phone_link                      3604 non-null   object 
 4   popularity_become_fan           3604 non-null   int64  
 5   popularity_views                3604 non-null   int64  
 6   popularity_views_today          3604 non-null   float64
 7   price                           3604 non-null   float64
 8   eSIM                            3604 non-null   bool   
 9   announce_year                   3604 non-null   float64
 10  available                       3604 non-null   bool   
 11  release_year                    3604 non-null   float64
 12  cancelled                       3604 no

In [37]:
# Write the frame to CSV; index=False leaves the row index out of the file.
df.to_csv('clean_phones.csv', index=False)