Dataset download from https://www.kaggle.com/iabhishekofficial/mobile-price-classification?select=test.csv

In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
%matplotlib inline

# data.csv

In [18]:
df = pd.read_csv("data.csv") #Reading the dataset in a dataframe using Pandas
df.head()

Unnamed: 0,id,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,...,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
0,1,1043,1,1.8,1,14,0,5,0.1,193,...,16,226,1412,3476,12,7,2,0,1,0
1,2,841,1,0.5,1,4,1,61,0.8,191,...,12,746,857,3895,6,0,7,1,0,0
2,3,1807,1,2.8,0,1,0,27,0.9,186,...,4,1270,1366,2396,17,10,10,0,1,1
3,4,1546,0,0.5,1,18,1,25,0.5,96,...,20,295,1752,3893,10,0,7,1,1,0
4,5,1434,0,1.4,0,11,1,49,0.5,108,...,18,749,810,1773,15,8,7,1,0,1


In [19]:
df =  df[['battery_power', 'clock_speed', 'fc','int_memory','m_dep']]
print(df.shape)
df.head()

(1000, 5)


Unnamed: 0,battery_power,clock_speed,fc,int_memory,m_dep
0,1043,1.8,14,5,0.1
1,841,0.5,4,61,0.8
2,1807,2.8,1,27,0.9
3,1546,0.5,18,25,0.5
4,1434,1.4,11,49,0.5


# data_scaled.csv

In [4]:
df= df.astype("float")
df.head()

Unnamed: 0,battery_power,clock_speed,fc,int_memory,m_dep
0,1043.0,1.8,14.0,5.0,0.1
1,841.0,0.5,4.0,61.0,0.8
2,1807.0,2.8,1.0,27.0,0.9
3,1546.0,0.5,18.0,25.0,0.5
4,1434.0,1.4,11.0,49.0,0.5


In [20]:
df = (df - df.min()) / (df.max() - df.min())
df.to_csv("data_scaled.csv")
df.head()

Unnamed: 0,battery_power,clock_speed,fc,int_memory,m_dep
0,0.362241,0.52,0.736842,0.048387,0.0
1,0.227485,0.0,0.210526,0.951613,0.777778
2,0.871915,0.92,0.052632,0.403226,0.888889
3,0.697799,0.0,0.947368,0.370968,0.444444
4,0.623082,0.36,0.578947,0.758065,0.444444


# create artificial missingness for data_scaled.csv

In [7]:
nums = np.ones(5000)
nums[:2500] = 0
np.random.shuffle(nums,)
nums = nums.reshape((1000,5))
df_missed = df.copy()

for i in range(len(nums)):
    for j in range(len(nums[0])):
        if nums[i][j]==0:
            df_missed.iloc[i,j]=np.nan
df_missed.head()

Unnamed: 0,battery_power,clock_speed,fc,int_memory,m_dep
0,0.362241,,,0.048387,0.0
1,,,,0.951613,0.777778
2,,0.92,,0.403226,
3,0.697799,,0.947368,,0.444444
4,0.623082,0.36,0.578947,,0.444444


# Impute missing data by mean

In [8]:
def fillnan_mean(df_missed):
    df=df_missed.copy()
    for column in list(df.columns[df.isnull().sum() > 0]):
        mean_val = df[column].mean()
        df[column].fillna(mean_val, inplace=True)
    return df
df_impute1=fillnan_mean(df_missed)


In [9]:
df_impute1.head()

Unnamed: 0,battery_power,clock_speed,fc,int_memory,m_dep
0,0.362241,0.431901,0.23795,0.048387,0.0
1,0.503364,0.431901,0.23795,0.951613,0.777778
2,0.503364,0.92,0.23795,0.403226,0.475944
3,0.697799,0.431901,0.947368,0.507889,0.444444
4,0.623082,0.36,0.578947,0.507889,0.444444


# Impute missing data by KNN & weighted KNN

In [10]:
def get_Euclidean_distance(x1,x2):
    res = np.zeros((len(x1),len(x2)))
    for i in range(len(x1)):
        for j in range(len(x2)):
            res[i][j] = np.sqrt(np.sum((x1[i,:] - x2[j,:]) ** 2))
    return res

def fillnan_knn(df_missed, distance, k):
    df=df_missed.copy()
    for index, dis in enumerate(distance):
        neighbors = np.argsort(dis)[1:k+1]
        for j in range(5):
            if np.isnan(df.iloc[index][j]):
                df.iloc[index][j] = np.sum(x1[neighbors])/k
    return df

def fillnan_weighted_knn(df_missed, distance, k):
    df=df_missed.copy()
    for index, dis in enumerate(distance):
        neighbors = np.argsort(dis)[1:k+1]
        for j in range(5):
            if np.isnan(df.iloc[index][j]):
                df.iloc[index][j] = np.sum(np.multiply(x1[neighbors,j],dis[neighbors]))/k
    return df

x1=df_impute1.values
distance = get_Euclidean_distance(x1,x1)
df_impute2=fillnan_knn(df_missed,distance,1)
df_impute3=fillnan_knn(df_missed,distance,3)
df_impute4=fillnan_knn(df_missed,distance,5)
df_impute5=fillnan_weighted_knn(df_missed,distance,1)
df_impute6=fillnan_weighted_knn(df_missed,distance,3)
df_impute7=fillnan_weighted_knn(df_missed,distance,5)


In [15]:
df_impute2.head()

Unnamed: 0,battery_power,clock_speed,fc,int_memory,m_dep
0,0.362241,1.284326,1.284326,0.048387,0.0
1,2.854218,2.854218,2.854218,0.951613,0.777778
2,2.645147,0.92,2.645147,0.403226,2.645147
3,0.697799,2.761203,0.947368,2.761203,0.444444
4,0.623082,0.36,0.578947,2.433635,0.444444


In [16]:
df_impute3.head()

Unnamed: 0,battery_power,clock_speed,fc,int_memory,m_dep
0,0.362241,1.185117,1.185117,0.048387,0.0
1,2.854218,2.854218,2.854218,0.951613,0.777778
2,2.631813,0.92,2.631813,0.403226,2.631813
3,0.697799,2.73418,0.947368,2.73418,0.444444
4,0.623082,0.36,0.578947,2.476575,0.444444


In [17]:
df_impute4.head()

Unnamed: 0,battery_power,clock_speed,fc,int_memory,m_dep
0,0.362241,1.267025,1.267025,0.048387,0.0
1,2.798328,2.798328,2.798328,0.951613,0.777778
2,2.637147,0.92,2.637147,0.403226,2.637147
3,0.697799,2.839136,0.947368,2.839136,0.444444
4,0.623082,0.36,0.578947,2.485163,0.444444


In [126]:
df_impute5.head()

Unnamed: 0,battery_power,clock_speed,fc,int_memory,m_dep
0,2160.238229,1.8,14.0,5.0,0.1
1,841.0,0.5,39.334856,61.0,6.50544
2,1807.0,6.444399,1.0,198.910921,0.9
3,4018.497398,0.5,18.0,25.0,0.5
4,275.375077,1.4,1.048245,49.0,0.151695


In [14]:
df_impute6.head()

Unnamed: 0,battery_power,clock_speed,fc,int_memory,m_dep
0,0.362241,0.099402,0.041087,0.048387,0.0
1,0.053536,0.045936,0.025308,0.951613,0.777778
2,0.053922,0.92,0.02549,0.403226,0.050985
3,0.697799,0.12517,0.947368,0.127082,0.444444
4,0.623082,0.36,0.578947,0.071495,0.444444


In [12]:
df_impute7.head()

Unnamed: 0,battery_power,clock_speed,fc,int_memory,m_dep
0,0.362241,0.118783,0.041168,0.048387,0.0
1,0.065689,0.056363,0.03192,0.951613,0.777778
2,0.054914,0.92,0.025959,0.403226,0.051922
3,0.697799,0.146295,0.947368,0.146495,0.444444
4,0.623082,0.36,0.578947,0.073884,0.444444


In [13]:
def cal_mse(df1,df2):
    e = 0
    for i in range(1000):
        for j in range(5):
            if np.isnan(df2.iloc[i,j]):
                e += (df1.iloc[i,j])**2
            else:
                e += (df1.iloc[i,j]-df2.iloc[i,j])**2

    return e/5000

print(cal_mse(df_impute1,df))
print(cal_mse(df_impute2,df))
print(cal_mse(df_impute3,df))
print(cal_mse(df_impute4,df))
print(cal_mse(df_impute5,df))
print(cal_mse(df_impute6,df))
print(cal_mse(df_impute7,df))

0.043815461224774234
1.6126550056157931
1.605593132513641
1.6031344999177626
0.12468620451989218
0.12125986105220657
0.11897251326271505
