Dataset download from https://www.kaggle.com/iabhishekofficial/mobile-price-classification?select=test.csv

In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
%matplotlib inline

# data.csv

In [37]:
df = pd.read_csv("data.csv") #Reading the dataset in a dataframe using Pandas
df.head()

Unnamed: 0,id,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,...,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
0,1,1043,1,1.8,1,14,0,5,0.1,193,...,16,226,1412,3476,12,7,2,0,1,0
1,2,841,1,0.5,1,4,1,61,0.8,191,...,12,746,857,3895,6,0,7,1,0,0
2,3,1807,1,2.8,0,1,0,27,0.9,186,...,4,1270,1366,2396,17,10,10,0,1,1
3,4,1546,0,0.5,1,18,1,25,0.5,96,...,20,295,1752,3893,10,0,7,1,1,0
4,5,1434,0,1.4,0,11,1,49,0.5,108,...,18,749,810,1773,15,8,7,1,0,1


In [38]:
df =  df[['battery_power', 'clock_speed', 'fc','int_memory','m_dep']]
print(df.shape)
df.head()

(1000, 5)


Unnamed: 0,battery_power,clock_speed,fc,int_memory,m_dep
0,1043,1.8,14,5,0.1
1,841,0.5,4,61,0.8
2,1807,2.8,1,27,0.9
3,1546,0.5,18,25,0.5
4,1434,1.4,11,49,0.5


# data_scaled.csv

In [39]:
df= df.astype("float")
df.head()

Unnamed: 0,battery_power,clock_speed,fc,int_memory,m_dep
0,1043.0,1.8,14.0,5.0,0.1
1,841.0,0.5,4.0,61.0,0.8
2,1807.0,2.8,1.0,27.0,0.9
3,1546.0,0.5,18.0,25.0,0.5
4,1434.0,1.4,11.0,49.0,0.5


In [40]:
df_scaled = (df - df.min()) / (df.max() - df.min())
df_scaled.head()

Unnamed: 0,battery_power,clock_speed,fc,int_memory,m_dep
0,0.362241,0.52,0.736842,0.048387,0.0
1,0.227485,0.0,0.210526,0.951613,0.777778
2,0.871915,0.92,0.052632,0.403226,0.888889
3,0.697799,0.0,0.947368,0.370968,0.444444
4,0.623082,0.36,0.578947,0.758065,0.444444


# create artificial missingness for data.csv

In [54]:
nums = np.ones(5000)
nums[:2500] = 0
np.random.shuffle(nums,)
nums = nums.reshape((1000,5))
df_missed = df.copy()

for i in range(len(nums)):
    for j in range(len(nums[0])):
        if nums[i][j]==0:
            df_missed.iloc[i,j]=np.nan
df_missed.head()

Unnamed: 0,battery_power,clock_speed,fc,int_memory,m_dep
0,,1.8,14.0,5.0,0.1
1,841.0,0.5,,61.0,
2,1807.0,,1.0,,0.9
3,,0.5,18.0,25.0,0.5
4,,1.4,,49.0,


# Impute missing data by mean

In [97]:
def fillnan_mean(df_missed):
    df=df_missed.copy()
    for column in list(df.columns[df.isnull().sum() > 0]):
        mean_val = df[column].mean()
        df[column].fillna(mean_val, inplace=True)
    return df
df_impute1=fillnan_mean(df_missed)


In [98]:
df_impute1.head()

Unnamed: 0,battery_power,clock_speed,fc,int_memory,m_dep
0,1270.72837,1.8,14.0,5.0,0.1
1,841.0,0.5,4.837165,61.0,0.522472
2,1807.0,1.52428,1.0,33.952278,0.9
3,1270.72837,0.5,18.0,25.0,0.5
4,1270.72837,1.4,4.837165,49.0,0.522472


# Impute missing data by KNN & weighted KNN

In [123]:
def get_Euclidean_distance(x1,x2):
    res = np.zeros((len(x1),len(x2)))
    for i in range(len(x1)):
        for j in range(len(x2)):
            res[i][j] = np.sqrt(np.sum((x1[i,:] - x2[j,:]) ** 2))
    return res

def fillnan_knn(df_missed, distance, k):
    df=df_missed.copy()
    for index, dis in enumerate(distance):
        neighbors = np.argsort(dis)[1:k+1]
        for j in range(5):
            if np.isnan(df.iloc[index][j]):
                df.iloc[index][j] = np.sum(x1[neighbors])/k
    return df

def fillnan_weighted_knn(df_missed, distance, k):
    df=df_missed.copy()
    for index, dis in enumerate(distance):
        neighbors = np.argsort(dis)[1:k+1]
        for j in range(5):
            if np.isnan(df.iloc[index][j]):
                df.iloc[index][j] = np.sum(np.multiply(x1[neighbors,j],dis[neighbors]))/k
    return df

x1=df_impute1.values
distance = get_Euclidean_distance(x1,x1)
df_impute2=fillnan_knn(df_missed,distance,1)
df_impute3=fillnan_knn(df_missed,distance,3)
df_impute4=fillnan_knn(df_missed,distance,5)
df_impute5=fillnan_weighted_knn(df_missed,distance,1)
df_impute6=fillnan_weighted_knn(df_missed,distance,3)
df_impute7=fillnan_weighted_knn(df_missed,distance,5)


In [128]:
df_impute2.head()

Unnamed: 0,battery_power,clock_speed,fc,int_memory,m_dep
0,1290.92837,1.8,14.0,5.0,0.1
1,841.0,0.5,902.161445,61.0,902.161445
2,1807.0,1851.57475,1.0,1851.57475,0.9
3,1312.750842,0.5,18.0,25.0,0.5
4,1326.789815,1.4,1326.789815,49.0,1326.789815


In [129]:
df_impute3.head()

Unnamed: 0,battery_power,clock_speed,fc,int_memory,m_dep
0,1292.352047,1.8,14.0,5.0,0.1
1,841.0,0.5,909.022611,61.0,909.022611
2,1807.0,1845.59126,1.0,1845.59126,0.9
3,1312.684778,0.5,18.0,25.0,0.5
4,1325.897305,1.4,1325.897305,49.0,1325.897305


In [130]:
df_impute4.head()

Unnamed: 0,battery_power,clock_speed,fc,int_memory,m_dep
0,1292.827071,1.8,14.0,5.0,0.1
1,841.0,0.5,895.833567,61.0,895.833567
2,1807.0,1843.996395,1.0,1843.996395,0.9
3,1317.212476,0.5,18.0,25.0,0.5
4,1325.537164,1.4,1325.537164,49.0,1325.537164


In [131]:
df_impute5.head()

Unnamed: 0,battery_power,clock_speed,fc,int_memory,m_dep
0,2160.238229,1.8,14.0,5.0,0.1
1,841.0,0.5,39.334856,61.0,6.50544
2,1807.0,6.444399,1.0,198.910921,0.9
3,4018.497398,0.5,18.0,25.0,0.5
4,275.375077,1.4,1.048245,49.0,0.151695


In [132]:
df_impute6.head()

Unnamed: 0,battery_power,clock_speed,fc,int_memory,m_dep
0,4552.788241,1.8,14.0,5.0,0.1
1,841.0,0.5,111.314523,61.0,8.806159
2,1807.0,15.369931,1.0,296.006143,0.9
3,6685.599389,0.5,18.0,25.0,0.5
4,966.783922,1.4,3.680167,49.0,0.296597


In [125]:
df_impute7.head()

Unnamed: 0,battery_power,clock_speed,fc,int_memory,m_dep
0,6175.116481,1.8,14.0,5.0,0.1
1,841.0,0.5,96.684457,61.0,10.252029
2,1807.0,17.119283,1.0,309.119147,0.9
3,8578.845712,0.5,18.0,25.0,0.5
4,1325.947379,1.4,4.781262,49.0,0.439657


In [127]:
def cal_mse(df1,df2):
    e = 0
    for i in range(1000):
        for j in range(5):
            if np.isnan(df2.iloc[i,j]):
                e += (df1.iloc[i,j])**2
            else:
                e += (df1.iloc[i,j]-df2.iloc[i,j])**2

    return e/5000

print(cal_mse(df_impute1,df))
print(cal_mse(df_impute2,df))
print(cal_mse(df_impute3,df))
print(cal_mse(df_impute4,df))
print(cal_mse(df_impute5,df))
print(cal_mse(df_impute6,df))
print(cal_mse(df_impute7,df))

19176.537624870347
732203.9622306734
732076.6161024814
731975.0898836509
143537.34438349918
173730.0267967494
227252.0198229978
