## <center> Null Value treatment for Train_Demographic Using Model Based Imputation

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

import joblib

import warnings
warnings.filterwarnings("ignore")

In [2]:
dfd = pd.read_csv(r"C:\Users\Soorya\Documents\capstone-files\TrainData\Data\Train_Demographics.csv",na_values= ['?','MISSEDDATA','MISSINGVALUE','NA'])

In [3]:
# can drop Country as all values are same 
dfd.drop("Country",axis=1,inplace=True)

In [4]:
# seperating/storing the rows with not-null values
df2=dfd.dropna().copy()

In [5]:
df2.head()

Unnamed: 0,CustomerID,InsuredAge,InsuredZipCode,InsuredGender,InsuredEducationLevel,InsuredOccupation,InsuredHobbies,CapitalGains,CapitalLoss
0,Cust10000,35,454776,MALE,JD,armed-forces,movies,56700,-48500
1,Cust10001,36,454776,MALE,JD,tech-support,cross-fit,70600,-48500
2,Cust10002,33,603260,MALE,JD,armed-forces,polo,66400,-63700
3,Cust10003,36,474848,MALE,JD,armed-forces,polo,47900,-73400
4,Cust10004,29,457942,FEMALE,High School,exec-managerial,dancing,0,-41500


In [6]:
df2.shape

(28806, 9)

In [7]:
# index value for filtering rows with null values
df2_index = set(df2.index.values)
dfd_index = set(dfd.index.values)
df3_index = dfd_index.difference(df2_index)

In [8]:
df3 = dfd.filter(items=list(df3_index), axis=0).copy()

In [9]:
# all rows with null values
df3.head()

Unnamed: 0,CustomerID,InsuredAge,InsuredZipCode,InsuredGender,InsuredEducationLevel,InsuredOccupation,InsuredHobbies,CapitalGains,CapitalLoss
24064,Cust39239,30,450746,,High School,other-service,golf,0,-76000
3206,Cust13887,37,459878,,High School,exec-managerial,exercise,23300,0
12041,Cust24494,43,430832,,High School,craft-repair,kayaking,58300,0
7946,Cust19617,31,431389,,College,sales,paintball,55300,-68700
15505,Cust2880,34,471453,,PhD,transport-moving,movies,81300,0


In [10]:
df3.shape

(30, 9)

In [11]:
dfd_notnull = df2.copy()
dfd_null = df3.copy()

## Perform data preparation 

In [12]:
# numerical encoding the InsuredGender column
df2['InsuredGender'] = df2.InsuredGender.astype('category').cat.codes

In [13]:
#onehot encoding

# Converting type of columns to category
df2['InsuredEducationLevel'] = df2['InsuredEducationLevel'].astype('category')
df2['InsuredOccupation'] = df2['InsuredOccupation'].astype('category')
df2['InsuredHobbies'] = df2['InsuredHobbies'].astype('category')

In [14]:
# Create an instance of One-hot-encoder
enc = OneHotEncoder(handle_unknown = 'ignore')
enc.fit(df2[["InsuredEducationLevel","InsuredOccupation","InsuredHobbies"]])
enc_data = pd.DataFrame(enc.transform(df2[["InsuredEducationLevel","InsuredOccupation","InsuredHobbies"]]).toarray())

In [15]:
dfd_enc = df2.join(enc_data)
dfd_enc.drop("CustomerID",axis=1,inplace=True)

In [16]:
dfd_enc.drop(["InsuredEducationLevel","InsuredOccupation","InsuredHobbies"],axis=1,inplace=True)

In [17]:
dfd_enc.shape

(28806, 46)

In [18]:
dfd_enc.head()

Unnamed: 0,InsuredAge,InsuredZipCode,InsuredGender,CapitalGains,CapitalLoss,0,1,2,3,4,...,31,32,33,34,35,36,37,38,39,40
0,35,454776,1,56700,-48500,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,36,454776,1,70600,-48500,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,33,603260,1,66400,-63700,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,36,474848,1,47900,-73400,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,29,457942,0,0,-41500,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
dfd_enc.isnull().sum().sum()

1230

In [20]:
dfd_enc.dropna(inplace=True)

In [21]:
x = dfd_enc.drop('InsuredGender',axis=1)
y = dfd_enc['InsuredGender']

In [22]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

### Training a simple model for missing value imputation

In [23]:
k = list(range(1,30,2))

knn_acc = []

for i in k:
    neigh = KNeighborsClassifier(n_neighbors=i)
    neigh.fit(x_train,y_train)
    knn_acc.append(neigh.score(x_test, y_test))

In [24]:
print("The optimal value for k is",k[np.argmax(np.array(knn_acc))],"with accuracy score",np.max(np.array(knn_acc)))

The optimal value for k is 1 with accuracy score 0.7439193884642112


In [25]:
# Training with the optimal value for k
neigh = KNeighborsClassifier(n_neighbors=k[np.argmax(np.array(knn_acc))])
neigh.fit(x_train,y_train)

KNeighborsClassifier(n_neighbors=1)

In [26]:
joblib.dump(neigh, 'Demographic.joblib')

['Demographic.joblib']

## to predict

In [27]:
# changes for the dataframe with null values
df3.drop(["InsuredGender","CustomerID"],axis=1,inplace=True)

In [28]:
df3['ind'] = np.array([x for x in range(0,30)])
originalindex = list(df3.index.values)

In [29]:
df3.set_index('ind',inplace=True)

In [30]:
df3.head()

Unnamed: 0_level_0,InsuredAge,InsuredZipCode,InsuredEducationLevel,InsuredOccupation,InsuredHobbies,CapitalGains,CapitalLoss
ind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,30,450746,High School,other-service,golf,0,-76000
1,37,459878,High School,exec-managerial,exercise,23300,0
2,43,430832,High School,craft-repair,kayaking,58300,0
3,31,431389,College,sales,paintball,55300,-68700
4,34,471453,PhD,transport-moving,movies,81300,0


In [31]:
# Converting type of columns to category
df3['InsuredEducationLevel'] = df3['InsuredEducationLevel'].astype('category')
df3['InsuredOccupation'] = df3['InsuredOccupation'].astype('category')
df3['InsuredHobbies'] = df3['InsuredHobbies'].astype('category')

enc_array = enc.transform(df3[["InsuredEducationLevel","InsuredOccupation","InsuredHobbies"]]).toarray()

In [32]:
enc_data = pd.DataFrame(enc_array)

In [33]:
dfd_test = df3.join(enc_data)
dfd_test.drop(["InsuredEducationLevel","InsuredOccupation","InsuredHobbies"],axis=1,inplace=True)

In [34]:
dfd_test.head()

Unnamed: 0_level_0,InsuredAge,InsuredZipCode,CapitalGains,CapitalLoss,0,1,2,3,4,5,...,31,32,33,34,35,36,37,38,39,40
ind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,30,450746,0,-76000,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,37,459878,23300,0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,43,430832,58300,0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,31,431389,55300,-68700,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,34,471453,81300,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
dfd_test.shape

(30, 45)

In [36]:
pred = neigh.predict(dfd_test)

In [37]:
pred

array([1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1,
       0, 1, 0, 0, 1, 0, 0, 1], dtype=int8)

In [38]:
#adding predictions to dfd_null dataframe
dfd_null['InsuredGender'] = pred

In [39]:
dfd_null['InsuredGender'] = dfd_null.InsuredGender.apply(lambda x: "MALE" if x else "FEMALE")

In [40]:
dfd_null.head()

Unnamed: 0,CustomerID,InsuredAge,InsuredZipCode,InsuredGender,InsuredEducationLevel,InsuredOccupation,InsuredHobbies,CapitalGains,CapitalLoss
24064,Cust39239,30,450746,MALE,High School,other-service,golf,0,-76000
3206,Cust13887,37,459878,FEMALE,High School,exec-managerial,exercise,23300,0
12041,Cust24494,43,430832,FEMALE,High School,craft-repair,kayaking,58300,0
7946,Cust19617,31,431389,MALE,College,sales,paintball,55300,-68700
15505,Cust2880,34,471453,FEMALE,PhD,transport-moving,movies,81300,0


In [41]:
# appending the 2 dataframe to get the imputed null values for demographics data
dfd_final = dfd_notnull.append(dfd_null)

In [42]:
dfd_final.shape

(28836, 9)

In [43]:
dfd_final.head()

Unnamed: 0,CustomerID,InsuredAge,InsuredZipCode,InsuredGender,InsuredEducationLevel,InsuredOccupation,InsuredHobbies,CapitalGains,CapitalLoss
0,Cust10000,35,454776,MALE,JD,armed-forces,movies,56700,-48500
1,Cust10001,36,454776,MALE,JD,tech-support,cross-fit,70600,-48500
2,Cust10002,33,603260,MALE,JD,armed-forces,polo,66400,-63700
3,Cust10003,36,474848,MALE,JD,armed-forces,polo,47900,-73400
4,Cust10004,29,457942,FEMALE,High School,exec-managerial,dancing,0,-41500


In [44]:
dfd_final['InsuredGender'].value_counts()

FEMALE    15659
MALE      13177
Name: InsuredGender, dtype: int64

In [45]:
#save the file as Demographics.csv
dfd_final.to_csv("Demographics.csv")