### Load data from csv

In [1]:
# Load data from csv
import pandas as pd
import numpy as np
import sidetable
from matplotlib import pyplot as plt
%matplotlib inline

# Read the raw dataset and normalise every header name to lower case in one step.
df = pd.read_csv('data.csv').rename(columns=str.lower)


### Preprocess data

In [2]:
# Positional indices (in the CSV's column order) of the columns that are not needed.
# They must be revisited if the layout of data.csv ever changes.
columns_to_drop = [1, 3, 4, 5, 8, 9, 18, 19, 20, 21, 22, 23, 24, 25, 26, 87]

# 1. drop the unneeded columns (no rows are removed here -- missing feature values
#    are handled by the imputers in the preprocessing pipeline further down),
# 2. make the headers lower case with underscores instead of spaces,
# 3. use the 'id' column as the row index.
df = (
    df.drop(columns=df.columns[columns_to_drop])
      .rename(columns=lambda name: name.lower().replace(' ', '_'))
      .set_index('id')
)

In [3]:
# Convert the 'wage' and 'value' strings into floats, in place (the column names are kept).
# Each entry is expected to look like <currency symbol><number><K or M>.
# The suffix is a unit: K = thousands, M = millions. Simply stripping it would put
# "110.5M" and "110.5K" on the same scale, so the suffix is applied as a multiplier
# and every amount ends up expressed in THOUSANDS (K -> x1, M -> x1000).
_THOUSANDS_PER_UNIT = {'K': 1.0, 'M': 1000.0}


def _currency_to_thousands(col):
    """Parse a Series of currency strings into floats expressed in thousands.

    Parameters
    ----------
    col : pandas.Series of str
        Entries shaped like <symbol><number><suffix>; the first character is
        dropped without being inspected, the last character is the unit suffix.

    Returns
    -------
    pandas.Series of float
        number * 1 for a 'K' suffix, number * 1000 for an 'M' suffix, and NaN
        for every entry that does not end in 'K' or 'M' (including missing values).

    Raises
    ------
    ValueError
        If the text between the symbol and a valid suffix is not a number.
    """
    # Unmapped suffixes (and missing entries) come back as NaN from the dict lookup.
    multiplier = col.str[-1].map(_THOUSANDS_PER_UNIT)
    # Keep the numeric text only where the suffix was recognised.
    digits = col.str[1:-1].where(multiplier.notna())
    return digits.astype('float') * multiplier


for money_col in ('wage', 'value'):
    df[money_col] = _currency_to_thousands(df[money_col])

In [4]:
# Inspect the distinct values now held in the 'wage' column (a sanity check on the numeric conversion above).
df['wage'].unique()

array([565., 405., 290., 260., 355., 340., 420., 455., 380.,  94., 205.,
       125., 285., 225., 145., 240., 315., 200., 130., 300., 215., 100.,
       255., 165., 265., 160., 150., 245., 110.,  77., 115., 210., 195.,
       230., 250., 135., 155., 180., 175., 190., 185.,  21.,  82.,  73.,
        92.,  88.,  96., 170.,  66., 235.,  28., 105.,  38.,  81.,  57.,
        15.,  63.,  22.,  84., 120.,  90.,  72.,  93.,  45.,  74.,  51.,
        42.,  31.,  75.,  25., 140.,  41.,  78.,  53.,  95.,  80.,  43.,
        60.,  85.,  64.,  67.,  18.,  70.,  91.,  20.,  49.,  87.,  86.,
        26.,  29.,  55.,  35.,  33.,  56.,  30.,  11.,  59.,  23.,  46.,
        39.,  32.,  36.,  98.,  54.,  68.,  58.,  27.,  40.,  44.,  19.,
         1.,  61.,  50.,  99.,  17.,  52.,  62.,  12.,  10.,  71.,  14.,
        76.,  48.,  65.,  69.,  24.,  34.,  16.,  37.,  47.,  89.,  nan,
        97.,  79.,  13.,  83.,   6.,   3.,   9.,   8.,   7.,   4.,   2.,
         5.])

In [5]:
# Points awarded to each individual work-rate level.
_WORK_RATE_POINTS = {'Low': 1, 'Medium': 2, 'High': 3}


def work_rate(x):
    """Convert a work-rate label into a numeric score.

    A label has the form '<level>/ <level>' (slash followed by one space), where
    each level is scored with the following map:
        Low = 1 point, Medium = 2 points, High = 3 points
    The result is the sum of the two levels, i.e. an int from 2 to 6.

    Any other input -- a differently formatted string, an unknown level, or a
    non-string value such as NaN -- returns None.
    """
    if not isinstance(x, str):
        return None

    levels = x.split('/ ')
    if len(levels) != 2:
        return None

    try:
        return sum(_WORK_RATE_POINTS[level] for level in levels)
    except KeyError:
        # At least one part is not exactly 'Low', 'Medium' or 'High'.
        return None

# Replace every work-rate label with its numeric score (unrecognised or missing labels become missing values).
df['work_rate'] = df['work_rate'].map(work_rate)

In [6]:
# Helper for the columns from 'ls' through 'rb', whose cells hold two integers joined by a plus sign.
def clean_plus_sign(x):
    """Add up the '+'-separated integers contained in a string.

    For example '88+2' gives 90, and a string without a plus sign gives its
    own integer value. Non-string inputs (such as NaN) return None.

    Raises ValueError if any part of the string is not a valid integer.
    """
    if not isinstance(x, str):
        return None
    return sum(int(part) for part in x.split('+'))
    
# Apply clean_plus_sign to every column in the label range 'ls' .. 'rb' (both ends included).
plus_sign_cols = df.loc[:, 'ls':'rb'].columns
for col_name in plus_sign_cols:
    df[col_name] = df[col_name].map(clean_plus_sign)

In [7]:
# Separate the target ('value', copied so it is independent of df) from the feature columns.
y = df['value'].copy()
X = df.drop('value', axis=1)

In [8]:
# Retrieve num_cols and cat_cols, both listed in the same order as X's columns.
# _get_numeric_data() is a private pandas helper and is kept so the numeric selection is unchanged;
# a public alternative is X.select_dtypes(include='number').columns.
num_cols = list(X._get_numeric_data().columns)

# cat_cols = every remaining column. This is an ordered scan rather than a set difference:
# the iteration order of a set of strings can differ between interpreter sessions, which
# would shuffle the encoded columns produced downstream from one run to the next.
_numeric_names = set(num_cols)  # used for fast membership tests only, never iterated
cat_cols = [col for col in X.columns if col not in _numeric_names]

In [9]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer, make_column_selector

# Categorical features: fill missing entries with the most frequent category, then
# one-hot encode, dropping the first level of each feature. Categories not seen
# during fit raise an error at transform time (handle_unknown='error').
cat_pipe = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(handle_unknown='error', drop='first'),
)

# Numeric features: KNN-based imputation followed by standardisation.
num_pipe = make_pipeline(
    KNNImputer(),
    StandardScaler(),
)

# Send each group of columns through its own pipeline; the categorical block comes first in the output.
preprocess_pipeline = make_column_transformer(
    (cat_pipe, cat_cols),
    (num_pipe, num_cols),
)

In [10]:
# Fit the column transformer on X and display the transformed feature matrix as the cell output.
preprocess_pipeline.fit_transform(X)

array([[ 0.        ,  1.25867833,  4.01828714, ..., -0.07475808,
        -0.14042651, -0.485802  ],
       [ 1.        ,  1.68696087,  4.01828714, ..., -0.07475808,
        -0.14042651, -0.31849344],
       [ 1.        ,  0.18797198,  3.72879875, ..., -0.07475808,
        -0.08164355, -0.31849344],
       ...,
       [ 1.        , -1.95344072, -2.78469008, ..., -0.37814431,
        -0.61069023, -0.2069544 ],
       [ 1.        , -1.73929945, -2.78469008, ..., -0.13543533,
        -0.4931243 , -0.43003248],
       [ 1.        , -1.95344072, -2.92943428, ..., -0.43882155,
        -0.25799244, -0.43003248]])