In [156]:
# Standard libraries
import os
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from plotnine import *
from IPython.display import display

# Machine Learning
import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from scipy.cluster import hierarchy as hc
from fastai.imports import *

In [157]:
# Two noise variants of nasa93dem.csv live under etc/data; this run reads the
# high-noise one. Swap in the commented line to process the low-noise variant.
#data = pd.read_csv('etc/data/lownoise/nasa93dem.csv')
data = pd.read_csv(filepath_or_buffer='etc/data/highnoise/nasa93dem.csv')
data.shape

(93, 29)

In [158]:
# Look at the first record to see every column name next to a raw value.
data.iloc[0]

idX             1
centerX         2
YearX        1979
prec            h
flex            h
resl            h
team           vh
pmat            h
rely            h
data            l
cplx            h
ruse            n
docu            ?
time            n
stor            n
pvol            l
acap            n
pcap            n
pcon            n
apex            n
plex            n
ltex            h
tool            n
site            n
sced            l
Kloc+        25.9
Effort-     117.6
Defects-      808
Months-      15.3
Name: 0, dtype: object

In [159]:
# The enabling import has to run before IterativeImputer can be imported,
# so the order of these two lines matters.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Fixed random_state so repeated runs produce the same imputed values.
imp = IterativeImputer(random_state=0, max_iter=10)

In [160]:
# The CSV marks missing entries with '?'; turn them into real NaN values
# before converting the frame to a plain numpy array.
data.replace(to_replace='?', value=np.nan, inplace=True)
datanp = data.to_numpy()
# Row 38 holds one of the missing entries, which makes it a handy spot check.
datanp[38]

array(['39', '2', '1986', 'h', 'h', 'h', 'vh', 'n', 'n', 'n', 'h', 'n',
       'n', 'n', 'n', 'n', 'n', 'n', 'n', nan, 'n', 'n', 'n', 'n', 'n',
       8.0, 42.0, 420, 12.5], dtype=object)

In [161]:
from sklearn.preprocessing import LabelEncoder

# Columns 3..24 are the categorical rating attributes (prec .. sced). Each one
# gets its own encoder, stored by column index so the integer codes can be
# turned back into labels later. The encoded values overwrite the column.
encoders = {}
for idx in range(3, 25):
    column_encoder = encoders[idx] = LabelEncoder()
    datanp[:, idx] = column_encoder.fit_transform(datanp[:, idx])

In [162]:
# Print the label set learned for each encoded column; a nan entry means the
# column contained missing values.
for col_idx in range(3, 25):
    print(encoders[col_idx].classes_)

['h' nan]
['h' nan]
['h' nan]
['vh' nan]
['h' 'l' 'n' nan]
['h' 'l' 'n' 'vh']
['h' 'l' 'n' 'vh' nan]
['h' 'l' 'n' 'vh' 'xh' nan]
['n' nan]
['n' nan]
['h' 'n' 'vh' 'xh' nan]
['h' 'n' 'vh' 'xh' nan]
['h' 'l' 'n' nan]
['h' 'n' 'vh' nan]
['h' 'n' 'vh' nan]
['n' nan]
['h' 'l' 'n' 'vh' nan]
['h' 'l' 'n' 'vl' nan]
['h' 'l' 'n' 'vl' nan]
['h' 'n' nan]
['n' nan]
['l' 'n' nan]


In [163]:
def addnan(encoder, datanp, col):
    """Restore missing values in one encoded column of ``datanp``, in place.

    The encoder treats NaN as one more class and assigns it an integer code.
    This looks that code up in ``encoder.classes_`` and writes ``np.nan`` back
    into every cell of column ``col`` that holds it. When the encoder has no
    NaN class the array is left untouched.

    Parameters
    ----------
    encoder : object with a ``classes_`` sequence
        Fitted encoder; a label's position in ``classes_`` is taken as its code.
    datanp : numpy.ndarray
        2-D array holding the encoded data. Modified in place, so its dtype
        must be able to store NaN (object or float).
    col : int
        Index of the column to process.

    Returns
    -------
    None
    """
    # NaN never compares equal to itself, so `np.nan in classes` and
    # `classes.index(np.nan)` only succeed when the stored object *is* the
    # np.nan singleton. Test for NaN explicitly so float('nan') and numpy
    # floating NaNs are recognised as well.
    nan_code = next(
        (
            code
            for code, label in enumerate(encoder.classes_)
            if isinstance(label, (float, np.floating)) and label != label
        ),
        None,
    )
    if nan_code is None:
        return

    for row, value in enumerate(datanp[:, col]):
        if value == nan_code:
            # One tuple index writes straight into the array; no chained
            # datanp[row][col] lookup needed.
            datanp[row, col] = np.nan

In [164]:
# Put NaN back wherever an encoder wrote its NaN code, so that the imputer
# treats those cells as missing rather than as a real category.
for col_idx in range(3, 25):
    addnan(encoder=encoders[col_idx], datanp=datanp, col=col_idx)

In [165]:
# Fit the imputer on the whole matrix and fill in its NaN cells.
dataimp = imp.fit_transform(X=datanp)
# Spot-check one column of the result.
dataimp[:, 6]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0.])

In [166]:
# Column 7 of the encoded input array, for comparison with the imputed data.
datanp[:, 7]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0,
       0, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, nan, 2, 0, 2, 2, 0, 0, 0, 0,
       2, nan, 0, 0, 1, 1, nan, 1, 1, 1, 1, 0, 0, 0, 1, nan, 0, 0, 2, 2,
       2, nan, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 2, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 2, 2, 2], dtype=object)

In [208]:
# Inspect column 16 of the imputed matrix.
dataimp[:, 16]

array([ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  1.,  1.,  1.,  1.,
        2.,  2.,  2.,  2.,  2.,  2.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,  0.,  1.,  0.,  0., -0.,
        0.,  1.,  1.,  2.,  0.,  1.,  0.,  2.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0., -0.,  0.,  0.,  0.,  0.,  0.,  0., -0.,  0.,
        0.,  0.,  1.,  0.,  2.,  2.,  0.,  0., -0.,  0.,  0.,  0.,  1.,
        1.,  1.])

In [209]:
# The imputer returns floats. Snap every column except the last four to the
# nearest whole number (these include the category codes that get decoded
# later), and do the same for the second-to-last column.
dataimp[:, :-4] = dataimp[:, :-4].round()
dataimp[:, -2] = dataimp[:, -2].round()

In [210]:
# Rebuild the table: the first three columns as they are, columns 3..24
# decoded back to their string labels, then the remaining numeric columns.
decoded_columns = []
for idx in range(3, 25):
    classes = encoders[idx].classes_
    # Number of real (non-NaN) labels. This assumes NaN, when present, is the
    # last entry of classes_ -- which is what the printed classes_ show -- so
    # the real labels own codes 0 .. n_labels - 1.
    n_labels = sum(
        not (isinstance(label, (float, np.floating)) and label != label)
        for label in classes
    )
    codes = dataimp[:, idx].astype(np.int32)
    if n_labels:
        # The imputer's predictions are not bounded to the observed codes.
        # A rounded value below 0 or past the last class would make
        # inverse_transform raise, and a value equal to the NaN code would
        # decode straight back to NaN. Clamp to the nearest real label.
        codes = np.clip(codes, 0, n_labels - 1)
    decoded_columns.append(encoders[idx].inverse_transform(codes))

# Stack once at the end instead of re-copying the growing array every pass.
res = np.column_stack([dataimp[:, :3], *decoded_columns, dataimp[:, 25:]])
res

array([[1.0, 2.0, 1979.0, ..., 117.6, 808.0, 15.3],
       [2.0, 2.0, 1979.0, ..., 117.6, 767.0, 15.0],
       [3.0, 2.0, 1979.0, ..., 31.2, 240.0, 10.1],
       ...,
       [91.0, 2.0, 1981.0, ..., 480.0, 1253.0, 21.5],
       [92.0, 2.0, 1983.0, ..., 12.0, 477.0, 15.4],
       [93.0, 2.0, 1983.0, ..., 38.0, 231.0, 12.0]], dtype=object)

In [211]:
# Wrap the rebuilt matrix in a DataFrame that reuses the original column
# headers, then write it out. Use the commented file name instead when the
# low-noise input was processed.
dfnew = pd.DataFrame(data=res, columns=data.columns)
#dfnew.to_csv("iter_low_nasa.csv", index=False)
dfnew.to_csv("iter_high_nasa.csv", index=False)