In [235]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from zipfile import ZipFile
from functools import reduce

from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

In [236]:
!pip install -q gdown httpimport
!gdown 'https://github.com/wenjie-hoo/baltic_ml/raw/main/2022.11.07-merged-single-observation.zip'

Downloading...
From: https://github.com/wenjie-hoo/baltic_ml/raw/main/2022.11.07-merged-single-observation.zip
To: /Users/clarkhu/Desktop/ML/baltic_ml/2022.11.07-merged-single-observation.zip
100%|████████████████████████████████████████| 329k/329k [00:00<00:00, 4.35MB/s]


In [237]:
# load data to data frame
zip_file = ZipFile('./2022.11.07-merged-single-observation.zip')
dfs = {text_file.filename: pd.read_csv(zip_file.open(text_file.filename))
       for text_file in zip_file.infolist()
       if text_file.filename.endswith('.csv') and not text_file.filename[37:].startswith('.')}

df_list=[]
for df in dfs.values():
       df['DATE'] = pd.to_datetime(df['DATE'],dayfirst = True)
       df_list.append(df)

df = reduce(lambda left,right: pd.merge(left.drop_duplicates(subset=['DATE','DEPTH']),right.drop_duplicates(subset=['DATE','DEPTH']),on=['DATE','DEPTH'],how='outer'), df_list)
# df.to_csv('xxxxx.csv')

In [238]:
df_missing = df.drop(['DATE'],axis=1)
# df_missing = df.copy()
missing = df_missing.isna().sum()
missing = pd.DataFrame(data={'elements': missing.index,'missing':missing.values})
missing = missing[~missing['missing'].isin([0])]
missing['proportion'] =  missing['missing']/df_missing.shape[0]
missing.sort_values(by='proportion',ascending=False)


Unnamed: 0,elements,missing,proportion
8,PH,4011,0.841233
12,TN,2167,0.454488
5,NO2,2062,0.432466
4,NH4,1998,0.419044
11,SIO2,1920,0.402685
6,NO3,1886,0.395554
10,SECCHI,1796,0.376678
1,CHLORA,1773,0.371854
9,PO4,1202,0.252097
13,TP,801,0.167995


In [239]:
df_missing[missing['elements']].info()
df_missing = df_missing.drop(['DEPTH'],axis=1)
df_missing = df_missing.drop(['PH'],axis=1) # PH has too many missing value
# df_missing.head

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4768 entries, 0 to 4767
Data columns (total 13 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   CHLORA  2995 non-null   float64
 1   CTDSAL  4510 non-null   float64
 2   CTDTMP  4574 non-null   float64
 3   NH4     2770 non-null   float64
 4   NO2     2706 non-null   float64
 5   NO3     2882 non-null   float64
 6   OXY     4406 non-null   float64
 7   PH      757 non-null    float64
 8   PO4     3566 non-null   float64
 9   SECCHI  2972 non-null   float64
 10  SIO2    2848 non-null   float64
 11  TN      2601 non-null   float64
 12  TP      3967 non-null   float64
dtypes: float64(13)
memory usage: 521.5 KB


In [240]:
n_samples = df_missing.shape[0]
n_features = df_missing.shape[1]

In [241]:
rng = np.random.RandomState(0)
missing_rate = 0.5
n_missing_samples = int(np.floor(n_samples * n_features * missing_rate))
n_missing_samples 

28608

In [242]:
missing_features = rng.randint(0,n_features,n_missing_samples)
missing_samples = rng.randint(0,n_samples,n_missing_samples)

In [243]:
X_missing = df_missing.copy()
y_missing = df_missing.copy()
y_missing.dropna(inplace=True)
X_missing = pd.DataFrame(X_missing)
y_missing = pd.DataFrame(y_missing)

In [244]:
X_missing

Unnamed: 0,CHLORA,CTDSAL,CTDTMP,NH4,NO2,NO3,OXY,PO4,SECCHI,SIO2,TN,TP
0,2.9,17.9,2.2,,,,515.6,0.03,,,,0.68
1,26.5,20.0,1.5,,,,396.9,0.27,,,,2.26
2,2.8,17.8,2.2,,,,421.9,0.15,,,,1.19
3,7.9,18.7,1.9,,,,478.1,0.10,,,,4.91
4,6.6,18.5,2.1,,,,471.9,0.16,,,,0.84
...,...,...,...,...,...,...,...,...,...,...,...,...
4763,,20.5,1.4,,,,375.0,0.55,,,,0.84
4764,,20.5,1.3,,,,412.5,0.59,,,,1.13
4765,,20.4,1.6,,,,356.3,0.73,,,,0.90
4766,,20.4,1.5,,,,462.5,0.42,,,,0.58


In [245]:
# fill with mean value
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
X_missing_mean = imp_mean.fit_transform(X_missing)

In [246]:
X_missing_mean = pd.DataFrame(X_missing_mean)
X_missing_mean.isnull().sum() # check nan value

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
dtype: int64

In [247]:
# fill with 0
imp_0 = SimpleImputer(missing_values=np.nan,strategy="constant",fill_value=0)
X_missing_0 = imp_0.fit_transform(X_missing)

In [248]:
# regression method
X_missing_reg = X_missing.copy()  
sortindex = np.argsort(X_missing_reg.isnull().sum(axis=0)).values  
# sortindex


In [249]:
for i in sortindex:
    print(i)

2
1
6
11
7
0
8
5
9
3
4
10


In [250]:
for i in sortindex:
    df_ = X_missing_reg
    
    fillc = df_.iloc[:,i] 
    print(fillc)
    df_ = pd.concat([df_.iloc[:,df_.columns != i],pd.DataFrame(y_missing)],axis=1)

    df_0 = SimpleImputer(missing_values=np.nan,strategy='constant',fill_value=0).fit_transform(df_)
    # print(df_0)

    Ytrain = fillc[fillc.notnull()]  
    Ytest = fillc[fillc.isnull()]  
    
    Xtrain = df_0[Ytrain.index,:]  
    Xtest = df_0[Ytest.index,:]  
    # print('Xtest:',Xtest)
    rfc = RandomForestRegressor(n_estimators = 100)
    rfc = rfc.fit(Xtrain,Ytrain)  
    Ypredict = rfc.predict(Xtest)

    X_missing_reg.loc[X_missing_reg.iloc[:,i].isnull(),i] = Ypredict

0       2.2
1       1.5
2       2.2
3       1.9
4       2.1
       ... 
4763    1.4
4764    1.3
4765    1.6
4766    1.5
4767    1.6
Name: CTDTMP, Length: 4768, dtype: float64
0       17.9
1       20.0
2       17.8
3       18.7
4       18.5
        ... 
4763    20.5
4764    20.5
4765    20.4
4766    20.4
4767    20.4
Name: CTDSAL, Length: 4768, dtype: float64




0       515.6
1       396.9
2       421.9
3       478.1
4       471.9
        ...  
4763    375.0
4764    412.5
4765    356.3
4766    462.5
4767    343.8
Name: OXY, Length: 4768, dtype: float64




0       0.68
1       2.26
2       1.19
3       4.91
4       0.84
        ... 
4763    0.84
4764    1.13
4765    0.90
4766    0.58
4767    1.10
Name: TP, Length: 4768, dtype: float64




0       0.03
1       0.27
2       0.15
3       0.10
4       0.16
        ... 
4763    0.55
4764    0.59
4765    0.73
4766    0.42
4767    0.54
Name: PO4, Length: 4768, dtype: float64




0        2.9
1       26.5
2        2.8
3        7.9
4        6.6
        ... 
4763     NaN
4764     NaN
4765     NaN
4766     NaN
4767     NaN
Name: CHLORA, Length: 4768, dtype: float64




0      NaN
1      NaN
2      NaN
3      NaN
4      NaN
        ..
4763   NaN
4764   NaN
4765   NaN
4766   NaN
4767   NaN
Name: SECCHI, Length: 4768, dtype: float64




0      NaN
1      NaN
2      NaN
3      NaN
4      NaN
        ..
4763   NaN
4764   NaN
4765   NaN
4766   NaN
4767   NaN
Name: NO3, Length: 4768, dtype: float64




0      NaN
1      NaN
2      NaN
3      NaN
4      NaN
        ..
4763   NaN
4764   NaN
4765   NaN
4766   NaN
4767   NaN
Name: SIO2, Length: 4768, dtype: float64




0      NaN
1      NaN
2      NaN
3      NaN
4      NaN
        ..
4763   NaN
4764   NaN
4765   NaN
4766   NaN
4767   NaN
Name: NH4, Length: 4768, dtype: float64




0      NaN
1      NaN
2      NaN
3      NaN
4      NaN
        ..
4763   NaN
4764   NaN
4765   NaN
4766   NaN
4767   NaN
Name: NO2, Length: 4768, dtype: float64




0      NaN
1      NaN
2      NaN
3      NaN
4      NaN
        ..
4763   NaN
4764   NaN
4765   NaN
4766   NaN
4767   NaN
Name: TN, Length: 4768, dtype: float64




In [257]:
X_missing_reg.to_csv('aaxxxxx.csv')

In [258]:
X = [X_missing_mean,X_missing_0, X_missing_reg]

mse = []

for x in X:
    estimator = RandomForestRegressor(random_state=0, n_estimators=100) 
    scores = cross_val_score(estimator,x,X_missing_mean,scoring="neg_mean_squared_error",cv=5).mean()
    mse.append(scores * -1)




ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/clarkhu/opt/anaconda3/envs/ml/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/clarkhu/opt/anaconda3/envs/ml/lib/python3.8/site-packages/sklearn/ensemble/_forest.py", line 331, in fit
    X, y = self._validate_data(
  File "/Users/clarkhu/opt/anaconda3/envs/ml/lib/python3.8/site-packages/sklearn/base.py", line 596, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/Users/clarkhu/opt/anaconda3/envs/ml/lib/python3.8/site-packages/sklearn/utils/validation.py", line 1074, in check_X_y
    X = check_array(
  File "/Users/clarkhu/opt/anaconda3/envs/ml/lib/python3.8/site-packages/sklearn/utils/validation.py", line 899, in check_array
    _assert_all_finite(
  File "/Users/clarkhu/opt/anaconda3/envs/ml/lib/python3.8/site-packages/sklearn/utils/validation.py", line 146, in _assert_all_finite
    raise ValueError(msg_err)
ValueError: Input X contains NaN.
RandomForestRegressor does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values
