In [73]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _, names_here in os.walk('/kaggle/input'):
    for fname in names_here:
        print(os.path.join(folder, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
# Load the space-delimited training table, keep the columns of interest,
# and draw a pairwise scatter/histogram grid for them.
train_toy_data=pd.read_csv('/kaggle/input/0dlsw/all.csv',sep=' ')
select_cols=['mu_s(imus)','mu_d(imud)','dc(idc)','t','V']
select_df_data=train_toy_data.loc[:,select_cols]

# 10 histogram bins on the diagonal panels
sns.pairplot(select_df_data,diag_kws=dict(bins=10))
plt.show()

In [67]:
# Time-Lag : series_to_supervised()
def series_to_supervised(data,n_in=1,n_out=1,dropnan=True,fill_value=0):
    """Reframe a sequence as a table of lagged / lead columns.

    Parameters
    ----------
    data : list, list of lists, 1-D/2-D array, Series or DataFrame
        Observations ordered in time, one row per step. Anything accepted
        by ``pd.DataFrame`` works; each resulting column is one variable.
    n_in : int
        Number of lag steps to emit: columns ``varK(t-n_in)`` ... ``varK(t-1)``.
    n_out : int
        Number of lead steps to emit: columns ``varK(t)`` ... ``varK(t+n_out-1)``.
    dropnan : bool
        Only consulted when ``fill_value`` is None: if True, rows that
        contain NaN (the incomplete rows at the edges) are removed.
    fill_value : scalar or None
        Replacement for every NaN in the result. The default ``0`` keeps the
        output the same length as the input, so the lag columns can be
        stacked next to the original rows. Pass ``None`` to keep the NaN
        gaps (and let ``dropnan`` decide whether those rows are dropped).

    Returns
    -------
    pd.DataFrame
        One column per (variable, step) pair, ordered from oldest lag to
        furthest lead.
    """
    df=pd.DataFrame(data)
    # Count variables on the constructed frame so that plain lists, lists of
    # rows, 1-D arrays/Series and 2-D inputs are all handled the same way.
    n_vars=df.shape[1]
    cols,names=[],[]
    # Lag columns: t-n_in, ..., t-1
    for i in range(n_in,0,-1):
        cols.append(df.shift(i))
        names+=['var%d(t-%d)' % (j+1, i) for j in range(n_vars)]
    # Current / lead columns: t, t+1, ..., t+n_out-1
    for i in range(0,n_out):
        cols.append(df.shift(-i))
        if i==0:
            names+=['var%d(t)' % (j+1) for j in range(n_vars)]
        else:
            names+=['var%d(t+%d)' % (j+1, i) for j in range(n_vars)]
    agg=pd.concat(cols, axis=1)
    agg.columns=names
    if fill_value is not None:
        # Filling leaves no NaN behind, so there is nothing left to drop.
        return agg.fillna(fill_value)
    if dropnan:
        # dropna returns a new frame; the result must be kept.
        agg=agg.dropna()
    return agg

SERIES_LEN=5000    # rows per individual time series inside the CSV files
N_LAGS=3           # number of past steps turned into lag features
LAG_SOURCE_COL=6   # positional index of the CSV column that gets lagged
LAG_COLS=['var1(t-%d)' % k for k in range(N_LAGS,0,-1)]  # t-3, t-2, t-1

def build_lag_frame(csv_path,series_len=SERIES_LEN,n_lags=N_LAGS,col=LAG_SOURCE_COL):
    """Build lag columns for every consecutive `series_len`-row run of a CSV.

    The file is read once (space-delimited). The column at positional index
    `col` is cut into consecutive runs of `series_len` rows, and
    `series_to_supervised` is applied to each run separately so that lags
    never reach across the boundary between two series. The per-run frames
    are concatenated in file order; each run keeps its own 0-based index.

    All rows of the file are processed (a shorter trailing run is kept), so
    the result has one row per data row of the CSV.
    """
    values=pd.read_csv(csv_path,delimiter=' ').iloc[:,col]
    pieces=[
        series_to_supervised(list(values.iloc[start:start+series_len]),n_lags)
        for start in range(0,len(values),series_len)
    ]
    # Concatenate once: DataFrame.append no longer exists in pandas >= 2.0 and
    # appending inside a loop copies the accumulated frame on every iteration.
    return pd.concat(pieces)

t_lag=build_lag_frame('/kaggle/input/0dlsw/all.csv')

print(t_lag[LAG_COLS[0]])
lag_t=np.array(t_lag[LAG_COLS])
print(np.shape(lag_t))

test_t_lag=build_lag_frame('/kaggle/input/testdata/test.csv')

test_lag_t=np.array(test_t_lag[LAG_COLS])
print(np.shape(test_lag_t))

In [68]:
# Random Forest Regression: assemble feature matrices and target vectors
base_features=['mu_s(imus)','mu_d(imud)','dc(idc)','t']

# Train data: the four base columns followed by the lag columns
X_train=np.concatenate((train_toy_data[base_features].to_numpy(),lag_t),axis=1)
print(X_train.shape) # 625000 = 5000 (t) * 5 (mu_s) * 5 (mu_d) * 5(dc)
# Train target
y_train=train_toy_data['V'].to_numpy(copy=True)
print(y_train.shape)

# Test data: same column layout as the training matrix
test_toy_data=pd.read_csv('/kaggle/input/testdata/test.csv',sep=' ')
X_test=np.concatenate((test_toy_data[base_features].to_numpy(),test_lag_t),axis=1)
print(X_test.shape) # 40000 = 5000 (t) * 2 (mu_s) * 2 (mu_d) * 2 (dc)
# Test target
y_test=test_toy_data['V'].to_numpy(copy=True)
print(y_test.shape)


In [74]:
# RF model
# A fixed seed makes the bootstrap samples and feature draws repeatable, so
# re-running the notebook reports the same scores.
rfmodel=RandomForestRegressor(random_state=0)
rfmodel.fit(X_train,y_train)

# NOTE: this R^2 is computed on the rows the model was fitted on, so it
# describes goodness of fit, not performance on unseen data.
score=rfmodel.score(X_train,y_train)
print("R-squared:", score)

# Held-out error on the test set
y_pred=rfmodel.predict(X_test)
mse=mean_squared_error(y_test,y_pred)
print("MSE: ", mse)
# RMSE is the square root of the MSE (not half of it).
print("RMSE: ", np.sqrt(mse))


# Change param: https://www.cnblogs.com/pinard/p/6160412.html (manual hyperparameter tuning)
#estimators = np.arange(10, 200, 10)
#scores = []
#for n in estimators:
    #rfmodel.set_params(n_estimators=n)
    #rfmodel.fit(X_train, y_train)
    #scores.append(rfmodel.score(X_test, y_test))

In [87]:
# Overlay ground truth and prediction for two of the test series.
# Column 3 of X_test is 't'; each series occupies 5000 consecutive rows, so a
# series starts at a multiple of 5000 (0 for the first, 15000 for the fourth).
plt.figure(figsize=(10,10))
for label_id,start in ((1,0),(2,15000)):
    rows=slice(start,start+5000)
    plt.plot(X_test[rows,3],y_test[rows],label='groundtruth_%d' % label_id,linewidth=1)
    plt.plot(X_test[rows,3],y_pred[rows],label='predicted_%d' % label_id,linewidth=2)
plt.xlabel('t')
plt.ylabel('V')
plt.legend()
plt.show()


In [None]:
# TensorFlow for linear regression with multiple features
# https://blog.csdn.net/zhangchao19890805/article/details/82422333

# But it is HARD for linear regression to predict multi-input, multi-output targets.
# Instead, use an LSTM (Long Short-Term Memory), a kind of RNN (Recurrent Neural Network), in TensorFlow
# https://cloud.tencent.com/developer/article/1041442
