In [1]:
import pandas as pd
import numpy as np
from polire import IDW
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from time import time
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import os

### Time feature data

In [2]:
def return_data(fold, month, with_scaling):
    train_input = pd.read_csv(
        "../data/beijing-18/time_feature"
        + "/fold"
        + str(fold)
        + "/train_data_"
        + month
        + ".csv.gz"
    )
    test_input = pd.read_csv(
        "../data/beijing-18/time_feature"
        + "/fold"
        + str(fold)
        + "/test_data_"
        + month
        + ".csv.gz"
    )
    test_output = np.array(test_input["PM25_Concentration"])
    train_output = np.array(train_input["PM25_Concentration"])
    train_input = train_input.drop(
        ["station_id", "PM25_Concentration", "time", "filled"], axis=1
    )
    try:
        test_input = test_input.drop(
            ["PM25_Concentration", "station_id", "time", "filled"], axis=1
        )
    except:
        test_input = test_input.drop(["station_id", "time", "filled"], axis=1)
    #     test_output= test_output.drop(['time'],axis=1)
    if with_scaling:
        scaler = MinMaxScaler().fit(train_input)
        #         train_input = scaler.transform(train_input)
        #         test_input = scaler.transform(test_input)
        train_input = pd.DataFrame(
            scaler.transform(train_input), columns=list(train_input.columns)
        )
        test_input = pd.DataFrame(
            scaler.transform(test_input), columns=list(test_input.columns)
        )
    return train_input, train_output, test_input, test_output


def run_model(model, train_input, train_output, test_input, test_output, ret_output):
    model.fit(np.array(train_input), train_output)
    test_pred = model.predict(np.array(test_input))
    err = mean_squared_error(test_pred, test_output, squared=False)
    mae = mean_absolute_error(test_pred, test_output)
    r2 = r2_score(test_pred, test_output)
    if ret_output:
        return (err, mae, r2), test_pred
    else:
        return (err, mae, r2)

In [3]:
model = IDW(exponent=3)

### With scaling

In [4]:
mean_rmse, mean_mae, mean_r2 = 0, 0, 0

In [5]:
for fold in [0, 1, 2]:
    train_input, train_output, test_input, test_output = return_data(
        fold=fold, month="mar", with_scaling=True
    )
    #     print(train_input,test_input)
    print("Fold: ", fold)
    print("Data received")
    init = time()

    (rmse, mae, r2), test_pred = run_model(
        model, train_input, train_output, test_input, test_output, True
    )
    print("RMSE: ", rmse)
    print("MAE: ", mae)
    print("R2 score: ", r2)
    print("Time taken: ", time() - init)
    test_input = pd.read_csv(
        "../data/beijing-18/time_feature"
        + "/fold"
        + str(fold)
        + "/test_data_"
        + "mar"
        + ".csv.gz"
    )
    test_input["prediction"] = test_pred
    if not os.path.exists(
        "../data/beijing-18/" + "results/results_mar" + "/fold" + str(fold)
    ):
        os.makedirs("../data/beijing-18/" + "results/results_mar" + "/fold" + str(fold))
    test_input.to_csv(
        "../data/beijing-18/"
        + "results/results_mar"
        + "/fold"
        + str(fold)
        + "/IDW_scaled.csv.gz"
    )

    mean_rmse += rmse
    mean_mae += mae
    mean_r2 += r2

Fold:  0
Data received
RMSE:  48.460955404528
MAE:  34.10680153803576
R2 score:  -0.15248450639934874
Time taken:  11.136359214782715
Fold:  1
Data received
RMSE:  40.27899401697979
MAE:  29.245241726873026
R2 score:  0.32580534796286775
Time taken:  9.49681568145752
Fold:  2
Data received
RMSE:  56.103710046129606
MAE:  38.31251828027185
R2 score:  -0.819419768988747
Time taken:  10.869717836380005


In [6]:
np.array([mean_rmse, mean_mae, mean_r2]) / 3

array([48.28121982, 33.88818718, -0.21536631])