In [1]:
import numpy as np
import pandas as pd
import scipy.stats  as stats
from collections import deque
import time
from tqdm import tqdm

# reduce memory
def reduce_mem(df, use_float16=True):
    """Downcast numeric columns of ``df`` to the smallest dtype that holds their range.

    The frame is modified in place and also returned, so both
    ``reduce_mem(df)`` and ``df = reduce_mem(df)`` work.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are considered; every other column (object, bool, unsigned, int8, ...) is
    left untouched. A column is never converted to a wider dtype than it
    already has.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink.
    use_float16 : bool, default True
        When True, float columns whose range fits are stored as float16.
        float16 keeps only about 3 significant decimal digits, so large values
        are rounded; pass False to stop at float32.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.
    """
    starttime = time.time()
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_targets = (np.int8, np.int16, np.int32)
    float_targets = (np.float16, np.float32) if use_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtype
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        # min()/max() are NaN for empty or all-NaN columns: nothing to decide.
        if pd.isnull(c_min) or pd.isnull(c_max):
            continue

        if str(col_type)[:3] == 'int':
            candidates, limits = int_targets, np.iinfo
        else:
            candidates, limits = float_targets, np.finfo

        # Candidates are ordered narrowest first; take the first one that fits.
        for target in candidates:
            # Stop as soon as the candidate is not narrower than the current
            # dtype, so a column is never upcast.
            if np.dtype(target).itemsize >= col_type.itemsize:
                break
            info = limits(target)
            # Bounds are inclusive: a value equal to the dtype limit still fits.
            if info.min <= c_min and c_max <= info.max:
                df[col] = df[col].astype(target)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    # Guard the percentage against a zero-sized starting frame.
    reduction = 100*(start_mem-end_mem)/start_mem if start_mem else 0.0
    print('-- Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction),time spend:{:2.2f} min'.format(end_mem,
                                                                                                           reduction,
                                                                                                           (time.time()-starttime)/60))
    return df

# name: stem of the output file; the result is written to "<name>.csv.gz"

def naive_forecast(prediction, name):
    """Build a quantile submission from point forecasts and write it to disk.

    The first half of ``prediction`` (rows) is treated as the validation
    forecasts. These are aggregated at several grouping levels, and for each
    of nine quantiles a spread derived from the squared error against the
    actuals in ``./data/sales_train_evaluation.csv`` is added to the
    aggregated forecast. The resulting rows are duplicated with
    ``_validation`` replaced by ``_evaluation`` in the id, downcast with
    ``reduce_mem`` and written to ``name + ".csv.gz"``.

    Parameters
    ----------
    prediction : pandas.DataFrame
        Point forecasts; column 0 is expected to be ``id`` followed by the
        28 forecast columns (positions 1..28 are used as forecast columns).
        The ids of the first half must match ``sales_train_validation.csv``.
    name : str
        Stem of the output file; ``".csv.gz"`` is appended.

    Returns
    -------
    None
        The result is only written to disk.
    """
    def get_couple_group_preds_val(pred, level1, level2):
        """Quantile rows for the two-key grouping ``(level1, level2)``."""
        df = pred.groupby([level1, level2])[cols].sum()
        n_groups = len(df)
        # One block of n_groups rows per quantile, in the order of `qs`.
        q = np.repeat(qs, n_groups)
        z = np.repeat(z_scores, n_groups)
        df = pd.concat([df]*len(qs), axis=0, sort=False)
        df.reset_index(inplace = True)
        y_hat = df[pred_cols].to_numpy()
        y = df[actual_cols].to_numpy()
        sum_errs = np.sum((y - y_hat)**2, axis=1) # the residuals
        # NOTE(review): len(y) is the number of stacked rows (9 * groups), not
        # the number of forecast days, so the divisor depends on the grouping
        # level. Left unchanged because it determines the submitted values --
        # confirm whether y.shape[1] was intended.
        stdev = np.sqrt(1/(len(y)-2) * sum_errs) # the sd of residuals
        # stdev *= 0.5
        # Plain numpy arrays here: indexing a pandas Series with [:, None]
        # is rejected by pandas >= 2.0.
        df[pred_cols] += (z*stdev)[:, None]
        df["id"] = [f"{lev1}_{lev2}_{q:.3f}_validation" for lev1,lev2, q in
                    zip(df[level1].values,df[level2].values, q)]
        df = df[["id"]+list(cols)]
        # Keep the id plus the 28 forecast columns; drop the actuals.
        return df.iloc[:,:29]

    def get_group_preds_val(pred, level):
        """Quantile rows for the single-key grouping ``level``."""
        df = pred.groupby(level)[cols].sum()
        n_groups = len(df)
        # One block of n_groups rows per quantile, in the order of `qs`.
        q = np.repeat(qs, n_groups)
        z = np.repeat(z_scores, n_groups)
        df = pd.concat([df]*len(qs), axis=0, sort=False)
        df.reset_index(inplace = True)
        y_hat = df[pred_cols].to_numpy()
        y = df[actual_cols].to_numpy()
        sum_errs = np.sum((y - y_hat)**2, axis=1) # the residuals
        # NOTE(review): see the same divisor in get_couple_group_preds_val --
        # len(y) counts rows, not forecast days; confirm intent.
        stdev = np.sqrt(1/(len(y)-2) * sum_errs) # the sd of residuals
        # Single-key levels use a damped spread (the two-key levels do not).
        stdev *= 0.4
        # Plain numpy arrays here: indexing a pandas Series with [:, None]
        # is rejected by pandas >= 2.0.
        df[pred_cols] += (z*stdev)[:, None]
        if level != "id":
            df["id"] = [f"{lev}_X_{q:.3f}_validation" for lev, q in zip(df[level].values, q)]
        else:
            df["id"] = [f"{lev.replace('_validation', '')}_{q:.3f}_validation" for lev, q in zip(df[level].values, q)]
        df = df[["id"]+list(cols)]

        # Keep the id plus the 28 forecast columns; drop the actuals.
        return df.iloc[:,:29]

    y_hat_val = prediction.iloc[:len(prediction)//2,:] # y_hat
    y_val = pd.read_csv('./data/sales_train_evaluation.csv').iloc[:,1919:] # the y label
    df_y_val = pd.concat([y_hat_val, y_val], axis=1)

    # Everything after the id column: 28 forecast columns, then the actuals.
    cols = df_y_val.iloc[:,1:].columns
    pred_cols = cols[:28]
    actual_cols = cols[28:]
    qs = np.array([0.005,0.025,0.165,0.25, 0.5, 0.75, 0.835, 0.975, 0.995])

    # Standard-normal multipliers, one per entry of `qs`: the median gets 0 and
    # each central interval contributes its lower/upper bound symmetrically.
    dq = deque()
    dq.append(0.0)
    for q in [0.5, 0.67, 0.95, 0.99]:
        lower, upper = stats.norm.interval(q)
        dq.appendleft(lower)
        dq.append(upper)

    ratios = pd.Series(list(dq), index=qs).round(3)
    z_scores = ratios.to_numpy()

    # Only the hierarchy keys are needed from the validation file.
    key_cols = ["id", "item_id", "dept_id", "cat_id", "store_id", "state_id"]
    sales_val = pd.read_csv("./data/sales_train_validation.csv", usecols=key_cols)
    sub_val = df_y_val.merge(sales_val[key_cols], on = "id")
    sub_val["_all_"] = "Total"

    levels = ["id", "item_id", "dept_id", "cat_id", "store_id", "state_id", "_all_"]
    couples = [("state_id", "item_id"),  ("state_id", "dept_id"),("store_id","dept_id"),
                                ("state_id", "cat_id"),("store_id","cat_id")]
    frames = []
    for level in levels :
        frames.append(get_group_preds_val(sub_val, level))
    for level1,level2 in couples:
        frames.append(get_couple_group_preds_val(sub_val, level1, level2))
    df_val = pd.concat(frames, axis=0, sort=False, ignore_index=True)

    # The evaluation half reuses the validation quantiles; only the id suffix differs.
    df = pd.concat([df_val, df_val] , axis=0, sort=False, ignore_index=True)
    second_half = df.index >= len(df.index)//2
    df.loc[second_half, "id"] = df.loc[second_half, "id"].str.replace(
                                            "_validation", "_evaluation", regex=False)
    df =reduce_mem(df)
    df.to_csv(name + ".csv.gz", index = False, compression='gzip')
    
# Alternative input (gzip-compressed submission), kept for reference:
# ensembel_result = pd.read_csv('submission_accuray3.csv.gz', compression='gzip')
# Load the point forecasts and write the quantile submission to 'residual_v4.csv.gz'.
ensembel_result = pd.read_csv('submission_test_6.csv')
naive_forecast(ensembel_result, 'residual_v4')

-- Mem. usage decreased to 47.07 Mb (72.4% reduction),time spend:0.01 min


In [2]:
import pandas as pd

In [3]:
# Load a second uncertainty submission ("magic") to compare against the residual-based one.
magic = pd.read_csv('submission_uncertainty_test_1.csv')

In [4]:
# Reload the file written by naive_forecast(..., 'residual_v4') above.
residual = pd.read_csv('residual_v4.csv.gz', compression='gzip')

In [5]:
# Per-column summary statistics of the "magic" submission.
magic.describe()

Unnamed: 0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
count,771120.0,771120.0,771120.0,771120.0,771120.0,771120.0,771120.0,771120.0,771120.0,771120.0,...,771120.0,771120.0,771120.0,771120.0,771120.0,771120.0,771120.0,771120.0,771120.0,771120.0
mean,10.388919,9.764787,9.669005,9.732119,10.952746,12.939482,13.380063,11.664713,11.260943,10.435256,...,11.545662,13.889824,14.074576,10.867594,10.149304,10.010079,10.024647,11.120575,13.242224,13.181458
std,293.187101,274.922625,271.912069,273.642626,307.456355,363.549387,378.509086,330.310126,318.891769,294.707036,...,325.572943,393.174248,400.792566,307.544457,286.334131,281.996935,282.262683,312.779096,373.421705,373.7454
min,0.002141,0.002088,0.002323,0.002327,0.002041,0.002363,0.002593,0.002413,0.00294,0.002894,...,0.002678,0.002768,0.002777,0.002658,0.002644,0.00265,0.002652,0.00269,0.00278,0.00279
25%,0.247059,0.238041,0.237498,0.23926,0.263645,0.298752,0.306397,0.266278,0.259307,0.250433,...,0.276725,0.31593,0.316771,0.259246,0.249263,0.24887,0.249839,0.272927,0.311659,0.31
50%,0.739159,0.70411,0.700538,0.704034,0.774758,0.89779,0.930606,0.805077,0.778701,0.743674,...,0.818144,0.956846,0.962416,0.775156,0.736204,0.731325,0.73099,0.799477,0.933111,0.930351
75%,2.157067,2.028893,2.01347,2.02225,2.24778,2.650225,2.750374,2.389069,2.293535,2.15777,...,2.375042,2.838293,2.869253,2.258101,2.113337,2.095587,2.093323,2.299227,2.730994,2.727234
max,44839.92515,42146.088463,41732.681168,42005.091928,47273.473632,55848.48637,57750.090342,50346.417977,48603.695898,45039.922724,...,49832.576091,59950.284553,60747.698092,46905.949116,43805.716151,43204.802497,43267.681146,47997.845384,57155.158523,56892.884487


In [6]:
# Per-column summary statistics of the residual-based submission, for comparison with `magic`.
residual.describe()

Unnamed: 0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
count,771120.0,771120.0,771120.0,771120.0,771120.0,771120.0,771120.0,771120.0,771120.0,771120.0,...,771120.0,771120.0,771120.0,771120.0,771120.0,771120.0,771120.0,771120.0,771120.0,771120.0
mean,10.388973,9.764638,9.669049,9.732193,10.952753,12.939699,13.380007,11.66465,11.260606,10.43543,...,11.545461,13.889474,14.074604,10.867595,10.149068,10.010342,10.024856,11.120481,13.242182,13.181547
std,295.740397,278.170291,275.291215,276.958208,309.517058,363.869931,378.35569,331.590656,320.511725,297.207333,...,326.998054,392.604531,400.007477,309.580233,289.123816,284.990808,285.249303,314.660463,373.415458,373.728437
min,-2962.0,-3040.0,-3046.0,-3018.0,-2764.0,-2360.0,-2320.0,-2876.0,-2962.0,-3006.0,...,-2738.0,-2316.0,-2330.0,-2926.0,-3004.0,-3000.0,-2976.0,-2730.0,-2298.0,-2314.0
25%,0.344,0.3325,0.3318,0.334,0.3677,0.4175,0.4292,0.3713,0.3633,0.3496,...,0.3845,0.4412,0.4429,0.3623,0.349825,0.3484,0.3489,0.3801,0.434,0.4326
50%,0.8447,0.7993,0.7954,0.7983,0.879,1.0305,1.067,0.9233,0.8857,0.845,...,0.9277,1.098,1.109,0.8853,0.8335,0.8276,0.827,0.9033,1.069,1.067
75%,2.232,2.105,2.084,2.092,2.328,2.7425,2.854,2.488,2.39,2.238,...,2.455,2.945,2.986,2.334,2.186,2.168,2.166,2.385,2.826,2.832
max,44700.0,42500.0,42140.0,42370.0,46720.0,53820.0,55400.0,49280.0,47800.0,44860.0,...,48830.0,57200.0,57860.0,46430.0,43840.0,43360.0,43420.0,47330.0,54900.0,54700.0


In [9]:
# Last rows of the residual-based submission (0.995-quantile evaluation ids).
residual.tail()

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
771115,WI_2_HOBBIES_0.995_evaluation,632.5,629.5,632.5,638.5,669.0,688.0,680.0,676.0,660.5,...,664.5,686.5,649.0,634.5,632.5,639.0,640.5,670.0,691.0,648.5
771116,WI_2_HOUSEHOLD_0.995_evaluation,1489.0,1472.0,1479.0,1518.0,1873.0,2064.0,1902.0,1785.0,1777.0,...,1719.0,1783.0,1616.0,1485.0,1449.0,1462.0,1509.0,1697.0,1783.0,1600.0
771117,WI_3_FOODS_0.995_evaluation,2816.0,2740.0,2684.0,2666.0,3028.0,3490.0,3502.0,3636.0,3600.0,...,3414.0,4276.0,4228.0,3182.0,2940.0,2858.0,2850.0,3122.0,3548.0,3466.0
771118,WI_3_HOBBIES_0.995_evaluation,660.0,659.0,660.5,659.5,714.5,716.5,692.0,682.0,679.0,...,720.5,725.0,691.5,667.5,665.5,667.5,664.0,719.5,723.0,689.5
771119,WI_3_HOUSEHOLD_0.995_evaluation,1400.0,1363.0,1359.0,1373.0,1566.0,1700.0,1705.0,1565.0,1518.0,...,1556.0,1654.0,1603.0,1407.0,1368.0,1366.0,1381.0,1556.0,1673.0,1610.0


In [10]:
# Same last rows from the "magic" submission, to compare against `residual.tail()`.
magic.tail()

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
771115,WI_2_HOBBIES_0.995_evaluation,347.35015,343.215411,347.357233,355.994595,398.796017,425.643616,414.052807,408.84124,386.785883,...,392.294205,423.016875,370.731824,350.728692,347.556301,356.532661,359.158074,400.35586,429.528575,369.788562
771116,WI_2_HOUSEHOLD_0.995_evaluation,1290.435747,1265.958489,1276.499896,1330.487176,1828.512415,2097.679474,1870.305639,1704.970506,1694.437933,...,1613.240272,1702.239785,1469.147648,1284.232994,1234.119604,1251.812847,1317.91178,1581.788888,1702.247382,1446.062462
771117,WI_3_FOODS_0.995_evaluation,3040.280696,2933.356942,2855.910919,2829.320145,3338.353134,3985.615442,4001.85357,4190.561263,4139.219982,...,3878.265904,5089.543481,5020.90014,3552.482609,3214.778342,3099.531202,3087.041421,3470.262982,4067.626998,3953.002196
771118,WI_3_HOBBIES_0.995_evaluation,330.102225,328.181803,330.520136,329.334149,406.043054,409.102471,374.804186,360.451304,356.500412,...,414.460971,421.235077,374.024096,340.293307,337.659519,340.40383,335.235812,413.454356,418.273549,371.482744
771119,WI_3_HOUSEHOLD_0.995_evaluation,925.940682,874.904406,869.281622,888.181946,1159.868735,1348.078306,1354.025061,1157.787678,1091.811318,...,1144.966774,1283.268479,1211.256657,936.814332,882.170192,879.419636,899.470809,1145.254856,1309.172476,1221.784996
