In [19]:
import pandas as pd
import numpy as np

def compute_rsi(series, period=14):
    delta = series.diff()

    gain = np.where(delta > 0, delta, 0)
    loss = np.where(delta < 0, -delta, 0)

    # Exponential Moving Average (lebih halus daripada SMA)
    roll_up = pd.Series(gain).ewm(span=period, adjust=False).mean()
    roll_down = pd.Series(loss).ewm(span=period, adjust=False).mean()

    rs = roll_up / roll_down
    rsi = 100 - (100 / (1 + rs))

    return rsi

def compute_tech_var(df, target_cols, lags=[1,3], mas=[3,6], rsi_period=14):
    df = df.copy()

    for col in target_cols:
        # Lag features
        for lag in lags:
            df[f"{col}_lag{lag}"] = df[col].shift(lag)

        # Moving averages
        for ma in mas:
            df[f"{col}_ma{ma}"] = df[col].rolling(window=ma).mean()

        # MoM Growth
        df[f"{col}_mom_growth"] = df[col].pct_change()

        # Rolling Volatility
        for ma in mas:
            df[f"{col}_vol{ma}"] = df[col].rolling(window=ma).std()

        # Market Share
        df[f"{col}_share"] = df[col] / df[target_cols].sum(axis=1)

        # RSI
        df[f"{col}_rsi{rsi_period}"] = compute_rsi(df[col], period=rsi_period)

    return df


In [20]:
%ls

merged_data.csv  [0m[01;34msample_data[0m/


In [21]:
data_link = {
    "kurs": "https://github.com/zzahranez/otomotive-forecasting/blob/main/data/kurs.csv",
    "inflasi": "https://github.com/zzahranez/otomotive-forecasting/blob/main/data/inflasi.csv",
    "bi_rate": "https://github.com/zzahranez/otomotive-forecasting/blob/main/data/bi_rate_data.csv",
    "data_eksternal": "https://github.com/zzahranez/otomotive-forecasting/blob/main/data/data_eksternal_lengkap.csv",
    "dataset_utama": "https://github.com/zzahranez/otomotive-forecasting/blob/main/data/dataCarSale2021-2025.csv",
}

In [22]:
import numpy as np

seed = 404
np.random.seed(seed)
"Done"

'Done'

In [23]:
dataframes = {}
for name, link in data_link.items():
    # Correct the raw link for direct download
    raw_link = link.replace("github.com", "raw.githubusercontent.com").replace("/blob/", "/")
    try:
        dataframes[name] = pd.read_csv(raw_link)
        print(f"Successfully loaded '{name}' data.")
    except Exception as e:
        print(f"Error loading '{name}' data: {e}")

# Display the first few rows of each loaded dataframe
for name, df in dataframes.items():
    print(f"\n--- {name} Data ---")
    display(df.tail(20))

Successfully loaded 'kurs' data.
Successfully loaded 'inflasi' data.
Successfully loaded 'bi_rate' data.
Successfully loaded 'data_eksternal' data.
Successfully loaded 'dataset_utama' data.

--- kurs Data ---


Unnamed: 0,jpy,usd,tanggal
3792,10464,9412,2010-01-29
3793,10469,9455,2010-01-28
3794,10551,9427,2010-01-27
3795,10363,9362,2010-01-26
3796,10411,9387,2010-01-25
3797,10482,9435,2010-01-22
3798,10238,9366,2010-01-21
3799,10227,9321,2010-01-20
3800,10232,9271,2010-01-19
3801,10224,9276,2010-01-18



--- inflasi Data ---


Unnamed: 0,date,inflasi
167,2011-08-01,4.79
168,2011-07-01,4.61
169,2011-06-01,5.54
170,2011-05-01,5.98
171,2011-04-01,6.16
172,2011-03-01,6.65
173,2011-02-01,6.84
174,2011-01-01,7.02
175,2010-12-01,6.96
176,2010-11-01,6.33



--- bi_rate Data ---


Unnamed: 0,Tanggal,BI Rate
172,01/5/2024,6.25
173,01/6/2024,6.25
174,01/7/2024,6.25
175,01/8/2024,6.25
176,01/9/2024,6
177,01/10/2024,6
178,01/11/2024,6
179,01/12/2024,6
180,01/1/2025,5.75
181,01/2/2025,5.75



--- data_eksternal Data ---


Unnamed: 0,waktu,DAIHATSU,HONDA,MITSUBISHI,SUZUKI,TOYOTA
112,2019-05-01,14178.0,11048.0,11761.0,8775.0,29103.0
113,2019-06-01,6020.0,7563.0,11968.0,7236.0,18541.0
114,2019-07-01,13013.0,13894.0,14600.0,8172.0,29362.0
115,2019-08-01,15415.0,11909.0,14240.0,8622.0,28929.0
116,2019-09-01,17537.0,12431.0,12712.0,8118.0,31831.0
117,2019-10-01,17530.0,14503.0,13983.0,8721.0,30944.0
118,2019-11-01,15911.0,13210.0,11974.0,10077.0,28970.0
119,2019-12-01,10855.0,12305.0,13327.0,10116.0,27453.0
120,2020-01-01,14173.0,12777.0,12295.0,10512.0,24119.0
121,2020-02-01,15778.0,11373.0,10903.0,8613.0,25053.0



--- dataset_utama Data ---


Unnamed: 0,waktu,DAIHATSU,HONDA,MITSUBISHI,SUZUKI,TOYOTA
34,2023-11-01,7466.0,10307.0,6600.0,6602.0,24622.0
35,2024-01-01,7348.196,3222.119,5076.613,3242.0,10366.353
36,2024-02-01,6021.329,3829.318,4349.666,2999.0,10770.845
37,2024-03-01,6591.92,5212.549,4708.298,3503.0,12301.583
38,2024-04-01,7510.349,3435.0,4898.0,2555.0,9116.144
39,2024-05-01,9336.778,5055.0,5522.211,2736.0,9445.956
40,2024-06-01,7717.478,2875.069,4931.16,2198.0,10798.472
41,2024-07-01,7908.271,3703.054,5335.011,2557.0,13457.905
42,2024-08-01,7984.985,3574.084,6498.0,2582.0,11031.562
43,2024-09-01,7941.067,4418.0,6685.0,2677.0,9911.21


In [24]:
import pandas as pd

# --- Load data kurs ---
# Pastikan sudah terisi dari dataframes['kurs']
df_kurs = dataframes['kurs'].copy()

# Bersihkan kolom jpy (ada tanda koma)
df_kurs["jpy"] = df_kurs["jpy"].astype(str).str.replace(",", ".").astype(float)
df_kurs["usd"] = df_kurs["usd"].astype(float)
df_kurs["tanggal"] = pd.to_datetime(df_kurs["tanggal"])

# Agregasi kurs per bulan (ambil rata-rata)
kurs_monthly = (
    df_kurs.groupby(df_kurs["tanggal"].dt.to_period("M"))
    [["jpy", "usd"]]
    .mean()
    .reset_index()
)
kurs_monthly["tanggal"] = kurs_monthly["tanggal"].dt.to_timestamp()

# --- Load data inflasi ---
# Pastikan sudah terisi dari dataframes['inflasi']
df_inflasi = dataframes['inflasi'].copy()
df_inflasi["date"] = pd.to_datetime(df_inflasi["date"])

# Data inflasi sudah per bulan → hanya perlu diurutkan ascending
inflasi_monthly = df_inflasi.sort_values("date").reset_index(drop=True)

# --- Cek hasil ---
print("\n=== Kurs Monthly (aggregated) ===")
print(kurs_monthly.head())

print("\n=== Inflasi Monthly (sorted ascending) ===")
print(inflasi_monthly.head())



=== Kurs Monthly (aggregated) ===
     tanggal           jpy          usd
0 2010-01-01  10220.055000  9321.950000
1 2010-02-01  10405.052632  9395.105263
2 2010-03-01  10187.445455  9219.681818
3 2010-04-01   9711.700000  9072.333333
4 2010-05-01  10020.826316  9229.157895

=== Inflasi Monthly (sorted ascending) ===
        date  inflasi
0 2010-01-01     3.72
1 2010-02-01     3.81
2 2010-03-01     3.43
3 2010-04-01     3.91
4 2010-05-01     4.16


In [25]:
dataframes['kurs'] = kurs_monthly.copy()
dataframes['inflasi'] = inflasi_monthly.copy()

In [26]:
for key in dataframes:
  display(dataframes[key].head(10))

Unnamed: 0,tanggal,jpy,usd
0,2010-01-01,10220.055,9321.95
1,2010-02-01,10405.052632,9395.105263
2,2010-03-01,10187.445455,9219.681818
3,2010-04-01,9711.7,9072.333333
4,2010-05-01,10020.826316,9229.157895
5,2010-06-01,10111.295455,9194.0
6,2010-07-01,10376.954545,9094.454545
7,2010-08-01,10553.047619,9016.761905
8,2010-09-01,10688.473684,9020.842105
9,2010-10-01,10958.714286,8972.904762


Unnamed: 0,date,inflasi
0,2010-01-01,3.72
1,2010-02-01,3.81
2,2010-03-01,3.43
3,2010-04-01,3.91
4,2010-05-01,4.16
5,2010-06-01,5.05
6,2010-07-01,6.22
7,2010-08-01,6.44
8,2010-09-01,5.8
9,2010-10-01,5.67


Unnamed: 0,Tanggal,BI Rate
0,01/1/2010,6.5
1,01/2/2010,6.5
2,01/3/2010,6.5
3,01/4/2010,6.5
4,01/5/2010,6.5
5,01/6/2010,6.5
6,01/7/2010,6.5
7,01/8/2010,6.5
8,01/9/2010,6.5
9,01/10/2010,6.5


Unnamed: 0,waktu,DAIHATSU,HONDA,MITSUBISHI,SUZUKI,TOYOTA
0,2010-01-01,8302.0,3755.0,6855.0,4815.0,20798.0
1,2010-02-01,7518.0,4431.0,8246.0,4638.0,21753.0
2,2010-03-01,8084.0,5179.0,9068.0,6203.0,26222.0
3,2010-04-01,9298.0,5154.0,9560.0,6013.0,24381.0
4,2010-05-01,9485.0,4439.0,9544.0,6043.0,21024.0
5,2010-06-01,11886.0,5965.0,8875.0,6704.0,26006.0
6,2010-07-01,11524.0,6006.0,9586.0,5623.0,27737.0
7,2010-08-01,9741.0,6025.0,9310.0,6063.0,22638.0
8,2010-09-01,6652.0,3504.0,7299.0,4924.0,18413.0
9,2010-10-01,10708.0,5970.0,9407.0,7015.0,25322.0


Unnamed: 0,waktu,DAIHATSU,HONDA,MITSUBISHI,SUZUKI,TOYOTA
0,2021-01-01,8993.0,7231.0,9108.0,6400.0,15474.0
1,2021-02-01,9412.0,6812.0,7567.0,4600.0,14645.0
2,2021-03-01,16770.0,11350.0,13088.0,8669.0,26034.0
3,2021-04-01,15861.0,8474.0,13820.0,8100.0,22618.0
4,2021-05-01,8310.0,5832.0,10160.0,5341.0,17361.0
5,2021-06-01,15481.0,8782.0,8277.0,8965.0,22310.0
6,2021-07-01,9284.0,9030.0,13453.0,6274.0,20950.0
7,2021-08-01,14408.0,7337.0,13720.0,10021.0,28418.0
8,2021-09-01,17529.0,3453.0,12200.0,7515.0,32596.0
9,2021-10-01,17020.0,5861.0,13109.0,7624.0,20633.0


In [27]:
# Select the two dataframes to combine
df_eksternal = dataframes['data_eksternal'].copy()
df_utama = dataframes['dataset_utama'].copy()

# Standardize date column names and convert to datetime
df_eksternal = df_eksternal.rename(columns={'waktu': 'date'})
df_eksternal['date'] = pd.to_datetime(df_eksternal['date'])

df_utama = df_utama.rename(columns={'waktu': 'date'})
df_utama['date'] = pd.to_datetime(df_utama['date'])

# Concatenate the dataframes, placing df_utama below df_eksternal
combined_df = pd.concat([df_eksternal, df_utama], ignore_index=True)

# Sort by date
combined_df = combined_df.sort_values('date').reset_index(drop=True)

print("Combined DataFrame (data_eksternal on top, dataset_utama below):")
display(combined_df.head())
display(combined_df.tail())

Combined DataFrame (data_eksternal on top, dataset_utama below):


Unnamed: 0,date,DAIHATSU,HONDA,MITSUBISHI,SUZUKI,TOYOTA
0,2010-01-01,8302.0,3755.0,6855.0,4815.0,20798.0
1,2010-02-01,7518.0,4431.0,8246.0,4638.0,21753.0
2,2010-03-01,8084.0,5179.0,9068.0,6203.0,26222.0
3,2010-04-01,9298.0,5154.0,9560.0,6013.0,24381.0
4,2010-05-01,9485.0,4439.0,9544.0,6043.0,21024.0


Unnamed: 0,date,DAIHATSU,HONDA,MITSUBISHI,SUZUKI,TOYOTA
181,2025-03-01,6371.692,2382.924,7380.0,3223.22,13711.773
182,2025-04-01,6075.811,1590.411,5197.0,2767.379,13779.3
183,2025-05-01,7537.632,3148.0,5897.0,2004.813,9313.266
184,2025-06-01,5992.367,2360.797,6501.0,3499.336,8712.383
185,2025-07-01,6354.101,4171.007,6121.0,2666.235,7617.122


In [28]:
# Load the original bi_rate dataframe
bi_rate_df = dataframes['bi_rate'].copy()

# Convert 'Tanggal' to datetime, specifying the original format
bi_rate_df['Tanggal'] = pd.to_datetime(bi_rate_df['Tanggal'], format='%d/%m/%Y', errors='coerce')

# Set the date to the first day of the month and format as YYYY-MM-DD
bi_rate_df['Tanggal'] = bi_rate_df['Tanggal'].dt.to_period('M').dt.to_timestamp('D') + pd.offsets.MonthBegin(0)
bi_rate_df['Tanggal'] = bi_rate_df['Tanggal'].dt.strftime('%Y-%m-%d')

# Handle potential non-numeric 'BI Rate' values by coercing to numeric
bi_rate_df['BI Rate'] = pd.to_numeric(bi_rate_df['BI Rate'], errors='coerce')

# Update the dataframe in the dataframes dictionary
dataframes['bi_rate'] = bi_rate_df

print("Processed dataframes['bi_rate'] with 'Tanggal' in YYYY-MM-DD format:")
display(dataframes['bi_rate'].head())

Processed dataframes['bi_rate'] with 'Tanggal' in YYYY-MM-DD format:


Unnamed: 0,Tanggal,BI Rate
0,2010-01-01,6.5
1,2010-02-01,6.5
2,2010-03-01,6.5
3,2010-04-01,6.5
4,2010-05-01,6.5


In [29]:
kurs = dataframes['kurs'].copy()
inflasi = dataframes['inflasi'].copy()
bi_rate = dataframes['bi_rate'].copy()
data = combined_df.copy()

In [30]:
def combineDf(dfs):
  return pd.concat(dfs, ignore_index=True)

In [31]:
dfs = [kurs, inflasi, bi_rate, data]

In [32]:
# Standardize date columns and names before merging

# kurs
kurs_processed = dfs[0].copy()
kurs_processed = kurs_processed.rename(columns={'tanggal': 'date'})
# Ensure date is datetime and set to first of month if needed (assuming it's already monthly from previous steps)
kurs_processed['date'] = pd.to_datetime(kurs_processed['date'])


# inflasi
inflasi_processed = dfs[1].copy()
# Ensure date is datetime and set to first of month if needed (assuming it's already monthly from previous steps)
inflasi_processed['date'] = pd.to_datetime(inflasi_processed['date'])


# bi_rate
bi_rate_processed = dfs[2].copy()
bi_rate_processed = bi_rate_processed.rename(columns={'Tanggal': 'date'})
# Ensure date is datetime and set to first of month if needed (assuming it's already processed in 8xgidzZ0qaHj)
bi_rate_processed['date'] = pd.to_datetime(bi_rate_processed['date'])
# Handle potential duplicates after standardizing date
bi_rate_processed['BI Rate'] = pd.to_numeric(bi_rate_processed['BI Rate'], errors='coerce')
bi_rate_processed = bi_rate_processed.groupby('date')['BI Rate'].mean().reset_index()


# data (combined_df)
data_processed = dfs[3].copy()
# Ensure date is datetime and set to first of month if needed (assuming it's already processed in h7AI2EbBM80d)
data_processed['date'] = pd.to_datetime(data_processed['date'])
# Handle potential duplicates after standardizing date
# Assuming all columns except date should be averaged if dates are duplicated
numeric_cols = data_processed.select_dtypes(include=np.number).columns.tolist()
data_processed = data_processed.groupby('date')[numeric_cols].mean().reset_index()


# Merge all dataframes
merged_all_df = kurs_processed
merged_all_df = pd.merge(merged_all_df, inflasi_processed, on='date', how='outer')
merged_all_df = pd.merge(merged_all_df, bi_rate_processed, on='date', how='outer')
merged_all_df = pd.merge(merged_all_df, data_processed, on='date', how='outer')

# Sort by date
merged_all_df = merged_all_df.sort_values('date').reset_index(drop=True)

print("Merged DataFrame (all dataframes from dfs list):")
display(merged_all_df.head(10))
display(merged_all_df.tail(10))

Merged DataFrame (all dataframes from dfs list):


Unnamed: 0,date,jpy,usd,inflasi,BI Rate,DAIHATSU,HONDA,MITSUBISHI,SUZUKI,TOYOTA
0,2010-01-01,10220.055,9321.95,3.72,6.5,8302.0,3755.0,6855.0,4815.0,20798.0
1,2010-02-01,10405.052632,9395.105263,3.81,6.5,7518.0,4431.0,8246.0,4638.0,21753.0
2,2010-03-01,10187.445455,9219.681818,3.43,6.5,8084.0,5179.0,9068.0,6203.0,26222.0
3,2010-04-01,9711.7,9072.333333,3.91,6.5,9298.0,5154.0,9560.0,6013.0,24381.0
4,2010-05-01,10020.826316,9229.157895,4.16,6.5,9485.0,4439.0,9544.0,6043.0,21024.0
5,2010-06-01,10111.295455,9194.0,5.05,6.5,11886.0,5965.0,8875.0,6704.0,26006.0
6,2010-07-01,10376.954545,9094.454545,6.22,6.5,11524.0,6006.0,9586.0,5623.0,27737.0
7,2010-08-01,10553.047619,9016.761905,6.44,6.5,9741.0,6025.0,9310.0,6063.0,22638.0
8,2010-09-01,10688.473684,9020.842105,5.8,6.5,6652.0,3504.0,7299.0,4924.0,18413.0
9,2010-10-01,10958.714286,8972.904762,5.67,6.5,10708.0,5970.0,9407.0,7015.0,25322.0


Unnamed: 0,date,jpy,usd,inflasi,BI Rate,DAIHATSU,HONDA,MITSUBISHI,SUZUKI,TOYOTA
182,2025-03-01,11097.894737,16538.526316,1.03,5.75,6371.692,2382.924,7380.0,3223.22,13711.773
183,2025-04-01,11771.125,16904.0,1.95,5.75,6075.811,1590.411,5197.0,2767.379,13779.3
184,2025-05-01,11455.470588,16522.823529,1.6,5.5,7537.632,3148.0,5897.0,2004.813,9313.266
185,2025-06-01,11343.555556,16391.944444,1.87,5.5,5992.367,2360.797,6501.0,3499.336,8712.383
186,2025-07-01,11154.869565,16357.782609,2.37,5.25,6354.101,4171.007,6121.0,2666.235,7617.122
187,2025-08-01,,,,,,,,,
188,2025-09-01,,,,,,,,,
189,2025-10-01,,,,,,,,,
190,2025-11-01,,,,,,,,,
191,2025-12-01,,,,,,,,,


In [33]:
# Save the merged dataframe to a CSV file
merged_all_df.to_csv('merged_data.csv', index=False)

print("merged_all_df saved to /tmp/merged_data.csv")

merged_all_df saved to /tmp/merged_data.csv


In [34]:
processed_df = compute_tech_var(merged_all_df, ["TOYOTA", "DAIHATSU","MITSUBISHI","HONDA","SUZUKI"])

  df[f"{col}_mom_growth"] = df[col].pct_change()
  df[f"{col}_mom_growth"] = df[col].pct_change()
  df[f"{col}_mom_growth"] = df[col].pct_change()
  df[f"{col}_mom_growth"] = df[col].pct_change()
  df[f"{col}_mom_growth"] = df[col].pct_change()


In [35]:
import pandas as pd

# Set pandas options to display all columns
pd.set_option('display.max_columns', None)

# Display the head of the processed_df
display(processed_df.head())
display(processed_df.head())

Unnamed: 0,date,jpy,usd,inflasi,BI Rate,DAIHATSU,HONDA,MITSUBISHI,SUZUKI,TOYOTA,TOYOTA_lag1,TOYOTA_lag3,TOYOTA_ma3,TOYOTA_ma6,TOYOTA_mom_growth,TOYOTA_vol3,TOYOTA_vol6,TOYOTA_share,TOYOTA_rsi14,DAIHATSU_lag1,DAIHATSU_lag3,DAIHATSU_ma3,DAIHATSU_ma6,DAIHATSU_mom_growth,DAIHATSU_vol3,DAIHATSU_vol6,DAIHATSU_share,DAIHATSU_rsi14,MITSUBISHI_lag1,MITSUBISHI_lag3,MITSUBISHI_ma3,MITSUBISHI_ma6,MITSUBISHI_mom_growth,MITSUBISHI_vol3,MITSUBISHI_vol6,MITSUBISHI_share,MITSUBISHI_rsi14,HONDA_lag1,HONDA_lag3,HONDA_ma3,HONDA_ma6,HONDA_mom_growth,HONDA_vol3,HONDA_vol6,HONDA_share,HONDA_rsi14,SUZUKI_lag1,SUZUKI_lag3,SUZUKI_ma3,SUZUKI_ma6,SUZUKI_mom_growth,SUZUKI_vol3,SUZUKI_vol6,SUZUKI_share,SUZUKI_rsi14
0,2010-01-01,10220.055,9321.95,3.72,6.5,8302.0,3755.0,6855.0,4815.0,20798.0,,,,,,,,0.467108,,,,,,,,,0.186457,,,,,,,,,0.153958,,,,,,,,,0.084335,,,,,,,,,0.108141,
1,2010-02-01,10405.052632,9395.105263,3.81,6.5,7518.0,4431.0,8246.0,4638.0,21753.0,20798.0,,,,0.045918,,,0.466943,100.0,8302.0,,,,-0.094435,,,0.161379,0.0,6855.0,,,,0.202918,,,0.177006,100.0,3755.0,,,,0.180027,,,0.095114,100.0,4815.0,,,,-0.03676,,,0.099558,0.0
2,2010-03-01,10187.445455,9219.681818,3.43,6.5,8084.0,5179.0,9068.0,6203.0,26222.0,21753.0,,22924.333333,,0.205443,2895.506922,,0.478888,100.0,7518.0,,7968.0,,0.075286,404.667765,,0.147637,45.444813,8246.0,,8056.333333,,0.099685,1118.625198,,0.165607,100.0,4431.0,,4455.0,,0.168811,712.303306,,0.094583,100.0,4638.0,,5218.666667,,0.33743,857.039283,,0.113284,91.073091
3,2010-04-01,9711.7,9072.333333,3.91,6.5,9298.0,5154.0,9560.0,6013.0,24381.0,26222.0,20798.0,24118.666667,,-0.070208,2246.019665,,0.448131,71.375015,8084.0,8302.0,8300.0,,0.150173,909.445985,,0.1709,74.323277,9068.0,6855.0,8958.0,,0.054257,663.870469,,0.175716,100.0,5179.0,3755.0,4921.333333,,-0.004827,424.825062,,0.094732,97.883182,6203.0,4815.0,5618.0,,-0.03063,854.005269,,0.110521,80.768742
4,2010-05-01,10020.826316,9229.157895,4.16,6.5,9485.0,4439.0,9544.0,6043.0,21024.0,24381.0,21753.0,23875.666667,,-0.137689,2635.587664,,0.416028,44.546204,9298.0,7518.0,8955.666667,,0.020112,760.653885,,0.187692,76.531273,9560.0,8246.0,9390.666667,,-0.001674,279.552023,,0.188859,99.185876,5154.0,4431.0,4924.0,,-0.138727,420.208282,,0.08784,57.627494,6013.0,4638.0,6086.333333,,0.004989,102.14369,,0.11958,81.157154


Unnamed: 0,date,jpy,usd,inflasi,BI Rate,DAIHATSU,HONDA,MITSUBISHI,SUZUKI,TOYOTA,TOYOTA_lag1,TOYOTA_lag3,TOYOTA_ma3,TOYOTA_ma6,TOYOTA_mom_growth,TOYOTA_vol3,TOYOTA_vol6,TOYOTA_share,TOYOTA_rsi14,DAIHATSU_lag1,DAIHATSU_lag3,DAIHATSU_ma3,DAIHATSU_ma6,DAIHATSU_mom_growth,DAIHATSU_vol3,DAIHATSU_vol6,DAIHATSU_share,DAIHATSU_rsi14,MITSUBISHI_lag1,MITSUBISHI_lag3,MITSUBISHI_ma3,MITSUBISHI_ma6,MITSUBISHI_mom_growth,MITSUBISHI_vol3,MITSUBISHI_vol6,MITSUBISHI_share,MITSUBISHI_rsi14,HONDA_lag1,HONDA_lag3,HONDA_ma3,HONDA_ma6,HONDA_mom_growth,HONDA_vol3,HONDA_vol6,HONDA_share,HONDA_rsi14,SUZUKI_lag1,SUZUKI_lag3,SUZUKI_ma3,SUZUKI_ma6,SUZUKI_mom_growth,SUZUKI_vol3,SUZUKI_vol6,SUZUKI_share,SUZUKI_rsi14
0,2010-01-01,10220.055,9321.95,3.72,6.5,8302.0,3755.0,6855.0,4815.0,20798.0,,,,,,,,0.467108,,,,,,,,,0.186457,,,,,,,,,0.153958,,,,,,,,,0.084335,,,,,,,,,0.108141,
1,2010-02-01,10405.052632,9395.105263,3.81,6.5,7518.0,4431.0,8246.0,4638.0,21753.0,20798.0,,,,0.045918,,,0.466943,100.0,8302.0,,,,-0.094435,,,0.161379,0.0,6855.0,,,,0.202918,,,0.177006,100.0,3755.0,,,,0.180027,,,0.095114,100.0,4815.0,,,,-0.03676,,,0.099558,0.0
2,2010-03-01,10187.445455,9219.681818,3.43,6.5,8084.0,5179.0,9068.0,6203.0,26222.0,21753.0,,22924.333333,,0.205443,2895.506922,,0.478888,100.0,7518.0,,7968.0,,0.075286,404.667765,,0.147637,45.444813,8246.0,,8056.333333,,0.099685,1118.625198,,0.165607,100.0,4431.0,,4455.0,,0.168811,712.303306,,0.094583,100.0,4638.0,,5218.666667,,0.33743,857.039283,,0.113284,91.073091
3,2010-04-01,9711.7,9072.333333,3.91,6.5,9298.0,5154.0,9560.0,6013.0,24381.0,26222.0,20798.0,24118.666667,,-0.070208,2246.019665,,0.448131,71.375015,8084.0,8302.0,8300.0,,0.150173,909.445985,,0.1709,74.323277,9068.0,6855.0,8958.0,,0.054257,663.870469,,0.175716,100.0,5179.0,3755.0,4921.333333,,-0.004827,424.825062,,0.094732,97.883182,6203.0,4815.0,5618.0,,-0.03063,854.005269,,0.110521,80.768742
4,2010-05-01,10020.826316,9229.157895,4.16,6.5,9485.0,4439.0,9544.0,6043.0,21024.0,24381.0,21753.0,23875.666667,,-0.137689,2635.587664,,0.416028,44.546204,9298.0,7518.0,8955.666667,,0.020112,760.653885,,0.187692,76.531273,9560.0,8246.0,9390.666667,,-0.001674,279.552023,,0.188859,99.185876,5154.0,4431.0,4924.0,,-0.138727,420.208282,,0.08784,57.627494,6013.0,4638.0,6086.333333,,0.004989,102.14369,,0.11958,81.157154


In [37]:
!pip install u8darts

Collecting u8darts
  Downloading u8darts-0.37.1-py3-none-any.whl.metadata (61 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.8/61.8 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Collecting nfoursid>=1.0.0 (from u8darts)
  Downloading nfoursid-1.0.2-py3-none-any.whl.metadata (1.9 kB)
Collecting pyod>=0.9.5 (from u8darts)
  Downloading pyod-2.0.5-py3-none-any.whl.metadata (46 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/46.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.3/46.3 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Collecting statsforecast>=1.4 (from u8darts)
  Downloading statsforecast-2.0.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (29 kB)
Collecting coreforecast>=0.0.12 (from statsforecast>=1.4->u8darts)
  Downloading coreforeca

In [38]:
!pip install pytorch_lightning

Collecting pytorch_lightning
  Downloading pytorch_lightning-2.5.4-py3-none-any.whl.metadata (20 kB)
Collecting torchmetrics>0.7.0 (from pytorch_lightning)
  Downloading torchmetrics-1.8.2-py3-none-any.whl.metadata (22 kB)
Collecting lightning-utilities>=0.10.0 (from pytorch_lightning)
  Downloading lightning_utilities-0.15.2-py3-none-any.whl.metadata (5.7 kB)
Downloading pytorch_lightning-2.5.4-py3-none-any.whl (829 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m829.2/829.2 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lightning_utilities-0.15.2-py3-none-any.whl (29 kB)
Downloading torchmetrics-1.8.2-py3-none-any.whl (983 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m983.2/983.2 kB[0m [31m34.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lightning-utilities, torchmetrics, pytorch_lightning
Successfully installed lightning-utilities-0.15.2 pytorch_lightning-2.5.4 torchmetrics-1.8.2


In [39]:
import pandas as pd
import numpy as np
from darts import TimeSeries
from darts.dataprocessing.transformers import Scaler
from darts.models import LightGBMModel
from darts.metrics import mape, smape


In [56]:
# === 1. Load data
df = processed_df.copy()
# df = df.set_index("date")
df = df.dropna()

In [90]:
# ==== 0) Prep data bersih (sesuai patch-mu sebelumnya) ====
dfc = df.copy().sort_values('date').set_index('date')
dfc = dfc.apply(pd.to_numeric, errors='coerce').replace([np.inf, -np.inf], np.nan)
dfc = dfc.interpolate(method='time', limit_direction='both').ffill().bfill()


In [91]:
# deteksi kolom yang punya +/-inf (opsional: print untuk debug)
has_inf = np.isinf(dfc.to_numpy())
if has_inf.any():
    bad_cols = dfc.columns[np.where(has_inf.any(axis=0))[0]].tolist()
    print("Kolom mengandung inf/-inf:", bad_cols)


In [59]:
# ganti +/-inf jadi NaN
dfc = dfc.replace([np.inf, -np.inf], np.nan)

# isi NaN: time interpolation → lalu forward/backward fill untuk ujung-ujung
dfc = dfc.interpolate(method='time', limit_direction='both')
dfc = dfc.ffill().bfill()



In [93]:
# ==== 1) Build TimeSeries ====
ts_y  = TimeSeries.from_dataframe(dfc, value_cols=target_cols, freq="MS")
ts_x  = TimeSeries.from_dataframe(dfc, value_cols=cov_cols,freq="MS") if cov_cols else None



In [95]:
# opsi: drop kolom yang masih terlalu banyak NaN (harusnya sudah terisi, tapi jaga-jaga)
na_ratio = dfc.isna().mean()
if (na_ratio > 0).any():
    drop_cols = na_ratio[na_ratio > 0].index.tolist()
    print("Masih ada NaN, akan diisi 0 sementara pada:", drop_cols)
    dfc[drop_cols] = dfc[drop_cols].fillna(0.0)

# validasi final: semua finite
assert np.isfinite(dfc.to_numpy()).all(), "Masih ada nilai non-finite setelah cleaning"


In [94]:
from darts import TimeSeries
from darts.dataprocessing.transformers import Scaler


In [96]:
# ==== 2) Scale ====
scaler_y = Scaler()
ts_y_s = scaler_y.fit_transform(ts_y)

if ts_x is not None:
    scaler_x = Scaler()
    ts_x_s = scaler_x.fit_transform(ts_x)
else:
    ts_x_s = None


In [66]:
from darts.models import LightGBMModel
from darts.metrics import mape


In [97]:
# ==== 3) Split ====
train_y, val_y = ts_y_s.split_before(0.8)
if ts_x_s is not None:
    train_x, val_x = ts_x_s.split_before(0.8)


In [98]:
# ==== 4) Drop near-zero variance SETELAH lag window dianggap (approx) ====
# Kita cek variansi pada window train mentah terlebih dulu untuk berjaga-jaga
def ts_to_df(ts):
    arr = ts.all_values(copy=False)
    if arr.ndim == 3 and arr.shape[2] == 1:
        arr = arr[:, :, 0]
    elif arr.ndim == 3 and arr.shape[2] > 1:
        arr = arr.mean(axis=2)
    elif arr.ndim == 1:
        arr = arr.reshape(-1, 1)
    return pd.DataFrame(arr, index=ts.time_index, columns=list(ts.components))

if ts_x_s is not None:
    x_train_df = ts_to_df(train_x)
    # near-zero variance threshold
    nzv = x_train_df.std(ddof=0) < 1e-8
    drop_cols = nzv[nzv].index.tolist()
    if drop_cols:
        keep_cols = [c for c in train_x.components if c not in drop_cols]
        train_x = train_x[keep_cols]
        val_x   = val_x[keep_cols]
        print(f"Dropped near-zero-variance covariates in TRAIN: {drop_cols}")



In [99]:
# ==== 5) Model: lags target + lags kovariat kontigu (kalau ada) ====
lags_y = 12
lags_x = list(range(-12, 0)) if ts_x_s is not None else None

model = LightGBMModel(
    lags=lags_y,
    lags_past_covariates=lags_x,
    output_chunk_length=1,
    learning_rate=0.05,
    num_leaves=31,          # sedikit lebih konservatif
    n_estimators=600,
    min_data_in_leaf=10,    # naikkan sedikit untuk stabilitas
    feature_fraction=1.0,
    bagging_fraction=1.0,
    bagging_freq=0,
    max_depth=-1,
    random_state=42,
    n_jobs=-1,
    multi_models=True,      # satu model per target → lebih stabil
)


In [101]:
model.fit(series=train_y, past_covariates=(train_x if ts_x_s is not None else None))


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000213 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4480
[LightGBM] [Info] Number of data points in the train set: 132, number of used features: 108
[LightGBM] [Info] Start training from score 0.612670
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000183 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4480
[LightGBM] [Info] Number of data points in the train set: 132, number of used features: 108
[LightGBM] [Info] Start training from score 0.687604
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000181 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4480
[LightGBM] [Info] Number of data points in the train set: 132, number of used features: 108
[LightGBM] [Info] Start traini

LightGBMModel(lags=12, lags_past_covariates=[-12, -11, -10, -9, -8, -7, -6, -5, -4, -3, -2, -1], lags_future_covariates=None, output_chunk_length=1, output_chunk_shift=0, add_encoders=None, likelihood=None, quantiles=None, random_state=42, multi_models=True, use_static_covariates=True, categorical_past_covariates=None, categorical_future_covariates=None, categorical_static_covariates=None, learning_rate=0.05, num_leaves=31, n_estimators=600, min_data_in_leaf=10, feature_fraction=1.0, bagging_fraction=1.0, bagging_freq=0, max_depth=-1, n_jobs=-1)

In [102]:
print(len(val_y))

38


In [104]:
# prediksi sepanjang validation saja
yhat_val = model.predict(
    n=len(val_y),
    past_covariates=ts_x_s  # gunakan full covariates, bukan val_x
)
























In [107]:
from darts.metrics import smape

yhat_inv = scaler_y.inverse_transform(yhat_val)
val_inv  = scaler_y.inverse_transform(val_y)

print("SMAPE (val):", smape(val_inv, yhat_inv))


MAPE (val): 51.788816523152036


In [108]:
model_noexo = LightGBMModel(
    lags=12,
    lags_past_covariates=None,
    output_chunk_length=1,
    learning_rate=0.05,
    num_leaves=31,
    n_estimators=600,
    min_data_in_leaf=10,
    random_state=42,
    n_jobs=-1,
    multi_models=True,
)
model_noexo.fit(series=train_y)  # tanpa past_covariates
yhat = model_noexo.predict(n=len(val_y)+16)  # tak perlu kovariat


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.042251 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2696
[LightGBM] [Info] Number of data points in the train set: 132, number of used features: 60
[LightGBM] [Info] Start training from score 0.612670
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000135 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2696
[LightGBM] [Info] Number of data points in the train set: 132, number of used features: 60
[LightGBM] [Info] Start training from score 0.687604
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000104 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2696
[LightGBM] [Info] Number of data points in the train set: 132, number of used features: 60
[LightGBM] [Info] Start training 







In [110]:
yhat_inv = scaler_y.inverse_transform(yhat)
val_inv  = scaler_y.inverse_transform(val_y)

print("SMAPE (val):", smape(val_inv, yhat_inv))


SMAPE (val): 48.0363471759309


In [115]:
from itertools import product
from darts.metrics import smape




In [116]:
from darts.metrics import smape

def _safe_start_index(series, lags_y, lags_x):
    """
    Pastikan titik start punya history minimal sesuai lags_y & lags_x.
    lags_x boleh None.
    """
    max_back = lags_y
    if lags_x is not None and len(lags_x) > 0:
        max_back = max(max_back, abs(min(lags_x)))  # contoh: [-12..-1] => 12
    # tambah 1 untuk jaga-jaga output_chunk_length=1
    offset = max_back + 1
    if len(series) <= offset:
        raise ValueError(f"Series terlalu pendek untuk lags={lags_y} & lags_x={lags_x}.")
    return series.time_index[offset]

def backtest_smape(model, series, past_covariates=None, start_frac=None,
                   lags_y=12, lags_x=None):
    """
    Rolling one-step-ahead with retrain=True.
    - Jika start_frac diberikan, pakai itu *namun* tetap dipaksa ≥ safe_start.
    - Kalau tidak, otomatis pakai start aman.
    """
    # tentukan start aman
    safe_start = _safe_start_index(series, lags_y, lags_x)
    if start_frac is not None:
        frac_idx = series.time_index[int(len(series) * start_frac)]
        # pilih yang lebih lambat (lebih ke kanan) supaya aman
        start_idx = max(safe_start, frac_idx)
    else:
        start_idx = safe_start

    preds = model.historical_forecasts(
        series=series,
        past_covariates=past_covariates,   # PASTIKAN full covariates (ts_x_s) diberikan di sini
        start=start_idx,
        forecast_horizon=1,
        stride=1,
        retrain=True,
        last_points_only=False,            # biar gampang dihitung smape
        verbose=False,
    )
    # Preds bisa berupa multi-variate TimeSeries
    return float(smape(series.slice_intersect(preds), preds))


In [117]:
# pilih apakah pakai exogenous covariates
USE_COVS = ts_x_s is not None
lags_x_grids = [None] if not USE_COVS else [list(range(-6,0)), list(range(-12,0))]

param_grid = {
    "lags_y":        [6, 12, 18],        # panjang memory target
    "lags_x":        lags_x_grids,       # memory exogenous
    "num_leaves":    [31, 63],
    "min_data_in_leaf": [10, 20, 40],
    "learning_rate": [0.05, 0.1],
    "n_estimators":  [400, 800],
}


In [118]:
def build_model(p):
    return LightGBMModel(
        lags=p["lags_y"],
        lags_past_covariates=p["lags_x"],
        output_chunk_length=1,
        learning_rate=p["learning_rate"],
        num_leaves=p["num_leaves"],
        n_estimators=p["n_estimators"],
        min_data_in_leaf=p["min_data_in_leaf"],
        max_depth=-1,
        feature_fraction=1.0,
        bagging_fraction=1.0,
        bagging_freq=0,
        random_state=42,
        n_jobs=-1,
        multi_models=True,    # satu model per target → stabil untuk dataset kecil
    )


In [119]:
best = {"smape": 1e9, "params": None, "model": None}
keys = list(param_grid.keys())
for values in product(*[param_grid[k] for k in keys]):
    p = dict(zip(keys, values))
    model = build_model(p)
    score = backtest_smape(model, ts_y_s, past_covariates=(ts_x_s if USE_COVS else None), start_frac=0.7)
    print(f"{p} → SMAPE={score:.2f}")
    if score < best["smape"]:
        best = {"smape": score, "params": p, "model": model}

print("\nBEST:", best["params"], "SMAPE=", best["smape"])


Output hidden; open in https://colab.research.google.com to view.