In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from scipy.stats import spearmanr, randint

In [2]:
!pip install tensorflow



In [55]:
# For LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Reshape
from tensorflow.keras.callbacks import EarlyStopping

In [57]:
# Datasets import
X_train = pd.read_csv('/content/X_train.csv')
y_train = pd.read_csv('/content/y_train.csv')

In [45]:
X_train.head()

Unnamed: 0,ID,DAY_ID,COUNTRY,DE_CONSUMPTION,FR_CONSUMPTION,DE_FR_EXCHANGE,FR_DE_EXCHANGE,DE_NET_EXPORT,FR_NET_EXPORT,DE_NET_IMPORT,...,FR_RESIDUAL_LOAD,DE_RAIN,FR_RAIN,DE_WIND,FR_WIND,DE_TEMP,FR_TEMP,GAS_RET,COAL_RET,CARBON_RET
0,1054,206,FR,0.210099,-0.427458,-0.606523,0.606523,,0.69286,,...,-0.444661,-0.17268,-0.556356,-0.790823,-0.28316,-1.06907,-0.063404,0.339041,0.124552,-0.002445
1,2049,501,FR,-0.022399,-1.003452,-0.022063,0.022063,-0.57352,-1.130838,0.57352,...,-1.183194,-1.2403,-0.770457,1.522331,0.828412,0.437419,1.831241,-0.659091,0.047114,-0.490365
2,1924,687,FR,1.395035,1.978665,1.021305,-1.021305,-0.622021,-1.682587,0.622021,...,1.947273,-0.4807,-0.313338,0.431134,0.487608,0.684884,0.114836,0.535974,0.743338,0.204952
3,297,720,DE,-0.983324,-0.849198,-0.839586,0.839586,-0.27087,0.56323,0.27087,...,-0.976974,-1.114838,-0.50757,-0.499409,-0.236249,0.350938,-0.417514,0.911652,-0.296168,1.073948
4,1101,818,FR,0.143807,-0.617038,-0.92499,0.92499,,0.990324,,...,-0.526267,-0.541465,-0.42455,-1.088158,-1.01156,0.614338,0.729495,0.245109,1.526606,2.614378


In [46]:
y_train.head()

Unnamed: 0,ID,TARGET
0,1054,0.028313
1,2049,-0.112516
2,1924,-0.18084
3,297,-0.260356
4,1101,-0.071733


In [58]:
def fill_missing_with_average(df):
    # Iterate over each column in the DataFrame
    for col in df.columns:
        # Iterate over each row in the column
        for i in range(len(df[col])):
            # Check if the value is missing
            if pd.isnull(df.iloc[i][col]):
                # Find the nearest non-missing value by moving backward
                j = i - 1
                while j >= 0 and pd.isnull(df.iloc[j][col]):
                    j -= 1

                # Find the nearest non-missing value by moving forward
                k = i + 1
                while k < len(df[col]) and pd.isnull(df.iloc[k][col]):
                    k += 1

                # Calculate the average of the nearest non-missing values
                if j >= 0 and k < len(df[col]):
                    average_value = (df.iloc[j][col] + df.iloc[k][col]) / 2
                elif j >= 0:
                    average_value = df.iloc[j][col]
                elif k < len(df[col]):
                    average_value = df.iloc[k][col]
                else:
                    # If no non-missing values are found, set to 0 (or any default value)
                    average_value = 0

                # Fill the missing value with the calculated average
                df.iloc[i, df.columns.get_loc(col)] = average_value

    return df


In [59]:
# Fill_NA, NORMALIZATION, PCA
def preprocessing(df, norm = False, pca = True):
    fill_df = fill_missing_with_average(df)

    if norm == True:
        scaler = StandardScaler()
        df_normalized = scaler.fit_transform(fill_df)
    else:
        df_normalized = fill_df

    if pca == True:
        p = PCA(n_components=0.95)
        df_pca = p.fit_transform(df_normalized)
        return df_pca
    else:
        return df_normalized

In [60]:
X_train_clean = X_train.drop(['COUNTRY'], axis=1)
X_train_clean = preprocessing(X_train_clean, norm=True, pca=True)
X_train_clean = pd.DataFrame(X_train_clean)
X_train_clean.columns = X_train_clean.columns.astype(str)
X_train_clean

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,-0.640630,-2.457621,-0.138258,0.083025,0.882886,-0.815684,1.903222,0.321438,-1.308564,-0.636947,0.126082,0.166463,0.274490,-0.347427,0.287609,-0.801382,1.088272,0.574392,0.292136
1,-2.843831,2.609720,-3.120794,-0.909480,-0.833764,0.784113,0.864452,-1.671208,-0.237561,0.266715,-0.744853,-0.090961,0.117535,-0.076573,-0.040422,0.670503,-0.058020,0.192402,0.783366
2,5.250135,-1.206426,-3.274188,1.168444,-1.236565,0.784147,-0.062533,-1.844627,0.498997,0.261321,0.087106,0.230488,0.669683,0.426517,0.126604,-1.130585,-0.339621,-0.552797,-0.817231
3,-3.064845,0.394868,1.415185,-1.149924,0.378656,-1.449373,-2.131067,-0.648556,0.545504,1.269351,0.480728,-0.476070,-0.912257,1.107326,0.557240,0.497292,0.669229,-0.014263,0.274047
4,-1.205818,-2.254471,0.262835,0.357092,-1.713137,-2.105383,2.005003,-0.041701,0.801543,-0.502936,-1.356709,-0.494035,-0.176831,0.766078,1.212985,0.380801,-0.436173,0.050808,-0.171506
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1489,2.566561,-3.206121,2.895239,1.940730,-2.205050,0.476767,-0.071460,-0.931400,0.609107,0.037677,-0.244620,-1.416734,-1.445069,-0.765235,0.230859,0.360447,-0.168928,1.007930,0.818277
1490,5.496024,-1.897830,0.184865,0.463033,-1.254102,0.140039,0.263627,-0.815119,0.881643,1.281871,0.583338,0.578649,0.046334,-0.257501,-0.460200,0.489037,0.551928,0.677300,0.130777
1491,1.302956,-1.899848,-2.246987,0.956665,-1.070554,2.637751,-0.567944,-0.029835,1.704005,-0.438307,0.105844,-1.078030,-0.256771,0.171076,0.413673,-0.069189,-0.198605,-0.196659,-0.461090
1492,-0.882623,-1.042309,1.708781,-3.623265,-0.498936,1.235204,1.093580,-0.938695,1.551202,1.475407,2.821537,2.366969,-2.361907,0.958598,0.004847,0.235614,-1.020495,0.437622,0.452345


In [61]:
Y_train_clean = y_train['TARGET']
Y_train_clean

0       0.028313
1      -0.112516
2      -0.180840
3      -0.260356
4      -0.071733
          ...   
1489   -0.172597
1490   -0.063546
1491    0.151797
1492   -0.640917
1493   -0.252764
Name: TARGET, Length: 1494, dtype: float64

# Function for Spearman correlation

In [62]:
# Print out Spearman correlation for the train set
def metric_train(output, Y):
    metric = spearmanr(output, Y).correlation
    print('Spearman correlation for the train set: {:.1f}%'.format(100 * metric ))
    return metric

#LSTM Model

In [63]:
# Create sequences
def create_sequences(X, y=None, time_steps=1):
    Xs, ys = [], []
    for i in range(len(X) - time_steps):
        Xs.append(X[i:(i + time_steps)])
        if y is not None:
            ys.append(y[i + time_steps])
    return np.array(Xs), np.array(ys) if y is not None else None

In [64]:
time_steps = 10
X_train_seq, Y_train_seq = create_sequences(X_train_clean, Y_train_clean, time_steps)

In [79]:
# Define LSTM Model
model = Sequential([
    LSTM(50, activation='relu', input_shape=(X_train_seq.shape[1], X_train_seq.shape[2])),
    Dropout(0.2),
    Dense(1)
])

model.compile(optimizer='adam', loss='mse')
model.fit(X_train_seq, Y_train_seq, epochs=100, verbose=1, batch_size=32)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.src.callbacks.History at 0x7e6eea44b700>

In [80]:
output_train = model.predict(X_train_seq)
result_train = metric_train(output_train.flatten(), Y_train_seq)

Spearman correlation for the train set: 86.9%


# Generate Submission file

In [74]:
X_test = pd.read_csv('/content/X_test.csv')

In [75]:
X_test = X_test.drop(['COUNTRY'], axis=1)
X_test_clean = preprocessing(X_test, norm=True, pca=True)

In [76]:
X_test_seq, _ = create_sequences(X_test_clean, time_steps=time_steps)
test_predictions = model.predict(X_test_seq)
aligned_predictions = np.zeros(len(X_test_clean))
aligned_predictions[time_steps:] = test_predictions[:, 0]



In [77]:
# Define submission file
def submission(X_test, predictions, model=None, name=''):
    Y_test_submission = X_test[['ID']].copy()
    Y_test_submission['TARGET'] = predictions
    Y_test_submission.to_csv(name + '.csv', index=False)

In [78]:
submission(X_test, aligned_predictions, model, 'LSTM')