In [2]:
# Importing all the required libraries
from IPython.display import display

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from mpl_toolkits.mplot3d import Axes3D
from sklearn.model_selection import train_test_split, \
                                    StratifiedKFold

from sklearn.metrics import accuracy_score, classification_report
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier

import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.seasonal import seasonal_decompose
sns.set(style="whitegrid")

import warnings
warnings.filterwarnings('ignore')

# Seed shared by every stochastic step in the notebook.
# np.random.seed() returns None, so the constant is defined first and then
# handed to the seeding call; RANDOM_SEED can now be passed as `random_state`.
RANDOM_SEED = 0
np.random.seed(RANDOM_SEED)

import wandb

In [3]:
# Restore the variable `df` from IPython's %store database; it must have been
# saved beforehand (e.g. by another notebook/session) with `%store df`.
%store -r df

In [8]:
# Clustering: assign every observation to a spatial region with DBSCAN.

# Wrap longitudes into the [-180, 180) range.
df['lon'] = (df['lon'] + 180) % 360 - 180

# The haversine metric works on (lat, lon) pairs expressed in radians, so eps
# is an angular distance: 50 divided by 6371 (presumably a 50 km radius over
# the Earth's mean radius in km -- confirm the intended units).
coords_rad = np.radians(df[['lat', 'lon']])
dbscan = DBSCAN(
    eps=50/6371.,
    min_samples=20,
    algorithm='ball_tree',
    metric='haversine',
)
dbscan.fit(coords_rad)

# Store the cluster label of each row as its region id.
df['region'] = dbscan.labels_

# Show the distinct labels together with the number of rows in each.
print(np.unique(dbscan.labels_, return_counts=True))

(array([0, 1, 2, 3, 4, 5, 6]), array([7460, 7460, 7460, 7460, 3730, 7460, 3730]))


In [1]:
def _engineer_features(X, scaler):
    """Scale ``X`` with an already-fitted scaler and append derived features.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature columns only (no label, no year/month/day columns). Its index
        must expose ``.month`` and ``.day`` (e.g. a ``DatetimeIndex``).
    scaler : object
        A fitted scaler whose ``transform(X)`` returns an array with one
        column per column of ``X``.

    Returns
    -------
    pandas.DataFrame
        Frame with a fresh 0..n-1 index whose columns are, in order: the
        scaled features, ``<feature>_squared`` for each scaled feature, then
        ``sin_month``, ``cos_month``, ``sin_day``, ``cos_day``.
    """
    # Read the calendar parts straight from the index instead of adding
    # temporary year/month/day columns that would be dropped again anyway.
    month = np.asarray(X.index.month)
    day = np.asarray(X.index.day)

    scaled = pd.DataFrame(scaler.transform(X), columns=X.columns)

    # Build every squared column in one vectorised step.
    squared = (scaled ** 2).add_suffix('_squared')

    # Cyclical encoding of the date.
    # NOTE(review): day-of-month uses a fixed period of 30, so day 31 lands on
    # the same point as day 1; kept as-is to leave feature values unchanged.
    cyclical = pd.DataFrame({
        'sin_month': np.sin(2 * np.pi * month / 12),
        'cos_month': np.cos(2 * np.pi * month / 12),
        'sin_day': np.sin(2 * np.pi * day / 30),
        'cos_day': np.cos(2 * np.pi * day / 30),
    })

    return pd.concat([scaled, squared, cyclical], axis=1)


def preprocess(train, test, standardize=True):
    """Turn raw train/test frames into model-ready feature matrices.

    Duplicate rows are removed from the training data, the feature columns are
    scaled with a scaler fitted on the training data only, and squared and
    cyclical month/day features are appended. Neither ``train`` nor ``test``
    is modified; all work happens on new objects.

    Parameters
    ----------
    train : pandas.DataFrame
        Training data containing a ``'Label'`` column. Its index must expose
        ``.month`` and ``.day`` (e.g. a ``DatetimeIndex``).
    test : pandas.DataFrame
        Data to transform with the training-fitted scaler. It must hold the
        same feature columns as ``train`` without ``'Label'``.
    standardize : bool, default True
        If True use ``StandardScaler``; otherwise use ``MinMaxScaler``.

    Returns
    -------
    X_train : pandas.DataFrame
        Engineered training features, indexed 0..n-1.
    y_train : pandas.Series
        The ``'Label'`` column of the de-duplicated training data. It keeps
        the original index of ``train`` (it is not reset like ``X_train``).
    X_test : pandas.DataFrame
        Engineered test features, indexed 0..m-1, same columns as ``X_train``.

    Raises
    ------
    KeyError
        If ``train`` has no ``'Label'`` column.
    AttributeError
        If an index does not provide ``.month`` / ``.day``.
    """
    temporal_cols = ['year', 'month', 'day']

    # Non-inplace de-duplication: returns a new frame, leaving the caller's
    # object (often a slice of a larger frame) untouched.
    train = train.drop_duplicates()

    y_train = train['Label']

    # errors='ignore' tolerates frames that already carry year/month/day
    # columns (e.g. left over from the previous, mutating implementation);
    # such columns were never scaled and are not scaled here either.
    X_train = train.drop(columns=['Label']).drop(columns=temporal_cols, errors='ignore')
    X_test = test.drop(columns=temporal_cols, errors='ignore')

    scaler = StandardScaler() if standardize else MinMaxScaler()
    # Fit on training data only so no test statistics leak into the scaling.
    scaler.fit(X_train)

    return (
        _engineer_features(X_train, scaler),
        y_train,
        _engineer_features(X_test, scaler),
    )

In [4]:
# Time-based hold-out: rows from 2009 form the validation set, every other
# year is used for training.
is_holdout_year = df.index.year == 2009

X_train, y_train, X_val = preprocess(
    train=df.loc[~is_holdout_year],
    test=df.loc[is_holdout_year].drop(columns=['Label']),
    standardize=True,
)

# Validation labels are taken directly from the held-out rows.
y_val = df.loc[is_holdout_year, 'Label']

In [5]:
# Persist the train/validation splits in IPython's %store database so they can
# be restored later (e.g. from another notebook) with `%store -r <name>`.
%store X_train
%store y_train
%store X_val
%store y_val

Stored 'X_train' (DataFrame)
Stored 'y_train' (Series)
Stored 'X_val' (DataFrame)
Stored 'y_val' (Series)
