In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats 


from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold


# machine-learning stack
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
    OneHotEncoder,
    StandardScaler,
    MinMaxScaler,
    PolynomialFeatures,
    FunctionTransformer,
    OrdinalEncoder)

# Load data set

In [None]:
# Load the data set into the notebook-wide `data` frame.
# NOTE(review): pd.read_csv() is called without a file path or buffer, so this
# cell cannot run as written — supply the location of the CSV file.
data = pd.read_csv()

## Convert Date to datetime format (stored as Date_DT)

In [None]:
# Parse the 'Date' column into datetime values, stored in a new 'Date_DT' column.
data['Date_DT'] = pd.to_datetime(data['Date'])
# Remove the original 'Date' column now that 'Date_DT' replaces it.
# `columns=` is needed because a bare drop('Date') looks for a *row* label,
# and the result must be assigned back because drop() returns a new frame.
data = data.drop(columns='Date')

## Dropping unused columns from the start

In [None]:
# Remove the last 7 columns (all L3_CH4_*): they have too many missing values.
list_drop_col = [
    'L3_CH4_CH4_column_volume_mixing_ratio_dry_air',
    'L3_CH4_aerosol_height',
    'L3_CH4_aerosol_optical_depth',
    'L3_CH4_sensor_azimuth_angle',
    'L3_CH4_sensor_zenith_angle',
    'L3_CH4_solar_azimuth_angle',
    'L3_CH4_solar_zenith_angle',
]

data = data.drop(columns=list_drop_col)

In [None]:
# Remove every column whose name contains 'azimuth' or 'zenith'.
data = data.drop(
    columns=[
        name
        for name in data.columns
        if any(tag in name for tag in ('azimuth', 'zenith'))
    ]
)

In [None]:
# Remove the NO2 and CO sensor-altitude columns.
data = data.drop(
    ['L3_NO2_sensor_altitude', 'L3_CO_sensor_altitude'],
    axis='columns',
)

In [None]:
# Remove the tropospheric NO2 column from `data`.
# drop() returns a new frame, so the result has to be assigned back;
# without the assignment the column silently stays in `data`.
data = data.drop(columns='L3_NO2_tropospheric_NO2_column_number_density')

## Dropping unused rows

In [None]:
# Maximum number of missing values a row may contain and still be kept.
# NOTE(review): if the frame has fewer than 61 columns at this point, no row
# can exceed this threshold and the filter below removes nothing — verify.
threshold = 60

# Count the missing values of every row once and reuse the result below.
missing_per_row = data.isnull().sum(axis=1)

# Rows with more than `threshold` missing values (kept aside for inspection).
rows_with_many_missing = data[missing_per_row > threshold]

# Keep only the rows at or below the threshold.
data = data[missing_per_row <= threshold]

# Last expression of the cell, so the number of removed rows is displayed.
len(rows_with_many_missing)

## Prepare data set to interpolate null values in columns

In [None]:
# Build a MultiIndex from place and time.
# Assigning the result (instead of inplace=True) avoids mutating the frame
# in place across cells.
data = data.set_index(['Place_ID', 'Date_DT'])

In [None]:
# Columns whose missing values are filled by interpolation.
columns_to_interpolate = [
    'target_variance',
    'L3_NO2_NO2_column_number_density',
    'L3_NO2_NO2_slant_column_number_density',
    'L3_NO2_absorbing_aerosol_index',
    'L3_NO2_cloud_fraction',
    'L3_NO2_stratospheric_NO2_column_number_density',
    'L3_NO2_tropopause_pressure',
    'L3_O3_O3_column_number_density',
    'L3_O3_O3_effective_temperature',
    'L3_O3_cloud_fraction',
    'L3_CO_CO_column_number_density',
    'L3_CO_H2O_column_number_density',
    'L3_CO_cloud_height',
    'L3_HCHO_HCHO_slant_column_number_density',
    'L3_HCHO_cloud_fraction',
    'L3_HCHO_tropospheric_HCHO_column_number_density',
    'L3_HCHO_tropospheric_HCHO_column_number_density_amf',
    'L3_CLOUD_cloud_base_height',
    'L3_CLOUD_cloud_base_pressure',
    'L3_CLOUD_cloud_fraction',
    'L3_CLOUD_cloud_optical_depth',
    'L3_CLOUD_cloud_top_height',
    'L3_CLOUD_cloud_top_pressure',
    'L3_CLOUD_surface_albedo',
    'L3_AER_AI_absorbing_aerosol_index',
    'L3_AER_AI_sensor_altitude',
    'L3_SO2_SO2_column_number_density',
    'L3_SO2_SO2_column_number_density_amf',
    'L3_SO2_SO2_slant_column_number_density',
    'L3_SO2_absorbing_aerosol_index',
    'L3_SO2_cloud_fraction',
]

# Linearly interpolate the listed columns separately within each Place_ID
# group and write the result back into the same columns.
# NOTE(review): method='linear' follows row order — presumably the rows are
# already in date order within each place; confirm.
data[columns_to_interpolate] = (
    data
    .groupby(level='Place_ID')[columns_to_interpolate]
    .transform(lambda values: values.interpolate(method='linear'))
)

In [None]:
# Turn the index levels back into ordinary columns of the frame.
data = data.reset_index(drop=False)

In [None]:
# Discard every row that still contains at least one missing value.
data = data.dropna(axis='index', how='any')

In [None]:
# Sanity check: the frame is expected to have 44 columns at this point
# (the second element of the displayed (rows, columns) tuple).
data.shape

## Pre-processing

In [None]:
# Day of the week of each observation (Monday=0, Sunday=6).
data['DayOfWeek'] = data['Date_DT'].dt.dayofweek
# Weekend flag: 1 for Saturday/Sunday (DayOfWeek 5 or 6), 0 for weekdays.
# A vectorised comparison replaces the per-row apply(lambda ...).
data['IsWeekend'] = (data['DayOfWeek'] >= 5).astype('int64')

In [None]:
# Feature frame: everything except the date column, the 'Place_ID X Date'
# column, and the target together with its summary columns.
X = data.drop(
    [
        'Date_DT',
        'Place_ID X Date',
        'target',
        'target_min',
        'target_max',
        'target_variance',
        'target_count',
    ],
    axis='columns',
)
# Value to predict.
y = data['target']

In [None]:
# Continuous numerical feature columns, grouped by the prefix of their name.
# The order is significant: it is the order in which the columns are handed
# to the numeric transformer.
num_features = [
    # columns without an L3_ prefix
    'precipitable_water_entire_atmosphere',
    'relative_humidity_2m_above_ground',
    'specific_humidity_2m_above_ground',
    'temperature_2m_above_ground',
    'u_component_of_wind_10m_above_ground',
    'v_component_of_wind_10m_above_ground',
    # L3_NO2_*
    'L3_NO2_NO2_column_number_density',
    'L3_NO2_NO2_slant_column_number_density',
    'L3_NO2_absorbing_aerosol_index',
    'L3_NO2_cloud_fraction',
    'L3_NO2_stratospheric_NO2_column_number_density',
    'L3_NO2_tropopause_pressure',
    # L3_O3_*
    'L3_O3_O3_column_number_density',
    'L3_O3_O3_effective_temperature',
    'L3_O3_cloud_fraction',
    # L3_CO_*
    'L3_CO_CO_column_number_density',
    'L3_CO_H2O_column_number_density',
    'L3_CO_cloud_height',
    # L3_HCHO_*
    'L3_HCHO_HCHO_slant_column_number_density',
    'L3_HCHO_cloud_fraction',
    'L3_HCHO_tropospheric_HCHO_column_number_density',
    'L3_HCHO_tropospheric_HCHO_column_number_density_amf',
    # L3_CLOUD_*
    'L3_CLOUD_cloud_base_height',
    'L3_CLOUD_cloud_base_pressure',
    'L3_CLOUD_cloud_fraction',
    'L3_CLOUD_cloud_optical_depth',
    'L3_CLOUD_cloud_top_height',
    'L3_CLOUD_cloud_top_pressure',
    'L3_CLOUD_surface_albedo',
    # L3_AER_AI_*
    'L3_AER_AI_absorbing_aerosol_index',
    'L3_AER_AI_sensor_altitude',
    # L3_SO2_*
    'L3_SO2_SO2_column_number_density',
    'L3_SO2_SO2_column_number_density_amf',
    'L3_SO2_SO2_slant_column_number_density',
    'L3_SO2_absorbing_aerosol_index',
    'L3_SO2_cloud_fraction',
]

In [None]:
# Columns treated as nominal categories.
# NOTE(review): IsWeekend is derived from DayOfWeek, so after one-hot encoding
# the two carry overlapping information — confirm both are wanted.
cat_features = [
    'Place_ID',
    'DayOfWeek',
    'IsWeekend',
]

In [None]:
# Numeric columns: fill missing values with the column mean, then rescale
# with MinMaxScaler.
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler()),
])

# Nominal categorical columns: fill missing values with the most frequent
# value, then one-hot encode. The encoder drops the first category, groups
# categories below the 0.01 frequency cut-off as infrequent, and maps unknown
# categories to the infrequent group when one exists.
nominal_cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    (
        'onehot',
        OneHotEncoder(
            min_frequency=0.01,
            handle_unknown='infrequent_if_exist',
            drop='first',
        ),
    ),
])

# Route each feature list to its pipeline. No `remainder` is given, so the
# library default applies to any column not listed in either feature list.
preprocessor = ColumnTransformer([
    ('num', num_transformer, num_features),
    ('nom', nominal_cat_transformer, cat_features),
])