In [338]:
import numpy as np
import pandas as pd

In [339]:
# Locations of the raw test and train CSV files in our GitHub repo
url_test = 'https://raw.githubusercontent.com/tjayada/iANNwTF_Project/main/data/raw%20data/Test.csv'
url_train = 'https://raw.githubusercontent.com/tjayada/iANNwTF_Project/main/data/raw%20data/Train.csv'

# Download both splits (test first, then train) into DataFrames
dataDF_test, dataDF_train = (
    pd.read_csv(csv_url) for csv_url in (url_test, url_train)
)

In [340]:
# Create working frames without the seven L3_CH4_* columns (referred to as
# columns 70 - 76 of the raw data) and drop every row that contains a NaN.

# Single source of truth for the columns to remove, so the train and test
# splits cannot drift apart.
CH4_COLUMNS = [
    "L3_CH4_CH4_column_volume_mixing_ratio_dry_air",
    "L3_CH4_aerosol_height",
    "L3_CH4_aerosol_optical_depth",
    "L3_CH4_sensor_azimuth_angle",
    "L3_CH4_sensor_zenith_angle",
    "L3_CH4_solar_azimuth_angle",
    "L3_CH4_solar_zenith_angle",
]

# DataFrame.drop and DataFrame.dropna each return a new frame, so the raw
# dataDF_* frames stay untouched without an explicit .copy().

# for test data
data_without_columns_test = dataDF_test.drop(columns=CH4_COLUMNS).dropna()

# for train data
data_without_columns_train = dataDF_train.drop(columns=CH4_COLUMNS).dropna()

In [341]:
# Keep the 'target' column as a one-column DataFrame of training labels ...
data_without_columns_train_labels = data_without_columns_train[['target']]

# ... and remove it, together with the other target_* columns, from the
# training features
target_related_columns = ["target", "target_min", "target_max", "target_variance", "target_count"]
data_without_columns_train = data_without_columns_train.drop(columns=target_related_columns)

In [342]:
# Derive calendar features from the 'Date' strings, identically for the
# test and the train frame. Both frames are extended in place.
for frame in (data_without_columns_test, data_without_columns_train):
    # parse the 'Date' strings (YYYY-MM-DD) into a datetime column
    frame['date'] = pd.to_datetime(frame['Date'], format='%Y-%m-%d')
    calendar = frame['date'].dt

    # day of the week as an integer (pandas counts Monday as 0)
    frame['weekday'] = calendar.weekday

    # weekend flag: 1 if the weekday is 5 or 6, otherwise 0
    frame['weekend'] = np.multiply(calendar.weekday >= 5, 1)

    # month of the year
    frame['month'] = calendar.month

    # ordinal day within the year
    frame['dayofyear'] = calendar.dayofyear

    # quarter of the year the date falls into
    frame['quarter'] = calendar.quarter

In [343]:
# The identifier columns, the raw 'Date' strings and the parsed 'date'
# column are removed from both frames
identifier_and_date_columns = ["Place_ID X Date", "Date", "Place_ID", "date"]
data_without_columns_train = data_without_columns_train.drop(columns=identifier_and_date_columns)
data_without_columns_test = data_without_columns_test.drop(columns=identifier_and_date_columns)

In [344]:
# Normalize every feature with a min-max scaling learned from the TRAIN split.
#
# The scaler must be fitted on the training data only and then re-used for
# the test data. Fitting a second scaler on the test split would map the same
# raw value to different scaled values in train and test, so a model trained
# on the train features would see inconsistently scaled test features.

from sklearn.preprocessing import MinMaxScaler

# for train data: learn per-column min/max and scale in one step
# (fit_transform already fits, so no separate fit() call is needed)
feature_columns = data_without_columns_train.columns
scaler = MinMaxScaler()
scaled = scaler.fit_transform(data_without_columns_train)
data_without_columns_train = pd.DataFrame(scaled, columns=feature_columns)

# for test data: apply the train min/max; values outside the range seen in
# training can therefore fall outside [0, 1].
# Selecting by feature_columns puts the test columns in the train order;
# assumes the test frame contains every train feature column (a KeyError
# here means the two splits have diverged).
scaled_test = scaler.transform(data_without_columns_test[feature_columns])
data_without_columns_test = pd.DataFrame(scaled_test, columns=feature_columns)

In [345]:
# Find the most important features through PCA: for every principal
# component, pick the input feature with the largest absolute loading.

from sklearn.decomposition import PCA

# 6 components were found to be the best match by trial and error.
# random_state pins the result for the solvers that use randomness, so a
# "Restart & Run All" selects the same features every time.
pca = PCA(n_components=6, random_state=0).fit(data_without_columns_train)

# number of components, in our case 6
number_of_components = pca.components_.shape[0]

# column index of the feature with the largest absolute loading, per component
most_important = [abs(loadings).argmax() for loadings in pca.components_]

initial_feature_names = data_without_columns_train.columns
# names of those features, in component order
most_important_names = [initial_feature_names[column_index] for column_index in most_important]

# Explained variance per component, in descending order.
# NOTE: explained_variance_ratio_ belongs to the whole principal component;
# the printed feature is only the strongest contributor to that component.
for feature_name, variance_ratio in zip(most_important_names, pca.explained_variance_ratio_):
    print(feature_name + " explains " + str(np.round(variance_ratio * 100)) + "% of the variance \n")

# total explained variance of all six components combined
print("\n All six features together can explain " + str(np.round(sum(pca.explained_variance_ratio_)*100)) + "% of the variance")

L3_SO2_sensor_azimuth_angle explains 29.0% of the variance 

L3_NO2_sensor_zenith_angle explains 17.0% of the variance 

L3_NO2_solar_zenith_angle explains 12.0% of the variance 

weekend explains 8.0% of the variance 

L3_NO2_solar_azimuth_angle explains 7.0% of the variance 

L3_CLOUD_cloud_fraction explains 5.0% of the variance 


 All six features together can explain 78.0% of the variance


In [346]:
# Add lagged copies of the most important features as extra columns.
# With number_of_shifts = 1 only the value of the previous row is added.
#
# NOTE(review): shift() moves values by row position over the whole frame.
# The place and date columns are no longer present at this point, so the
# "previous row" is presumably not always the previous day of the same
# place -- verify against the row order of the raw data.

number_of_shifts = 1
for feature in most_important_names:
    for lag in range(1, number_of_shifts + 1):
        lagged_column = f"{feature}_of_{lag}_day_before"
        data_without_columns_train[lagged_column] = data_without_columns_train[feature].shift(lag)
        data_without_columns_test[lagged_column] = data_without_columns_test[feature].shift(lag)

In [347]:
# Give all three frames a fresh 0..n-1 index, so feature rows and label rows
# share the same row labels and can be aligned by index below
data_without_columns_train = data_without_columns_train.reset_index(drop=True)
data_without_columns_test = data_without_columns_test.reset_index(drop=True)
data_without_columns_train_labels = data_without_columns_train_labels.reset_index(drop=True)

# features and labels must still describe the same rows at this point
assert len(data_without_columns_train_labels) == len(data_without_columns_train), \
    "train features and labels have different lengths"

# through shifting we have created NaN's once again, so we need to drop these rows
data_without_columns_train = data_without_columns_train.dropna()
data_without_columns_test = data_without_columns_test.dropna()

# Keep exactly the labels whose feature row survived. Aligning on the index
# (instead of dropping the first number_of_shifts labels) stays correct even
# if dropna() removed rows other than the leading ones.
data_without_columns_train_labels = data_without_columns_train_labels.loc[data_without_columns_train.index]

In [348]:
# Write the cleaned train features, train labels and test features to CSV
# files in the current working directory (the index is written as well)
csv_outputs = (
    ('data_without_columns_train.csv', data_without_columns_train),
    ('data_without_columns_train_labels.csv', data_without_columns_train_labels),
    ('data_without_columns_test.csv', data_without_columns_test),
)
for csv_path, frame in csv_outputs:
    frame.to_csv(csv_path)