In [47]:
import numpy as np
import pandas as pd

In [48]:
# Raw CSV locations in our github repo (train and test splits)
url_train = 'https://raw.githubusercontent.com/tjayada/iANNwTF_Project/main/data/Train.csv'
url_test = 'https://raw.githubusercontent.com/tjayada/iANNwTF_Project/main/data/Test.csv'

# Download both splits into DataFrames; these raw frames are only copied,
# never modified, by the cleaning cell that follows
dataDF_train = pd.read_csv(url_train)
dataDF_test = pd.read_csv(url_test)

In [49]:
# create data set with mean values as replacement for NaN's
#
# The raw frames still contain string columns at this point ('Date',
# 'Place_ID', 'Place_ID X Date' are only dropped in a later cell), so the
# column means must be restricted to numeric columns. Without
# numeric_only=True, older pandas emits a FutureWarning about dropping
# nuisance columns and current pandas raises a TypeError.
# Non-numeric columns have no mean and are therefore left unfilled.

# for test data
data_with_mean_NaN_test = dataDF_test.copy()
data_with_mean_NaN_test = data_with_mean_NaN_test.fillna(
    data_with_mean_NaN_test.mean(numeric_only=True)
)

# for train data
data_with_mean_NaN_train = dataDF_train.copy()
data_with_mean_NaN_train = data_with_mean_NaN_train.fillna(
    data_with_mean_NaN_train.mean(numeric_only=True)
)

  """
  if __name__ == '__main__':


In [50]:
# Separate the regression label from the training features.

# Keep 'target' as a one-column DataFrame (double brackets) to use as labels
data_with_mean_NaN_train_labels = data_with_mean_NaN_train.loc[:, ['target']]

# Remove the label and every other target-derived column from the features
data_with_mean_NaN_train = data_with_mean_NaN_train.drop(
    columns=["target", "target_min", "target_max", "target_variance", "target_count"]
)

In [51]:
# Derive calendar features from the 'Date' string column.
# The same steps are applied to the test and the train frame, so they are
# written once and run in a loop; columns are added in the order
# date, weekday, weekend, month, dayofyear, quarter.
for frame in (data_with_mean_NaN_test, data_with_mean_NaN_train):
    # convert dates to datetime
    frame['date'] = pd.to_datetime(frame['Date'], format='%Y-%m-%d')
    calendar = frame['date'].dt

    # add weekdays based on dates
    frame['weekday'] = calendar.weekday

    # add whether the date is a weekend or not, 0 is no and 1 is yes
    # (multiplying the boolean mask by 1 turns it into integers)
    frame['weekend'] = np.multiply((calendar.weekday >= 5), 1)

    # add month based on dates
    frame['month'] = calendar.month

    # add which day out of all days in the year it is
    frame['dayofyear'] = calendar.dayofyear

    # what quarter is the date part of
    frame['quarter'] = calendar.quarter

In [52]:
# Remove the raw date string, the parsed datetime and the location
# identifiers; only the derived calendar features stay in the frames
data_with_mean_NaN_test = data_with_mean_NaN_test.drop(
    columns=["Place_ID X Date", "Date", "Place_ID", "date"]
)
data_with_mean_NaN_train = data_with_mean_NaN_train.drop(
    columns=["Place_ID X Date", "Date", "Place_ID", "date"]
)

In [53]:
# normalize data

from sklearn.preprocessing import MinMaxScaler

# Learn the per-column min/max from the TRAINING features only and reuse the
# same fitted scaler for the test features. Fitting a second scaler on the
# test set (as was done before) maps train and test onto different scales, so
# equal raw values would end up as different model inputs.
# fit_transform already fits, so no separate fit() call is needed.
scaler = MinMaxScaler()
scaled = scaler.fit_transform(data_with_mean_NaN_train)

# for test data: select the columns in the order the scaler was fitted on,
# then apply the train statistics (values may fall slightly outside [0, 1])
scaled_test = scaler.transform(data_with_mean_NaN_test[data_with_mean_NaN_train.columns])
data_with_mean_NaN_test = pd.DataFrame(scaled_test, columns=data_with_mean_NaN_train.columns)

# for train data
data_with_mean_NaN_train = pd.DataFrame(scaled, columns=data_with_mean_NaN_train.columns)

In [54]:
# find out most important features through PCA

from sklearn.decomposition import PCA

# just tried around and found out that 6 features are the best match
pca = PCA(n_components=6).fit(data_with_mean_NaN_train)

# number of components, in our case 6
number_of_components = pca.components_.shape[0]

initial_feature_names = data_with_mean_NaN_train.columns

# for every principal component, the column index of the feature with the
# largest absolute loading
most_important = [abs(loadings).argmax() for loadings in pca.components_]

# translate those column indices into feature names
most_important_names = [initial_feature_names[column_index] for column_index in most_important]

# individual importance in the order of the components; the printed ratio is
# the explained variance ratio of the component the feature was picked from
for feature_name, variance_ratio in zip(most_important_names, pca.explained_variance_ratio_):
    print(feature_name + " explains " + str(np.round(variance_ratio * 100)) + "% of the variance \n")

# total explainability of all six components combined
print("\n All six features together can explain " + str(np.round(sum(pca.explained_variance_ratio_)*100)) + "% of the variance")

L3_CLOUD_sensor_azimuth_angle explains 23.0% of the variance 

L3_NO2_sensor_zenith_angle explains 14.0% of the variance 

L3_O3_cloud_fraction explains 12.0% of the variance 

L3_CLOUD_cloud_fraction explains 8.0% of the variance 

weekend explains 8.0% of the variance 

L3_O3_solar_azimuth_angle explains 6.0% of the variance 


 All six features together can explain 71.0% of the variance


In [55]:
# Add lagged copies of the most important features as extra columns.
# Only a lag of 1 is generated at the moment (range(1, 2)).
# NOTE(review): shift() moves values down by row position, so "day before"
# assumes consecutive rows are consecutive days of the same place - confirm
# against the row order of the CSVs.

for name in most_important_names:
    for lag in range(1, 2):
        lagged_column = f"{name}_of_{lag}_day_before"
        data_with_mean_NaN_train[lagged_column] = data_with_mean_NaN_train[name].shift(lag)
        data_with_mean_NaN_test[lagged_column] = data_with_mean_NaN_test[name].shift(lag)

In [56]:
# shifting leaves NaN's in the lagged columns of the first row(s);
# build copies of both frames without any row that contains a NaN
data_without_columns_test = data_with_mean_NaN_test.dropna(axis=0, how="any")
data_without_columns_train = data_with_mean_NaN_train.dropna(axis=0, how="any")

In [57]:
# save the new and clean data
#
# Write the NaN-free frames produced by dropna() in the previous cell. The
# frames saved before were the ones that still contained the NaN rows created
# by shifting, so the dropna() step had no effect on the files.
# File names are unchanged so existing loaders keep working.
data_without_columns_train.to_csv('data_with_mean_NaN_train.csv')

# Keep only the labels of the rows that survived dropna(), so features and
# labels stay aligned row by row. Both frames carry the default 0..n-1 index
# (read_csv default / DataFrame rebuilt after scaling), so index labels match.
data_with_mean_NaN_train_labels.loc[data_without_columns_train.index].to_csv(
    'data_with_mean_NaN_train_labels.csv'
)

data_without_columns_test.to_csv('data_with_mean_NaN_test.csv')