In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score

import eli5
from eli5.sklearn import PermutationImportance

import tensorflow as tf
from tensorflow import keras

In [None]:
def filter_years(df, years=(2013, 2018)):
    """
    Return the rows of ``df`` whose 'Date' falls in one of the given years.

    The 'Date' column is parsed with ``pd.to_datetime`` in the returned frame.
    The frame passed in is left untouched, and the result is an independent
    copy, so later column assignments do not act on a slice of the input.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Date' column that ``pd.to_datetime`` can parse.
    years : iterable of int, optional
        Calendar years to keep. Defaults to 2013 and 2018.

    Returns
    -------
    pandas.DataFrame
        Filtered copy of ``df`` with 'Date' converted to datetime.
    """
    # assign() builds a new frame, so the caller's 'Date' column is not overwritten.
    parsed = df.assign(Date=pd.to_datetime(df['Date']))
    keep = parsed['Date'].dt.year.isin(list(years))
    return parsed.loc[keep].copy()

# Read the CSV into a pandas data frame and keep only the years selected
# by filter_years.
weather = filter_years(pd.read_csv("weatherAUS.csv"))

# Number of remaining rows per year, shown as the cell output.
weather['Date'].dt.year.value_counts()

# Aufgabe 1 Feature Engineering

## EDA

In [None]:
# Share of missing values (NaN) in each column
weather.isnull().mean(axis=0)


## Data wrangling

In [None]:
# Drop rows whose target variable "RainTomorrow" is missing
weather = weather.dropna(subset=['RainTomorrow'])

In [None]:
# Keep only the columns, and afterwards only the rows, whose proportion of
# NaN values is below 70%.

nan_share_per_column = weather.isna().mean()
weather = weather.loc[:, nan_share_per_column < 0.7]

# The row-wise share is computed on the already reduced set of columns.
nan_share_per_row = weather.isna().mean(axis=1)
weather = weather[nan_share_per_row < 0.7]

In [None]:
def encode_yes_no(data):
    """Map 'Yes' to 1 and 'No' to 0; every other value yields None."""
    for label, code in (('Yes', 1), ('No', 0)):
        if data == label:
            return code
    return None


# Both rain indicator columns hold 'Yes'/'No' labels; encode them as 1/0.
for yes_no_column in ("RainToday", "RainTomorrow"):
    weather[yes_no_column] = weather[yes_no_column].apply(encode_yes_no)

In [None]:
# Split the timestamp into separate calendar features, then discard the
# raw 'Date' column together with 'Location'.
weather = weather.assign(
    Year=weather['Date'].dt.year,    # get year
    Month=weather['Date'].dt.month,  # get month
    Day=weather['Date'].dt.day,      # get day
).drop(columns=['Date', 'Location'])

In [None]:
def encode_season(month):
    """
    Translate a month number into a season name.

    September-November -> 'Spring', December-February -> 'Summer',
    March-May -> 'Autumn', June-August -> 'Winter'. A value matched by none
    of the ranges yields None.
    """
    if 9 <= month <= 11:
        season = 'Spring'
    elif month == 12 or month <= 2:
        season = 'Summer'
    elif 3 <= month <= 5:
        season = 'Autumn'
    elif 6 <= month <= 8:
        season = 'Winter'
    else:
        season = None
    return season
    
# Derive the season label of every row from its month number.
weather = weather.assign(Season=weather['Month'].apply(encode_season))

**Important!** 

Before starting feature engineering, one must split the dataset to avoid train/test leakage!
All decisions in data engineering must be made on the train set only! From here on, we assume that we don't have any 
knowledge about the test data.

In [None]:
# Separate the target column from the features, then hold out 20% of the
# rows as the test set (fixed random_state for a repeatable split).
y = weather['RainTomorrow']
X = weather.drop(columns=['RainTomorrow'])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [None]:
# Impute missing values with the naive approach, i.e. without considering
# the location or other context such as the season.
#
# All fill values are derived from the training split only and are then
# applied to both splits: never impute based on test data, we have to assume
# that we do not know it.

columns_with_nan = [
    col for col in X_train.columns
    if X_train[col].isna().any() or X_test[col].isna().any()
]

fill_values = {}
for col in columns_with_nan:
    # is_numeric_dtype is used instead of comparing the dtype with 'O' so that
    # non-object string dtypes are not sent down the median path.
    if pd.api.types.is_numeric_dtype(X_train[col]):
        # Numerical feature: median of the training data.
        fill_value = X_train[col].median()
    else:
        # Categorical feature: most frequent training value. mode() is empty
        # when the training column holds no values at all.
        train_modes = X_train[col].mode()
        fill_value = train_modes.iloc[0] if not train_modes.empty else np.nan
    if pd.notna(fill_value):
        fill_values[col] = fill_value

# fillna returns new frames: nothing is assigned onto the objects produced by
# train_test_split, and re-running this cell does not change the result.
X_train = X_train.fillna(fill_values)
X_test = X_test.fillna(fill_values)
        

In [None]:
# Apply one-hot encoding.
#
# The category set of every feature is taken from the training split and used
# for both splits. Encoding each split on its own would produce different
# dummy columns (or a different dropped reference category) whenever a value
# occurs in only one of the splits.

for col in ["WindGustDir", "WindDir9am", "WindDir3pm", "Season"]:
    # Sorted order matches what get_dummies derives from a plain column, so
    # the training columns are named and ordered as before.
    train_categories = pd.CategoricalDtype(sorted(X_train[col].dropna().unique()))

    encoded_columns = pd.get_dummies(
        X_train[col].astype(train_categories), prefix=col, drop_first=True
    )
    X_train = X_train.join(encoded_columns).drop(col, axis=1)

    # Test values that never occur in the training data turn into NaN in the
    # cast and are therefore encoded as a row of zeros.
    encoded_columns = pd.get_dummies(
        X_test[col].astype(train_categories), prefix=col, drop_first=True
    )
    X_test = X_test.join(encoded_columns).drop(col, axis=1)

In [None]:
# selection was made based on Permutation Importance output below
select = ['Humidity3pm', 'Pressure3pm', 'Rainfall', 'Season_Winter']

# Restrict both splits to the same subset of feature columns.
X_train, X_test = X_train[select], X_test[select]

# Aufgabe 2 Entscheidungsbäume

In [None]:
# A fixed random_state makes the fitted tree reproducible: the candidate
# features are permuted randomly at every split, so equally good splits could
# otherwise be resolved differently from run to run.
model = tree.DecisionTreeClassifier(max_depth=5, random_state=0)

model.fit(X_train, y_train)

In [None]:
# Display the training features that the model was fitted on.
X_train

In [None]:
# Predict on the held-out split and cast the predictions to integer labels.
raw_predictions = model.predict(X_test)
test_predictions = raw_predictions.round().astype(int)

# Print the accuracy; the mean absolute error is shown as the cell output.
test_accuracy = accuracy_score(y_test, test_predictions)
print(test_accuracy)
mean_absolute_error(y_test, test_predictions)

## Analysis

In [None]:
# Compute permutation importances of the fitted model on the test split
# and render them together with the feature names.
perm = PermutationImportance(model, random_state=1)
perm.fit(X_test, y_test)

eli5.show_weights(perm, feature_names=list(X_test.columns))

# Booklet 2 Neuronale Netze Aufgabe 2

In [None]:
# Convert the features explicitly to float32. The selected columns mix float
# features with a one-hot column, and `.values` on such a frame can yield an
# object array that TensorFlow cannot turn into a tensor.
train_features = X_train.to_numpy(dtype="float32")
test_features = X_test.to_numpy(dtype="float32")

train_dataset = tf.data.Dataset.from_tensor_slices((train_features, y_train.values))
train_dataset = train_dataset.shuffle(len(X_train)).batch(32)

# The test split is served as one single batch, so shuffling it would have no
# effect on the computed metrics.
test_dataset = tf.data.Dataset.from_tensor_slices((test_features, y_test.values))
test_dataset = test_dataset.batch(len(X_test))

In [None]:
# Small feed-forward network: one hidden ReLU layer with two units followed
# by a single linear output unit.
nn = tf.keras.Sequential()
nn.add(keras.layers.Dense(2, activation='relu'))
nn.add(keras.layers.Dense(1))

In [None]:
# The loss is computed from logits (from_logits=True), so the accuracy metric
# must threshold the raw outputs at 0.0 (= probability 0.5). The plain
# 'accuracy' string would threshold the logits at 0.5 and misreport accuracy.
nn.compile(optimizer='adam',
           loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
           metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)])

In [None]:
# Train for 10 epochs; after each epoch, one batch of the test dataset is
# used for validation.
nn.fit(
    train_dataset,
    epochs=10,
    validation_data=test_dataset,
    validation_steps=1,
)