![PythonExposed-logo](../../pythonexposed-high-resolution-logo-black.jpg "PythonExposed")

# Oefeningen voor het leren toepassen van compositie in Python voor data science

In [32]:
import pandas as pd
import numpy as np

## Enkele tips:

- Zorg ervoor dat elke klasse één verantwoordelijkheid heeft (Single Responsibility Principle).
- Gebruik compositie om klassen te combineren en functionaliteit modulair te houden.
- Voeg duidelijke documentatie en type hints toe aan je code.

### We werken op de weather dataset (weatherAUS.csv)
We willen deze dataset voorbereiden om beter bruikbaar te zijn voor Machine Learning.

### We willen daarom het volgende doen op deze dataset:
- Ontbrekende waarden (nulls) in de numerieke kolommen vervangen door het gemiddelde van de kolom
- De kolommen `WindGustDir`, `WindDir9am` en `WindDir3pm` bevatten een windrichting als kompasafkorting (bv. `W`, `NNW`). Dat is niet rechtstreeks bruikbaar voor een ML-model. Zet deze via een mapping om naar graden (bv. `W` → 270, `NE` → 45) en voeg 3 kolommen met de windrichting in graden toe.
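- De kolom `Date` bevat datuminformatie. Gebruik deze informatie om te bepalen welk seizoen het is en voeg dat toe als nieuwe kolom. Let op: de data komt uit Australië (zuidelijk halfrond), dus december valt in de zomer.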

Verken de dataset (kolommen, waarden, types, enz.). Maak vervolgens een modulaire preprocessing-pipeline die de bovenstaande taken uitvoert op de dataset. Controleer of dit goed heeft gewerkt door het aantal nullwaarden vóór en na de transformatie te vergelijken.

In [51]:
# Read the dataset (the path is relative to the notebook's working directory)

weather = pd.read_csv('DATA/weatherAUS.csv')

In [52]:
weather.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [7]:
# Number of null values per column in the raw dataset
weather.isnull().sum()

Date                 0
Location             0
MinTemp           1485
MaxTemp           1261
Rainfall          3261
Evaporation      62790
Sunshine         69835
WindGustDir      10326
WindGustSpeed    10263
WindDir9am       10566
WindDir3pm        4228
WindSpeed9am      1767
WindSpeed3pm      3062
Humidity9am       2654
Humidity3pm       4507
Pressure9am      15065
Pressure3pm      15028
Cloud9am         55888
Cloud3pm         59358
Temp9am           1767
Temp3pm           3609
RainToday         3261
RainTomorrow      3267
dtype: int64

### Stap 1: Definieer de klassen voor elke cleaning-stap

In [2]:
from abc import ABC, abstractmethod
import pandas as pd

class DataProcessor(ABC):
    """Common interface for a single preprocessing step.

    Concrete steps must implement ``fit`` and ``transform``;
    ``fit_transform`` is provided here and runs the two in sequence on the
    same frame.
    """

    @abstractmethod
    def fit(self, X: pd.DataFrame):
        """Learn whatever the step needs from the dataset ``X``.

        Abstract: the base implementation has no body and returns ``None``.
        """

    @abstractmethod
    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Apply the step's transformation to ``X`` and return the result.

        Abstract: the base implementation has no body and returns ``None``.
        """

    def fit_transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Fit the step on ``X`` and return ``transform(X)``.

        The return value of ``fit`` is ignored, so implementations are free
        to return ``None`` or ``self``.
        """
        self.fit(X)
        transformed = self.transform(X)
        return transformed

In [3]:
# Step that fills missing values with the column mean

class Opvullen(DataProcessor):
    """Fill missing values in numeric columns with that column's mean.

    The means are learned in ``fit`` and reused in ``transform``, so a step
    fitted on one frame can be applied to another frame with the same
    columns. Non-numeric columns are left untouched because a mean is not
    defined for them.
    """

    def __init__(self) -> None:
        # Per-column means learned by fit(); None until fit() has been called.
        self.means_ = None

    def fit(self, X: pd.DataFrame):
        """Compute and store the mean of every numeric column of ``X``.

        Columns whose mean is NaN (every value missing) are dropped from the
        stored means, since there is nothing to fill them with.

        Returns:
            ``self``, so calls can be chained.
        """
        numeric = X.select_dtypes(include="number")
        self.means_ = numeric.mean().dropna()
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``X`` with numeric nulls replaced by the fitted means.

        Only columns that were seen during ``fit`` and are present in ``X``
        are filled; ``X`` itself is not modified.

        Raises:
            RuntimeError: if ``fit`` has not been called yet.
        """
        if self.means_ is None:
            raise RuntimeError("Opvullen must be fitted before transform() is called.")

        fill_values = {
            column: mean
            for column, mean in self.means_.items()
            if column in X.columns
        }
        if not fill_values:
            return X.copy()
        return X.fillna(value=fill_values)

In [4]:
# Step that converts wind directions to degrees

class ZetGradenOm(DataProcessor):
    """Exercise placeholder: convert compass wind directions to degrees.

    Not implemented yet. ``fit`` and ``transform`` are still abstract here,
    so this class cannot be instantiated until both are defined.

    TODO(exercise): per the assignment text, add one degrees column for each
    wind-direction column. The commented-out usage further down passes the
    list of columns to the constructor, and the saved output names the new
    columns ``<column>_converted``.
    """

In [5]:
# Step that derives the season from a date column

class SeizoenBepaler(DataProcessor):
    """Exercise placeholder: derive the season from a date column.

    Not implemented yet. ``fit`` and ``transform`` are still abstract here,
    so this class cannot be instantiated until both are defined.

    TODO(exercise): per the assignment text, use the ``Date`` column to
    determine the season. The commented-out usage further down passes the
    list of date columns to the constructor, and the saved output shows a
    ``Date_season`` column with Dutch season names (e.g. ``Zomer``).
    """

### Stap 2: Maak een pipeline-klasse om de stappen te combineren

In [6]:
class DataCleaningPipeline:
    """Run a sequence of preprocessing steps one after another.

    The pipeline is composed of step objects instead of inheriting from
    them. Each step only needs a ``fit(X)`` and a ``transform(X)`` method;
    the output of one step is the input of the next.

    Args:
        steps: iterable of step objects, applied in the given order.
            ``None`` (the default) gives an empty pipeline that returns its
            input unchanged.
    """

    def __init__(self, steps=None):
        # Copy into a list so later changes to the caller's iterable do not
        # silently alter the pipeline.
        self.steps = list(steps) if steps is not None else []

    def fit(self, X: pd.DataFrame):
        """Fit every step in order and return ``self``.

        A step is fitted on the output of the steps before it, so fitting
        has to run the transformations as well; the transformed frame is
        discarded.
        """
        self.fit_transform(X)
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Apply the already-fitted steps to ``X`` in order and return the result."""
        result = X
        for step in self.steps:
            result = step.transform(result)
        return result

    def fit_transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Fit each step on the output of the previous one and return the final frame."""
        result = X
        for step in self.steps:
            step.fit(result)
            result = step.transform(result)
        return result

### Stap 3: pas de pipeline toe op de data!

In [9]:
# NOTE: this cell is left commented out. Uncommenting it requires Opvullen,
# ZetGradenOm, SeizoenBepaler and DataCleaningPipeline to be implemented so
# that they accept the arguments used here. The check cell below reads
# `cleaned_data`, which is only assigned in this cell.

# # the columns that hold a wind direction:
# winddirection_columns = ['WindGustDir', 'WindDir9am', 'WindDir3pm']

# datecolumns = ['Date']

# # Build a pipeline
# pipeline = DataCleaningPipeline([
#     Opvullen(),
#     ZetGradenOm(winddirection_columns),
#     SeizoenBepaler(datecolumns)
# ])

# # Apply the pipeline
# cleaned_data = pipeline.fit_transform(weather)

### Stap 4: Check...

In [57]:
# Number of null values per column in the transformed dataset
# NOTE(review): `cleaned_data` is only assigned in the commented-out pipeline
# cell above, so that cell has to be enabled and run first.
cleaned_data.isnull().sum()

Date                         0
Location                     0
MinTemp                      0
MaxTemp                      0
Rainfall                     0
Evaporation                  0
Sunshine                     0
WindGustDir              10326
WindGustSpeed                0
WindDir9am               10566
WindDir3pm                4228
WindSpeed9am                 0
WindSpeed3pm                 0
Humidity9am                  0
Humidity3pm                  0
Pressure9am                  0
Pressure3pm                  0
Cloud9am                     0
Cloud3pm                     0
Temp9am                      0
Temp3pm                      0
RainToday                 3261
RainTomorrow              3267
WindGustDir_converted        0
WindDir9am_converted         0
WindDir3pm_converted         0
Date_season                 97
dtype: int64

In [58]:
# windrichtingen en seizoen testen

In [56]:
cleaned_data.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,WindGustDir_converted,WindDir9am_converted,WindDir3pm_converted,Date_season
0,2008-12-01,Albury,13.4,22.9,0.6,5.468232,7.611178,W,44.0,W,...,8.0,4.50993,16.9,21.8,No,No,270.0,270.0,292.5,Zomer
1,2008-12-02,Albury,7.4,25.1,0.0,5.468232,7.611178,WNW,44.0,NNW,...,4.447461,4.50993,17.2,24.3,No,No,292.5,337.5,247.5,Zomer
2,2008-12-03,Albury,12.9,25.7,0.0,5.468232,7.611178,WSW,46.0,W,...,4.447461,2.0,21.0,23.2,No,No,247.5,270.0,247.5,Zomer
3,2008-12-04,Albury,9.2,28.0,0.0,5.468232,7.611178,NE,24.0,SE,...,4.447461,4.50993,18.1,26.5,No,No,45.0,135.0,90.0,Zomer
4,2008-12-05,Albury,17.5,32.3,1.0,5.468232,7.611178,W,41.0,ENE,...,7.0,8.0,17.8,29.7,No,No,270.0,67.5,315.0,Zomer


## Oefening 2: Featurization Workflow

Ontwerp een featurization pipeline voor data science-taken. Voeg de volgende stappen toe:

- **FeatureKwadraten:** Voeg nieuwe kolommen toe als kwadraten van bestaande features.
- **FeatureInteractie:** Voeg kolommen toe die het product van twee features zijn.

**Vereisten:**

1. Maak aparte klassen voor elke featurization-methode.
2. Combineer deze in een pipeline.
3. Pas dit toe op een dataset en print de getransformeerde features.

In [33]:
import pandas as pd
import numpy as np

# Stap 1: Definieer de klassen voor elke featurization-stap

class FeatureKwadraten:
    """Add a squared copy of each selected feature.

    For every selected column ``c``, ``transform`` appends a column named
    ``c^2`` holding the element-wise square of ``c``.

    Args:
        columns: names of the columns to square. ``None`` (the default)
            selects every numeric column present when ``fit`` is called.
    """

    def __init__(self, columns=None):
        self.columns = list(columns) if columns is not None else None
        # Columns actually used by transform(); set by fit().
        self.columns_ = None

    def fit(self, X: pd.DataFrame):
        """Record which columns of ``X`` will be squared and return ``self``."""
        if self.columns is None:
            self.columns_ = list(X.select_dtypes(include="number").columns)
        else:
            self.columns_ = list(self.columns)
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return a new frame: ``X`` plus one ``<column>^2`` column per fitted column.

        ``X`` itself is not modified.

        Raises:
            RuntimeError: if ``fit`` has not been called yet.
        """
        if self.columns_ is None:
            raise RuntimeError("FeatureKwadraten must be fitted before transform() is called.")
        squares = {f"{column}^2": X[column] ** 2 for column in self.columns_}
        return X.assign(**squares)

    def fit_transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Fit on ``X`` and return the transformed frame."""
        return self.fit(X).transform(X)


class FeatureInteractie:
    """Add the pairwise product of the selected features.

    For every unordered pair of selected columns ``(a, b)``, with ``a``
    appearing before ``b``, ``transform`` appends a column named ``a_x_b``
    holding the element-wise product ``a * b``.

    Args:
        columns: names of the columns to combine. ``None`` (the default)
            selects every numeric column present when ``fit`` is called.
    """

    def __init__(self, columns=None):
        self.columns = list(columns) if columns is not None else None
        # Columns actually used by transform(); set by fit().
        self.columns_ = None

    def fit(self, X: pd.DataFrame):
        """Record which columns of ``X`` will be combined and return ``self``."""
        if self.columns is None:
            self.columns_ = list(X.select_dtypes(include="number").columns)
        else:
            self.columns_ = list(self.columns)
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return a new frame: ``X`` plus one ``<a>_x_<b>`` column per column pair.

        ``X`` itself is not modified.

        Raises:
            RuntimeError: if ``fit`` has not been called yet.
        """
        if self.columns_ is None:
            raise RuntimeError("FeatureInteractie must be fitted before transform() is called.")

        products = {}
        # Pair each column only with the columns after it, so every pair
        # appears once and no column is multiplied by itself.
        for position, left in enumerate(self.columns_):
            for right in self.columns_[position + 1:]:
                products[f"{left}_x_{right}"] = X[left] * X[right]
        return X.assign(**products)

    def fit_transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Fit on ``X`` and return the transformed frame."""
        return self.fit(X).transform(X)


# Stap 2: Maak een pipeline-klasse om de stappen te combineren

class FeaturizationPipeline:
    """Run a sequence of featurization steps one after another.

    The pipeline is composed of step objects. Each step only needs a
    ``fit(X)`` and a ``transform(X)`` method; the output of one step is the
    input of the next, so a later step also sees the columns added by the
    earlier ones.

    Args:
        steps: iterable of step objects, applied in the given order.
            ``None`` (the default) gives an empty pipeline that returns its
            input unchanged.
    """

    def __init__(self, steps=None):
        # Copy into a list so later changes to the caller's iterable do not
        # silently alter the pipeline.
        self.steps = list(steps) if steps is not None else []

    def fit(self, X: pd.DataFrame):
        """Fit every step in order and return ``self``.

        A step is fitted on the output of the steps before it, so fitting
        has to run the transformations as well; the transformed frame is
        discarded.
        """
        self.fit_transform(X)
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Apply the already-fitted steps to ``X`` in order and return the result."""
        result = X
        for step in self.steps:
            result = step.transform(result)
        return result

    def fit_transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Fit each step on the output of the previous one and return the final frame."""
        result = X
        for step in self.steps:
            step.fit(result)
            result = step.transform(result)
        return result


# Step 3: apply the pipeline to a dataset

# Example dataset: three columns of 100 standard-normal draws each.
# Seeding NumPy's global generator first makes the draws repeatable.
np.random.seed(42)
data = pd.DataFrame(
    {name: np.random.randn(100) for name in ("feature1", "feature2", "feature3")}
)

# Assemble the featurization pipeline from its two steps
pipeline = FeaturizationPipeline([FeatureKwadraten(), FeatureInteractie()])

# Run the pipeline on the example data
transformed_data = pipeline.fit_transform(data)

# Show the first rows of the result
transformed_data.head()

Unnamed: 0,feature1,feature2,feature3,feature1^2,feature2^2,feature3^2,feature1_x_feature2,feature1_x_feature3,feature1_x_feature1^2,feature1_x_feature2^2,...,feature2_x_feature3,feature2_x_feature1^2,feature2_x_feature2^2,feature2_x_feature3^2,feature3_x_feature1^2,feature3_x_feature2^2,feature3_x_feature3^2,feature1^2_x_feature2^2,feature1^2_x_feature3^2,feature2^2_x_feature3^2
0,0.496714,-1.415371,0.357787,0.246725,2.003274,0.128012,-0.703035,0.177718,0.122552,0.995055,...,-0.506402,-0.349207,-2.835376,-0.181184,0.088275,0.716746,0.045801,0.494258,0.031584,0.256443
1,-0.138264,-0.420645,0.560785,0.019117,0.176942,0.314479,0.05816,-0.077536,-0.002643,-0.024465,...,-0.235891,-0.008041,-0.07443,-0.132284,0.010721,0.099227,0.176355,0.003383,0.006012,0.055645
2,0.647689,-0.342715,1.083051,0.4195,0.117453,1.173,-0.221972,0.70148,0.271706,0.076073,...,-0.371177,-0.143769,-0.040253,-0.402004,0.45434,0.127208,1.270419,0.049272,0.492074,0.137773
3,1.52303,-0.802277,1.053802,2.31962,0.643649,1.110499,-1.221892,1.604972,3.53285,0.980296,...,-0.845441,-1.860978,-0.516385,-0.890928,2.44442,0.678278,1.170246,1.493021,2.575935,0.714771
4,-0.234153,-0.161286,-1.377669,0.054828,0.026013,1.897973,0.037766,0.322586,-0.012838,-0.006091,...,0.222198,-0.008843,-0.004196,-0.306116,-0.075535,-0.035837,-2.614779,0.001426,0.104062,0.049372


In [34]:
transformed_data.describe()

Unnamed: 0,feature1,feature2,feature3,feature1^2,feature2^2,feature3^2,feature1_x_feature2,feature1_x_feature3,feature1_x_feature1^2,feature1_x_feature2^2,...,feature2_x_feature3,feature2_x_feature1^2,feature2_x_feature2^2,feature2_x_feature3^2,feature3_x_feature1^2,feature3_x_feature2^2,feature3_x_feature3^2,feature1^2_x_feature2^2,feature1^2_x_feature3^2,feature2^2_x_feature3^2
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,-0.103847,0.022305,0.064896,0.827306,0.900887,1.168124,-0.119289,0.179304,-0.384815,-0.257251,...,-0.036053,0.284309,0.385906,0.057659,-0.106661,0.062202,0.446724,1.024781,0.9085893,0.865496
std,0.908168,0.953669,1.084283,1.154152,1.282345,2.048792,1.010326,0.9409,2.696539,2.116651,...,0.934305,1.654462,3.246662,1.743655,1.323124,1.330288,7.2343,3.760906,2.085153,1.895866
min,-2.619745,-1.918771,-3.241267,2.6e-05,0.000169,0.000105,-5.406524,-2.208408,-17.979479,-14.706659,...,-3.755318,-2.845628,-7.064307,-4.22613,-6.736157,-5.145396,-34.052152,8.859419e-08,3.783398e-07,7e-06
25%,-0.600906,-0.805661,-0.655444,0.090349,0.074929,0.196142,-0.360505,-0.246522,-0.21698,-0.222274,...,-0.427975,-0.115601,-0.523029,-0.323535,-0.224137,-0.198147,-0.281609,0.01791202,0.02453288,0.013764
50%,-0.126956,0.084107,0.097696,0.295362,0.510047,0.466403,-0.032338,0.031835,-0.002095,-0.006028,...,-0.05423,0.00533,0.000656,0.004126,0.002319,0.002937,0.001006,0.1027025,0.1581122,0.128714
75%,0.405952,0.53817,0.704437,1.076597,1.186619,1.211737,0.24926,0.441582,0.07041,0.187827,...,0.249037,0.254307,0.15717,0.252322,0.125979,0.230162,0.351695,0.5020486,0.5749278,0.859273
max,1.852278,2.720169,3.852731,6.863064,7.39932,14.84354,2.978689,3.585927,6.355045,5.618361,...,2.385298,10.745838,20.127403,12.17199,4.972179,5.224889,57.188174,29.2305,12.85888,14.102413
