# Model Performance Transformations

Let's practice some basic data transformations to improve ML model performance.

In [None]:
# Imports

import numpy as np
import pandas as pd
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

In [None]:
# Categorical data analyser
def cat_var(df, cols):
    '''
    Summarise the unique values taken by the selected columns of a dataframe.

    Return: a Pandas dataframe object, sorted by "number_of_possible_values" in
    descending order and re-indexed from 0, with the following columns:
        - "categorical_variable" => every variable name included in `cols` (string).
        - "number_of_possible_values" => the number of unique values a given variable takes (integer).
        - "values" => the unique values of the variable, exactly as returned by `Series.unique()`.

    Input parameters:
        - df -> Pandas dataframe object: a dataframe with categorical variables.
        - cols -> list object: a list with the name (string) of every variable to analyse.

    Notes:
        - An empty `cols` returns an empty dataframe that still has the three columns above.
        - A name in `cols` that is not a column of `df` raises KeyError (from `df[col]`).
    '''
    summary_columns = ["categorical_variable", "number_of_possible_values", "values"]
    cat_list = []
    for col in cols:
        cat = df[col].unique()
        cat_list.append({"categorical_variable": col,
                         "number_of_possible_values": len(cat),
                         "values": cat})
    # Passing `columns` explicitly keeps the schema when `cols` is empty; without it
    # the frame has no columns and sorting by "number_of_possible_values" raises KeyError.
    # A separate name is used so the `df` argument is not rebound to the result.
    summary = pd.DataFrame(cat_list, columns=summary_columns)
    summary = summary.sort_values(by="number_of_possible_values", ascending=False)
    return summary.reset_index(drop=True)

In [None]:
# Netflix dataset

# Load the Netflix titles CSV; the path is relative to the notebook's working directory.
netflix = pd.read_csv('../data/netflix_titles.csv')
# Take the column names from the frame loaded above (`netflix`); the previous
# reference to `netflix_titles` was an undefined name and raised NameError.
col_netflix = list(netflix.columns)
print(netflix.shape)
netflix.head()

In [None]:
# Summarise the unique values of every column named in col_netflix.
cat_netflix = cat_var(netflix, col_netflix)
cat_netflix  # bare expression: shown as the cell output

In [None]:
# Summary statistics of the Netflix frame, as produced by DataFrame.describe().
num_netflix = netflix.describe()
num_netflix  # bare expression: shown as the cell output

In [None]:
# Mushrooms dataset

# Read the mushrooms CSV (path relative to the notebook's working directory)
# and keep its column names in a plain list.
mushrooms = pd.read_csv('../data/mushrooms.csv')
col_mushrooms = mushrooms.columns.tolist()
print(mushrooms.shape)
mushrooms.head()

In [None]:
# Summarise the unique values of every column named in col_mushrooms.
cat_mushrooms = cat_var(mushrooms, col_mushrooms)
cat_mushrooms  # bare expression: shown as the cell output

In [None]:
# Summary statistics of the mushrooms frame, as produced by DataFrame.describe().
num_mushrooms = mushrooms.describe()
num_mushrooms  # bare expression: shown as the cell output

In [None]:
# Weather dataset

# Read the weatherAUS CSV (path relative to the notebook's working directory)
# and keep its column names in a plain list.
weather = pd.read_csv('../data/weatherAUS.csv')
col_weather = weather.columns.tolist()
print(weather.shape)
weather.head()

In [None]:
# Summarise the unique values of every column named in col_weather.
cat_weather = cat_var(weather, col_weather)
cat_weather  # bare expression: shown as the cell output

In [None]:
# Summary statistics of the weather frame, as produced by DataFrame.describe().
num_weather = weather.describe()
num_weather  # bare expression: shown as the cell output

---

## Outliers

Search for outliers within categorical and numerical data. You may use any tool you find convenient. After the analysis you must decide what to do about the outliers.

![Image](../images/outliers.png)

In [None]:
# Netflix outlier analysis










In [None]:
# Mushrooms outlier analysis










In [None]:
# Weather outlier analysis










---

## Scaling

Some ML algorithms perform poorly when the scale of the data differs greatly between features. In those cases, scaling the data is your best option.

- [RobustScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html#sklearn.preprocessing.RobustScaler)

- [StandardScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html#sklearn.preprocessing.StandardScaler)

![Image](../images/scaling.png)

In [None]:
# Netflix numerical data scaling










In [None]:
# Mushrooms numerical data scaling










In [None]:
# Weather numerical data scaling










---

## Encoding

Most ML algorithms do not support categorical data directly. Therefore, you need to find a way to transform categorical data into numerical data. You can use either __One Hot Encoding__ or __Label Encoding__... choose wisely!

- [OneHotEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html#sklearn.preprocessing.OneHotEncoder)

- [LabelEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html#sklearn.preprocessing.LabelEncoder)

![Image](../images/encoding.png)

In [None]:
# Netflix categorical data encoding










In [None]:
# Mushrooms categorical data encoding










In [None]:
# Weather categorical data encoding








