In [1]:
! pip install tensorflow_data_validation



In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_data_validation as tfdv

In [16]:
# Training split, read from a gist URL pinned to a specific revision.
train = pd.read_csv(
    filepath_or_buffer="https://gist.githubusercontent.com/sdukshis/c4fa70ed0bd9468f6401ab8dc1e36f8d/raw/e62762bbb28d67b72ad1c4819b65b2fc67ae4b12/train.csv"
)
# Preview the first rows.
train.head(n=20)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [17]:
# Test split, read from a gist URL pinned to a specific revision.
test = pd.read_csv(
    filepath_or_buffer="https://gist.githubusercontent.com/sdukshis/b69647ddf6b694edda41668de2edbe41/raw/69b835e48670f8648925f8d1312b9f278b56deea/test.csv"
)
# Preview the first five rows (the default of head()).
test.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


## Быстрая проверка статистических свойств обучающего (train) и тестового (test) наборов данных

In [5]:
train = tfdv.generate_statistics_from_dataframe(train)

In [6]:
tfdv.visualize_statistics(train)

In [7]:
test = tfdv.generate_statistics_from_dataframe(test)

In [8]:
tfdv.visualize_statistics(lhs_statistics = test,
                          rhs_statistics = train,
                          lhs_name='TEST_DATASET', rhs_name='TRAIN_DATASET')

# Валидация данных

Пример на основе набора данных о пассажирах «Титаника»: https://www.kaggle.com/c/titanic

## Построение схемы данных

Разделим признаки на следующие категории:
* числовые
* категориальные
* текстовые


In [9]:
# Column names grouped by the kind of feature they hold.

# Numeric features.
num_feat = ["Age", "SibSp", "Parch", "Fare"]

# Categorical features.
cat_feat = ["Pclass", "Sex", "Embarked"]

# Text features.
txt_feat = ["Name", "Ticket", "Cabin"]



Создадим класс для работы со схемой

In [22]:
# Load fresh copies of both splits so the cells below work with raw DataFrames.
train = pd.read_csv(
    filepath_or_buffer="https://gist.githubusercontent.com/sdukshis/c4fa70ed0bd9468f6401ab8dc1e36f8d/raw/e62762bbb28d67b72ad1c4819b65b2fc67ae4b12/train.csv"
)
test = pd.read_csv(
    filepath_or_buffer="https://gist.githubusercontent.com/sdukshis/b69647ddf6b694edda41668de2edbe41/raw/69b835e48670f8648925f8d1312b9f278b56deea/test.csv"
)

In [21]:
from typing import List
from sklearn.preprocessing import LabelEncoder, MinMaxScaler


class Schema:
    """Minimal data schema learned from a reference DataFrame.

    The schema is described by three groups of column names (numeric,
    categorical, text). ``fit`` records, for a reference frame, the set of
    values seen in each categorical column and the minimum/maximum of each
    numeric column. ``validate`` then checks another frame against them.
    """

    def __init__(self, num_feat: List[str], cat_feat: List[str], txt_feat: List[str]):
        """Store the feature groups.

        Args:
            num_feat: names of the numeric columns.
            cat_feat: names of the categorical columns.
            txt_feat: names of the text columns (only their presence is checked).
        """
        self._num_feat = num_feat
        self._cat_feat = cat_feat
        self._txt_feat = txt_feat
        self._all_feat = set(self._num_feat + self._cat_feat + self._txt_feat)

    def fit(self, df: pd.DataFrame) -> "Schema":
        """Learn categorical value sets and numeric ranges from ``df``.

        Returns:
            The schema itself, so the call can be chained.
        """
        # Missing values are tracked with a flag instead of being stored in the
        # value set: NaN != NaN, so NaN objects coming from two different
        # frames do not cancel out in a set difference.
        self._cat_classes_ = {}
        self._cat_has_missing_ = {}
        for feat in self._cat_feat:
            column = df[feat]
            self._cat_classes_[feat] = set(column.dropna().unique())
            self._cat_has_missing_[feat] = bool(column.isna().any())

        # Per-column bounds; pandas skips NaN when computing min/max.
        numeric = df[self._num_feat]
        self._num_min_ = numeric.min()
        self._num_max_ = numeric.max()

        return self

    def validate(self, df: pd.DataFrame) -> None:
        """Check ``df`` against the fitted schema.

        Raises:
            ValueError: if a schema column is missing, a categorical column
                holds a value not seen during ``fit``, or a numeric column has
                a value outside the fitted [min, max] range.
            AttributeError: if the schema has not been fitted.
        """
        self._validate_feat(df)
        self._check_fitted()
        self._validate_cat(df)
        self._validate_num(df)

    def _check_fitted(self) -> None:
        """Fail with a clear message when ``fit`` has not been called yet."""
        if not hasattr(self, "_cat_classes_"):
            raise AttributeError("Schema is not fitted yet; call fit() before validate()")

    def _validate_feat(self, df: pd.DataFrame) -> None:
        """Raise ValueError when any schema column is absent from ``df``."""
        missing_feat = self._all_feat - set(df.columns)
        if missing_feat:
            raise ValueError(f"Missing {missing_feat} features")

    def _validate_cat(self, df: pd.DataFrame) -> None:
        """Raise ValueError when a categorical column has unseen values."""
        new_values = dict()
        for feat, classes_ in self._cat_classes_.items():
            column = df[feat]
            new_cat_values = set(column.dropna().unique()) - classes_
            # A missing value is "new" only if the reference frame had none.
            if column.isna().any() and not self._cat_has_missing_[feat]:
                new_cat_values.add(float("nan"))
            if new_cat_values:
                new_values[feat] = new_cat_values

        if new_values:
            raise ValueError(f"Following categorical features includes a new values: {new_values}")

    def _validate_num(self, df: pd.DataFrame) -> None:
        """Raise ValueError when a numeric column leaves the fitted range."""
        # Comparisons with NaN are False, so missing values are never reported.
        out_of_range = [
            feat
            for feat in self._num_feat
            if ((df[feat] < self._num_min_[feat]) | (df[feat] > self._num_max_[feat])).any()
        ]

        if out_of_range:
            raise ValueError(f"Following numerical features has out of range values: {out_of_range}")


# Build the schema from the three feature groups and fit it on the training frame.
titanic_schema = (
    Schema(num_feat=num_feat, cat_feat=cat_feat, txt_feat=txt_feat)
    .fit(df=train)
)

In [23]:
# Demonstration of a failing check: two schema columns are dropped on purpose.
# The error is caught and printed so a full "Run All" continues past this cell.
try:
    titanic_schema.validate(test.drop(["Sex", "Name"], axis=1))
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: Missing {'Name', 'Sex'} features

In [24]:
# Validate the whole test frame against the schema fitted on train. The error
# is caught and printed so a full "Run All" continues past this cell.
try:
    titanic_schema.validate(test)
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: Following numerical features has out of range values: ['Age', 'Parch']

In [25]:
# Take a copy of the first test row and overwrite its "Embarked" value with "X".
example_input = test.iloc[0:1, :].copy()
example_input["Embarked"] = "X"

# Demonstration of the categorical check; the error is caught and printed so a
# full "Run All" continues past this cell.
try:
    titanic_schema.validate(example_input)
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: Following categorical features includes a new values: {'Embarked': {'X'}}

In [26]:
# Validate the whole test frame once more; the error is caught and printed so
# a full "Run All" continues into the outlier-handling section.
try:
    titanic_schema.validate(test)
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: Following numerical features has out of range values: ['Age', 'Parch']

## Выбросы в числовых признаках

In [27]:
from typing import Tuple

class NumericFeatureOutlierTransformer:
    """Handle outliers of a single numeric feature.

    ``fit`` computes two quantiles of the data (NaN ignored) and derives the
    limits ``lower - 1.5 * range`` and ``upper + 1.5 * range``, where ``range``
    is the distance between the two quantiles. ``transform`` then treats
    values outside those limits according to ``strategy``:

    * ``"clip"`` - replace them with the nearest limit;
    * ``"nan"``  - replace them with NaN.
    """

    def __init__(self, quan_range: Tuple[float, float] = (0.25, 0.75), strategy: str = "clip"):
        """Configure the transformer.

        Args:
            quan_range: (lower, upper) quantile levels used by ``fit``.
            strategy: ``"clip"`` or ``"nan"``.

        Raises:
            KeyError: if ``strategy`` is not one of the supported names.
        """
        strategies = {
            "clip": self._clip,
            "nan": self._nan,
        }
        if strategy not in strategies:
            raise KeyError(
                f"Unknown strategy {strategy!r}; expected one of {sorted(strategies)}"
            )

        self._fitted = False
        self._quan_range = quan_range
        self._apply = strategies[strategy]

    def fit(self, x: np.ndarray) -> "NumericFeatureOutlierTransformer":
        """Compute the outlier limits from ``x`` and return ``self``."""
        lower_quan = np.nanquantile(x, self._quan_range[0])
        upper_quan = np.nanquantile(x, self._quan_range[1])

        iqr = upper_quan - lower_quan
        self._lower_extreme = lower_quan - 1.5*iqr
        self._upper_extreme = upper_quan + 1.5*iqr

        self._fitted = True
        return self

    def transform(self, x: np.ndarray) -> np.ndarray:
        """Apply the configured strategy to ``x``.

        Raises:
            AttributeError: if called before ``fit``.
        """
        if not self._fitted:
            raise AttributeError(
                "NumericFeatureOutlierTransformer is not fitted yet; call fit() before transform()"
            )
        return self._apply(x)

    def _clip(self, x: np.ndarray) -> np.ndarray:
        """Limit values to the fitted [lower, upper] interval."""
        return x.clip(self._lower_extreme, self._upper_extreme)

    def _nan(self, x: np.ndarray) -> np.ndarray:
        """Replace values outside the fitted limits with NaN."""
        if isinstance(x, (pd.Series, pd.DataFrame)):
            # mask() keeps the index and turns flagged entries into NaN.
            outside = (x < self._lower_extreme) | (x > self._upper_extreme)
            return x.mask(outside)

        # A float copy is needed because integer arrays cannot hold NaN.
        values = np.array(x, dtype=float)
        outside = (values < self._lower_extreme) | (values > self._upper_extreme)
        values[outside] = np.nan
        return values

In [28]:
# Outlier limits for "Age", fitted on the training frame with the default settings.
age_transformer = NumericFeatureOutlierTransformer()
age_transformer = age_transformer.fit(train["Age"])

In [29]:
# Apply the train-fitted transformer to the test ages and store the result in
# its own column, next to the original "Age".
clipped_age = age_transformer.transform(test["Age"])
test["Age_clipped"] = clipped_age

In [30]:
# Rows where the age is present and differs from its clipped counterpart.
age_known = test["Age"].notna()
age_changed = test["Age_clipped"] != test["Age"]
test.loc[age_known & age_changed, ["Age", "Age_clipped"]]

Unnamed: 0,Age,Age_clipped
81,67.0,64.8125
96,76.0,64.8125


## Заполнение пропусков

In [31]:
# Number of missing values in every column of the training frame.
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [33]:
from sklearn.impute import SimpleImputer

# Mean-strategy imputer for the single "Age" column, fitted on the training frame.
age_simple_imputer = SimpleImputer(strategy="mean")
age_simple_imputer = age_simple_imputer.fit(train[["Age"]])


In [34]:
# Run the fitted imputer over "Age" and keep the result in a separate column,
# leaving the original "Age" column as it is.
age_filled = age_simple_imputer.transform(train[["Age"]])
train["Age_simple_imputed"] = age_filled

Заполняем пропущенный возраст в зависимости от обращения

In [36]:
# Extract the title: the first run of letters that is directly followed by a
# period in "Name".
# * The raw string keeps `\.` a regex escape; in a plain literal it is an
#   invalid Python escape sequence and triggers a warning.
# * expand=False returns a Series for the single capture group, which is what
#   a column assignment expects.
train['Title'] = train['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)
train['Title'].value_counts()

Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Mlle          2
Major         2
Col           2
Countess      1
Capt          1
Ms            1
Sir           1
Lady          1
Mme           1
Don           1
Jonkheer      1
Name: Title, dtype: int64

Заменим редкие обращения

In [37]:
# Rare titles and the common title each one is folded into.
mapping = {
    'Mlle': 'Miss',
    'Major': 'Mr',
    'Col': 'Mr',
    'Sir': 'Mr',
    'Don': 'Mr',
    'Mme': 'Mrs',
    'Jonkheer': 'Mr',
    'Lady': 'Mrs',
    'Capt': 'Mr',
    'Countess': 'Mrs',
    'Ms': 'Miss',
    'Dona': 'Mrs',
}
# Replace the rare titles in the "Title" column only.
train['Title'] = train['Title'].replace(mapping)
train['Title'].value_counts()

Mr        525
Miss      185
Mrs       128
Master     40
Dr          7
Rev         6
Name: Title, dtype: int64

In [38]:
# Median age of every title group, collected into a {title: median_age} dict.
median_age_by_title = train.groupby('Title')['Age'].median()
title_ages = {
    title: median_age_by_title[title]
    for title in median_age_by_title.keys()
}
title_ages

{'Dr': 46.5, 'Master': 3.5, 'Miss': 21.0, 'Mr': 30.0, 'Mrs': 35.0, 'Rev': 46.5}

In [39]:
# Look up the median age for each passenger's title. `map` does the dictionary
# lookup for the whole column at once; a title with no entry in `title_ages`
# yields NaN, where the per-row lambda lookup raised KeyError.
train['age_med'] = train['Title'].map(title_ages)

# Keep known ages; use the title median only where "Age" is missing.
train["Age_imputed"] = train['Age'].fillna(train['age_med'])