# Data cleaning

## 1.1 Redundant Features

In [None]:
# Report how many distinct values each column contains
from numpy import unique

# walk the columns of the 2-D array by position and print "index count"
for column_index in range(data.shape[1]):
    distinct_values = unique(data[:, column_index])
    print(column_index, len(distinct_values))

# per-column distinct-value counts for the DataFrame, computed by pandas
counts = MyDataFrame.nunique()

In [None]:
# Drop the columns whose variance is zero (constant features)
from sklearn.feature_selection import VarianceThreshold

# selector that keeps only features with variance above the threshold
transform = VarianceThreshold(threshold=0)

# learn the per-feature variances from the input data
transform.fit(MyData)

# keep only the columns that passed the threshold
MyData_selected = transform.transform(MyData)

## 1.2 Redundant Samples

In [None]:
# Detect duplicate rows (this cell only reports them; nothing is dropped here)

# boolean flag per row: True when the row repeats an earlier one
dups = MyDataFrame.duplicated()

# report whether at least one duplicate exists
has_duplicates = dups.any()
print(has_duplicates)

# show the rows that were flagged as duplicates
duplicate_rows = MyDataFrame[dups]
print(duplicate_rows)

## 2. Missing values

## 2.1 Missing Values: Imputation using mean, median, most_frequent, constant

In [1]:
# Count the missing (NaN) values in the DataFrame.
# DataFrame.isna() takes no arguments, so the axis is given to sum() instead.
# NOTE: by default a notebook cell echoes only its last expression; run the
# two statements in separate cells (or wrap them in print) to see both.

# count the number of nan values for each column (sum down the rows)
MyDataFrame.isna().sum(axis=0)

# count the number of nan values for each row (sum across the columns)
MyDataFrame.isna().sum(axis=1)


NameError: name 'MyDataFrame' is not defined

## 2.2 Missing Values: Imputation. KNNImputer

In [None]:
# SimpleImputer strategies: 'mean', 'median', 'most_frequent', 'constant'
from sklearn.impute import SimpleImputer

# imputer that fills missing entries with the mean
imputer = SimpleImputer(strategy='mean')

# fit() returns the fitted imputer, so fitting and transforming X can be chained
Xtrans = imputer.fit(X).transform(X)


In [None]:
from sklearn.impute import KNNImputer

# define imputer: missing values are filled from the 5 nearest neighbours,
# weighted uniformly, using a NaN-aware Euclidean distance.
# (These keyword values match the library defaults; they are spelled out so
# they are easy to tune. The earlier bare KNNImputer() call was redundant.)
imputer = KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean')

# fit on the dataset
imputer.fit(X)

# transform the dataset
Xtrans = imputer.transform(X)


## 3. Outliers

## 3.1 Outlier identification using standard deviation

In [None]:
from numpy import mean
from numpy import std

# NOTE(review): assumes MyData is a 1-D sequence of numbers — TODO confirm
# summary statistics of the sample
data_mean, data_std = mean(MyData), std(MyData)

# define outliers: anything further than 3 standard deviations from the mean
cut_off = data_std * 3
lower, upper = data_mean - cut_off, data_mean + cut_off

# identify outliers in the same data the statistics were computed from
outliers = [x for x in MyData if x < lower or x > upper]
print('Identified outliers: %d' % len(outliers))

# remove outliers: keep only the values inside [lower, upper]
outliers_removed = [x for x in MyData if lower <= x <= upper]


## 3.2 Outlier identification using the interquartile range (IQR)

In [None]:
from numpy import percentile

# NOTE(review): assumes MyData is a 1-D sequence of numbers — TODO confirm
# calculate interquartile range (distance between the 25th and 75th percentiles)
q25, q75 = percentile(MyData, 25), percentile(MyData, 75)
IQR = q75 - q25

# calculate the outlier cutoff: 1.5 * IQR beyond either quartile
cut_off = IQR * 1.5
lower, upper = q25 - cut_off, q75 + cut_off

# identify outliers in the same data the quartiles were computed from
outliers = [x for x in MyData if x < lower or x > upper]

# remove outliers: keep only the values inside [lower, upper]
outliers_removed = [x for x in MyData if lower <= x <= upper]


## 3.3 Outlier identification: automatic detection with LocalOutlierFactor

In [None]:
from sklearn.neighbors import LocalOutlierFactor

# identify outliers in the training dataset
# (fit on the same array the mask is applied to below, so the lengths match)
lof = LocalOutlierFactor()
yp = lof.fit_predict(MyDataX_train)

# select all rows that are not outliers (fit_predict labels outliers as -1)
mask = yp != -1
X_train, y_train = MyDataX_train[mask, :], MyDatay_train[mask]


# Data Transformation

## 1. Normalization

In [None]:
from sklearn.preprocessing import MinMaxScaler

# create the scaler and fit it on the training data only
# (fit() returns the fitted scaler itself)
scaler = MinMaxScaler().fit(X_train)

# apply the same fitted scaler to the training and the test dataset
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)


## 2. Standardization

In [None]:
from sklearn.preprocessing import StandardScaler

# create the standard scaler
scaler = StandardScaler()

# learn the scaling statistics from the data ...
scaler.fit(data)

# ... then apply them to that same data
scaled = scaler.transform(data)


## 3. Robust Standardization

Robust standardization for data with outliers.

In [None]:
from sklearn.preprocessing import RobustScaler

# scaler intended for data that contains outliers
trans = RobustScaler()

# fit() returns the fitted scaler, so fitting and transforming can be chained
MyData_scaled = trans.fit(MyData).transform(MyData)


## 4. Power transform to make data more Gaussian-like

In [None]:
from sklearn.preprocessing import PowerTransformer

# estimate the power-transform parameters from the data
pt = PowerTransformer()
pt.fit(MyData)

# keep the transformed data; a bare expression in the middle of a cell is
# neither displayed nor stored, so the result was previously lost
MyData_transformed = pt.transform(MyData)

# fitted lambda parameter of the power transform for each feature
print(pt.lambdas_)

 


# Data Preparation

1. Split Data:
   - 1.1 Train and test sets
   - 1.2 k-fold cross-validation

2. Fit Data Preparation on Training Dataset. 

3. Apply Data Preparation to Train and Test Datasets. 

4. Evaluate Models.

## Data split. Training and testing data

In [None]:
from sklearn.model_selection import train_test_split

# hold out 33% of the samples as the test set; the fixed random_state
# makes the split reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=1,
)


## Data split. k-fold cross validation

In [None]:

from sklearn.model_selection import RepeatedStratifiedKFold

# stratified 10-fold cross-validation, repeated 20 times with a different
# randomization in each repetition (seeded for reproducibility)
cv = RepeatedStratifiedKFold(
    n_splits=10,
    n_repeats=20,
    random_state=1,
)

