In [None]:
%load_ext autoreload
%autoreload 2

import os
import sys

# Move up one directory at a time until none of the marker names occurs anywhere
# in the working-directory path (substring match, so a marker in any parent
# folder name also triggers another step up).
while any(marker in os.getcwd() for marker in ('exercises', 'notebooks', 'students', 'research')):
    os.chdir("..")
# Relative entry: it is resolved against the working directory reached above.
sys.path.append('src')
# Last expression of the cell: shows the directory the notebook now runs from.
os.getcwd()

In [118]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import (
    FunctionTransformer,
)
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split


from src.custom_transformers import (
    DropColumnTransformer,
    CustomImputer,
    CustomStandardScaler,
    CustomLabelEncoder,
    CustomOneHotEncoder,
)

In [None]:
# Load Titanic dataset from seaborn
# NOTE(review): load_dataset presumably needs network access or a local cache on
# first use — confirm in the seaborn documentation.
raw_data = sns.load_dataset("titanic")
# Preview the first rows of the untouched raw frame.
raw_data.head()

In [120]:
from sklearn.base import BaseEstimator, TransformerMixin
from scipy import stats

class OutlierRemoveTransformer(BaseEstimator, TransformerMixin):
    """Drop rows whose numeric values lie too far from the fitted column means.

    ``fit`` records the mean and standard deviation of every numeric column.
    ``transform`` removes each row in which at least one of those columns has
    an absolute z-score greater than ``threshold``.

    Parameters
    ----------
    threshold : float, default=3
        Largest absolute z-score a value may have before its row is dropped.

    Notes
    -----
    ``transform`` returns fewer rows than it receives. A target that is kept
    outside the frame must therefore be re-aligned by index afterwards.
    """

    def __init__(self, threshold=3):
        self.threshold = threshold

    def fit(self, X, y=None):
        """Learn per-column mean and standard deviation from ``X``.

        Parameters
        ----------
        X : pd.DataFrame
            Data the statistics are computed from.
        y : ignored
            Present only for pipeline compatibility.

        Returns
        -------
        OutlierRemoveTransformer
            The fitted transformer itself.
        """
        # "number" selects every numeric dtype (int32, float32, ...), not only the
        # 64-bit ones; boolean columns are still left out.
        self.numerical_cols = X.select_dtypes(include="number").columns
        self.means = X[self.numerical_cols].mean()
        self.stds = X[self.numerical_cols].std()
        return self

    def transform(self, X):
        """Return ``X`` without the rows flagged as outliers.

        Parameters
        ----------
        X : pd.DataFrame
            Frame containing at least the numeric columns seen during ``fit``.

        Returns
        -------
        pd.DataFrame
            The rows of ``X`` whose z-scores all stay within ``threshold``;
            the original index labels are kept.
        """
        X_num = X[self.numerical_cols]
        z_scores = ((X_num - self.means) / self.stds).abs()
        # Missing z-scores (NaN input, or 0/0 for a constant column) compare as
        # False and therefore never flag a row.
        outliers = (z_scores > self.threshold).any(axis=1)
        return X[~outliers]

In [None]:
# Cleaning steps run in the listed order; each step receives the previous step's output.
# The Drop/Imputer transformers come from src.custom_transformers (not visible here).
data_cleaning = make_pipeline(
    DropColumnTransformer(columns=["deck"]),
    CustomImputer(strategy="mean", columns=["age"]),
    CustomImputer(strategy="most_frequent", columns=["embarked"]),
    # Stateless step: removes fully duplicated rows.
    FunctionTransformer(lambda X: X.drop_duplicates(), validate=False),
    # Drops rows with any numeric value more than 3 standard deviations from its column mean.
    OutlierRemoveTransformer(threshold=3)
)

# NOTE(review): the pipeline is fitted on the complete dataset, i.e. before the
# train/test split, so the fitted statistics also see rows that later end up in
# the test set — confirm this is acceptable for the exercise.
df_cleaned = data_cleaning.fit_transform(raw_data)
df_cleaned.head()

In [122]:
# Hold out 20% of the cleaned rows as the test set; random_state makes the split repeatable.
# "alive" is the target. Both "alive" and "survived" are removed from the features
# (presumably the same outcome in two encodings — keeping either would leak the target).
X_train, X_test, y_train, y_test = train_test_split(
    df_cleaned.drop(columns=["alive", "survived"]),
    df_cleaned["alive"],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Encoding and scaling pipeline, fitted on the training split only.
# NOTE(review): the Custom* transformers are defined in src.custom_transformers,
# which is not visible here; the exact output columns should be checked there.
preprocessing_pipeline = make_pipeline(
    CustomLabelEncoder(columns=["embarked", "embark_town"]),
    CustomOneHotEncoder(columns=["sex", "who", "adult_male", "class"]),
    CustomStandardScaler(columns=["fare", "age"]),
    CustomLabelEncoder(columns=["alone"])
)
# `df` is the preprocessed training frame explored in the rest of the notebook.
df = preprocessing_pipeline.fit_transform(X_train, y_train)
df.head()

*NOTE*

You should never use the test dataset for exploratory analysis. Otherwise you would have no unseen data left to verify your observations.

# EDA 

Exploratory Data Analysis (EDA) is the initial step when working with a new dataset.

## Data Statistics

We start by computing some general properties of our dataset. It is also good practice to compute the same statistics for the `raw_data` we read from files, so that the two can be compared.

In [None]:
def sample_statistics(df: pd.DataFrame, y: pd.Series):
    """Collect summary statistics for a feature frame and its target.

    Parameters
    ----------
    df : pd.DataFrame
        Feature table to summarise.
    y : pd.Series
        Target values belonging to ``df``.

    Returns
    -------
    dict
        ``"describe"``: result of ``df.describe()``;
        ``"y"``: relative frequency of every target value, missing values included;
        ``"datatypes"``: dtype of every column of ``df``.
    """
    return {
        "describe": df.describe(),
        "y": y.value_counts(dropna=False) / len(y),
        # df.info() only prints and returns None, so nothing useful could be
        # stored under this key; df.dtypes is an actual Series of column dtypes.
        "datatypes": df.dtypes,
    }


# Compare the raw data (target column removed from the features) with the
# preprocessed training split. The returned dicts are interpolated into
# f-strings, so they are shown as plain text.
print(f'Raw: {sample_statistics(raw_data.drop(columns=["alive"]), raw_data["alive"])}')
print(f"Preprocessed {sample_statistics(df, y_train)}")

In [None]:
def variability(df):
    """Return the variance-to-mean ratio of every numeric column.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose columns are analysed. Non-numeric columns are skipped.

    Returns
    -------
    pd.DataFrame
        One row per numeric column, with the columns ``"Feature"`` (column
        name) and ``"Variability"`` (variance divided by mean).

    Notes
    -----
    The ratio is undefined for a mean of zero: such a column yields ``inf``
    (or ``NaN`` when its variance is zero too). Columns centred around zero
    therefore produce very large values that should not be over-interpreted.
    """
    # numeric_only=True keeps text/categorical columns from breaking the reduction.
    variances = df.var(numeric_only=True)
    means = df.mean(numeric_only=True)

    # Scale the variance by the mean value of each feature
    scaled_variances = variances / means

    return pd.DataFrame(
        {"Feature": scaled_variances.index, "Variability": scaled_variances.values}
    )


# Last expression of the cell: displays the variability table for the preprocessed training frame.
variability(df)

### Exercise 1

Write a function that does the following:
* detects columns containing nulls
* detects constant columns
* detects columns whose values are all unique, while ensuring that the column is not of a floating-point type.

It is a good idea to return the results as a dictionary with keys like 'null_cols', 'const_cols', etc., whose values are lists of string-typed column names.

In [None]:
def detect_columns(df: pd.DataFrame) -> dict:
    """Classify the columns of ``df`` by three data-quality checks.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to inspect.

    Returns
    -------
    dict
        ``"null_cols"``: columns containing at least one missing value;
        ``"const_cols"``: columns holding exactly one distinct value (a missing
        value counts as a value here);
        ``"unique_cols"``: non-float columns whose number of distinct non-missing
        values equals the number of rows.
    """
    row_count = len(df)
    has_missing = df.isnull().any()

    const_cols = []
    unique_cols = []
    for name in df.columns:
        series = df[name]
        if len(series.unique()) == 1:
            const_cols.append(name)
        # Float columns are excluded: continuous values are unique almost by accident.
        if series.nunique() == row_count and not pd.api.types.is_float_dtype(series):
            unique_cols.append(name)

    return {
        "null_cols": df.columns[has_missing].tolist(),
        "const_cols": const_cols,
        "unique_cols": unique_cols,
    }

# Run the column checks on the preprocessed training frame and print the resulting dict.
print(detect_columns(df))

# Correlations

In [None]:
# Pairwise correlations between the columns of the preprocessed training frame
correlation_matrix = df.corr()

# Draw the matrix as an annotated heatmap on an explicitly created axes
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap="coolwarm",
    fmt=".2f",
    linewidths=0.5,
    ax=ax,
)
ax.set_title("Correlation Matrix of Titanic Dataset")
plt.show()

### Exercise 2

Write code that, instead of displaying the correlation matrix, stores the results in a dataframe, possibly sorted by the absolute values of the correlations. It would be best to return a dataframe with the correlation results.

In [None]:
def sorted_correlation_matrix(df: pd.DataFrame) -> pd.DataFrame:
    """List pairwise feature correlations, strongest first.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose columns are correlated with ``df.corr()``.

    Returns
    -------
    pd.DataFrame
        One row per unordered pair of distinct features, with the columns
        ``"Feature_1"``, ``"Feature_2"``, ``"Correlation"`` and
        ``"Abs_Correlation"``, sorted by ``"Abs_Correlation"`` in descending
        order. Self-correlations and mirrored duplicates are left out, so the
        top rows show genuine relationships instead of the trivial 1.0
        diagonal.
    """
    correlation_matrix = df.corr()

    # Positions strictly above the diagonal: every pair exactly once, no (x, x) entries.
    row_pos, col_pos = np.triu_indices(len(correlation_matrix.columns), k=1)

    correlation_df = pd.DataFrame(
        {
            "Feature_1": correlation_matrix.index[row_pos],
            "Feature_2": correlation_matrix.columns[col_pos],
            "Correlation": correlation_matrix.to_numpy()[row_pos, col_pos],
        }
    )
    correlation_df["Abs_Correlation"] = correlation_df["Correlation"].abs()

    return correlation_df.sort_values(by="Abs_Correlation", ascending=False)

# Build the sorted correlation table for the preprocessed training frame and preview its top rows.
sorted_correlation_df = sorted_correlation_matrix(df)
sorted_correlation_df.head()

# Visualize

In [None]:
# Put features and target side by side; pd.concat with axis=1 aligns rows on the index.
# NOTE(review): this assumes the preprocessing pipeline preserved the index of
# X_train — otherwise unmatched rows would be filled with NaN. Confirm.
df_combined = pd.concat([df, y_train], axis=1)
df_combined.head()

In [None]:
# Create a pairplot with hue set to the target variable
# (every feature is plotted against every other one, so the grid grows
# quadratically with the number of columns).
sns.pairplot(df_combined, hue="alive")
plt.show()

### Exercise 3

The plot is hard to read. Write code to split it into two pieces. Make sure that both plots contain the target column.

In [None]:
# Split the feature columns into a first and a second half
midpoint = len(df.columns) // 2
features_group_1 = list(df.columns[:midpoint])
features_group_2 = list(df.columns[midpoint:])

# One pairplot per half; the target stays available in both through `hue`
for feature_group in (features_group_1, features_group_2):
    sns.pairplot(df_combined, vars=feature_group, hue="alive")
    plt.show()

## Exercise 4

Write code to plot the histograms separately. It is a good idea to keep them separated by colour, though.

In [None]:
# One figure per feature, with the target classes overlaid in different colours
for column in df.columns:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.histplot(
        data=df_combined,
        x=column,
        kde=True,
        hue="alive",
        multiple="layer",
        ax=ax,
    )
    ax.set_title(f"Histogram of {column}")
    ax.set_xlabel(column)
    ax.set_ylabel("Frequency")
    plt.show()

# Feature selection

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

# Initialize Random Forest classifier

# random_state fixes the forest's randomness so the selection is repeatable.
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Perform feature selection using feature importance
# No explicit threshold is passed, so SelectFromModel applies its default cut-off
# (see the scikit-learn documentation for the exact rule).
selector = SelectFromModel(clf)
selector.fit(df, y_train)

# Get selected feature indices
selected_features_indices = selector.get_support(indices=True)

# Get selected feature names
# NOTE: `selected_features` and `df_selected` are assigned again by the
# Lasso-based selection cell further down.
selected_features = df.columns[selected_features_indices]

# Create DataFrame with selected features
df_selected = df[selected_features]

# Display selected features
print("Selected Features:")
print(df_selected.head())

In [None]:
from sklearn.linear_model import LogisticRegression

# Initialize Logistic Regression model with Lasso regularization
# (the liblinear solver is chosen together with the l1 penalty).
logistic_lasso = LogisticRegression(penalty="l1", solver="liblinear", random_state=42)

# Fit the model to the training data
logistic_lasso.fit(df, y_train)

# Get coefficients of the model
coefficients = logistic_lasso.coef_

# Get indices of non-zero coefficients (i.e., selected features)
# flatten() turns the coefficient array into a 1-D boolean mask over the columns.
selected_feature_indices = (coefficients != 0).flatten()

# Get selected feature names
# NOTE: this replaces the `selected_features` produced by the random-forest cell above.
selected_features = df.columns[selected_feature_indices]

# Create DataFrame with selected features
df_selected = df[selected_features]

# Display selected features
print("Selected Features:")
print(df_selected.head())

### Exercise 5 

Write a couple of statistical tests to explore the properties of the dataset.

In [None]:
from scipy.stats import ttest_ind, chi2_contingency

# T-test for comparing the means of fare between alive and not alive
# ("fare" is taken from the preprocessed frame, where it went through
# CustomStandardScaler — presumably a linear rescaling; confirm.)
alive_yes = df_combined[df_combined['alive'] == 'yes']['fare']
alive_no = df_combined[df_combined['alive'] == 'no']['fare']
# nan_policy='omit' makes the test ignore missing fares instead of returning NaN.
t_stat, p_value = ttest_ind(alive_yes, alive_no, nan_policy='omit')
# 0.05 is the significance level used for both tests in this cell.
if p_value < 0.05:
    print("There is a significant difference in the mean of fare between alive and not alive passangers.")
else:
    print("There is no significant difference in the mean fare between alive and not alive passangers.")

# Chi-square test for independence between pclass and alive
# The contingency table counts passengers per (pclass, alive) combination.
contingency_table = pd.crosstab(df_combined['pclass'], df_combined['alive'])
# Only the statistic and the p-value are kept; the other two return values are discarded.
chi2_stat, chi2_p_value, _, _ = chi2_contingency(contingency_table)
if chi2_p_value < 0.05:
    print("There is a significant association between pclass and alive")
else:
    print("There is no significant association between pclass and alive")

### Exercise 6 

Plot each feature as a function of the index. This lets you observe whether the variance of a feature is uniformly distributed (i.e. the data isn't ordered by anything).

In [None]:
# One scatter plot per feature: x is the row index label, y is the feature value.
# A visible trend or block structure along x would suggest the rows are ordered.
for column in df.columns:
    # alpha=0.3 keeps overlapping points distinguishable.
    plt.scatter(df.index, df[column], alpha=0.3)
    plt.title(f"{column} vs Index")
    plt.xlabel("Index")
    plt.ylabel(column)
    plt.show()

# Pass-Fail Exercise 

Complete the exercises presented in this notebook. Then copy this notebook to your student directory and create a merge request with it. Please do not commit this file.