In [8]:
# Built-in imports
import warnings
from typing import List, Tuple, Dict

warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [4]:
# Load train.csv; the path is relative to the notebook's working directory
df = pd.read_csv('../data/train.csv')
# NOTE: despite its name, `num` holds the whole `(n_rows, n_columns)` shape tuple
num = df.shape

In [5]:
# Preprocessing
# PassengerId is split on '_': the part before it becomes the integer GroupId
# and the part after it replaces PassengerId. The guard keeps the cell
# idempotent: once PassengerId has been shortened there is no '_' left, and
# splitting it a second time would fail.
if 'GroupId' not in df.columns:
    id_parts = df['PassengerId'].str.split('_', expand=True)
    df['GroupId'] = id_parts[0].astype(int)
    df['PassengerId'] = id_parts[1]


# Breaking Cabin into deck, num and side
# Vectorised split; rows whose Cabin is missing stay missing in all three parts.
cabin_parts = df['Cabin'].str.split('/', expand=True)
df['Deck'] = cabin_parts[0]
df['Side'] = cabin_parts[2]

# Typecasting num values to numbers wherever possible, else keeping nan
# (the column is float when at least one Cabin is missing, integer otherwise)
df['Num'] = pd.to_numeric(cabin_parts[1])

def get_last_name(name: str) -> str:
    """Return the last whitespace-separated token of `name`.

    Parameters
    ----------
    name : the full name; may also be a missing value (NaN / None).

    Returns
    -------
    The final token of the name, or '' when the name is missing, empty, or
    consists of a single token.
    """
    if pd.isna(name):
        return ''
    # split() without an argument collapses repeated / surrounding whitespace
    tokens = name.split()
    # A lone token cannot be told apart from a first name, so it is treated
    # like a missing last name instead of raising IndexError
    return tokens[-1] if len(tokens) > 1 else ''

# Derive the LastName column from Name (missing names give an empty string)
df['LastName'] = df['Name'].apply(get_last_name)

In [6]:
# Show the first three rows to check the derived columns
df.head(3)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,GroupId,Deck,Num,Side,LastName
0,1,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,1,B,0.0,P,Ofracculy
1,1,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,2,F,0.0,S,Vines
2,1,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,3,A,0.0,S,Susent


In [9]:
import pandas as pd
from typing import List, Dict, Any
from sklearn.preprocessing import LabelEncoder


def get_encoder_dictionary(df: pd.DataFrame, encode_cols: List[str], **kwargs) -> Dict[str, LabelEncoder]:
    """Fit one `LabelEncoder` per requested column.

    Parameters
    ----------
    df : frame that holds the columns to encode.
    encode_cols : names of the columns an encoder is fitted for.
    **kwargs : forwarded unchanged to the `LabelEncoder` constructor.

    Returns
    -------
    Dictionary mapping each column name to its fitted encoder. Missing values
    are left out of the fit, so they are not part of any encoder's classes.
    """
    fitted: Dict[str, LabelEncoder] = {}

    for col_name in encode_cols:
        # Only the observed (non-null) entries of the column are used
        observed = df.loc[df[col_name].notna(), col_name].tolist()
        # fit() returns the encoder itself
        fitted[col_name] = LabelEncoder(**kwargs).fit(observed)

    return fitted

In [10]:
# Columns that get a label encoder (one encoder is fitted per column)
encoder_cols = [
    'HomePlanet',
    'CryoSleep',
    'Destination',
    'VIP',
    'Transported',
    'Deck',
    'Side',
    'LastName',
]

encoders = get_encoder_dictionary(df=df, encode_cols=encoder_cols)

In [48]:
def _encode_known_columns(frame: pd.DataFrame, encoders: Dict[str, LabelEncoder]) -> pd.DataFrame:
    """Return a copy of `frame` with every column that has an encoder label-encoded."""
    encoded = frame.copy()
    for column in encoded.columns:
        if column in encoders:
            encoded[column] = encoders[column].transform(encoded[column])
    return encoded


def impute_with_classification(
    df: pd.DataFrame,
    target_attribute: str,
    exclude_cols: List[str],
    encoders: Dict[str, LabelEncoder],
    test: bool = True,
    random_state: int = 42
) -> pd.DataFrame:
    """Fill the missing values of `target_attribute` with a decision tree.

    The tree is trained on the rows that have no missing value in any used
    column and predicts the target for the rows where the target is missing
    but every feature is present. Rows that are still missing afterwards
    receive the most frequent value of the column.

    NOTE: `df` is modified in place and that same object is returned.

    Parameters
    ----------
    df : frame to impute.
    target_attribute : column whose missing values are filled.
    exclude_cols : columns that are not used by the model at all.
    encoders : fitted label encoders keyed by column name; columns without an
        entry are passed to the model as they are.
    test : when True, additionally fit on an 85/15 split and show a confusion
        matrix and classification report for the held-out part.
    random_state : seed of the decision tree, fixed by default so repeated
        runs impute the same values.

    Returns
    -------
    `df` itself, with `target_attribute` imputed.
    """
    missing_mask = df[target_attribute].isna()

    # Exit condition
    if not missing_mask.any():
        return df

    model = DecisionTreeClassifier(random_state=random_state)
    # Columns that're to be used for this model
    include_cols = [c for c in df.columns if c not in exclude_cols]
    print(include_cols)
    feature_cols = [c for c in include_cols if c != target_attribute]

    # Training data: complete rows only, encoded on an independent copy
    train = _encode_known_columns(df[include_cols].dropna(), encoders)
    X, y = train[feature_cols], train[target_attribute]

    if test:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
        model.fit(X_train, y_train)
        # Scores n stuff
        preds = model.predict(X_test)
        plt.figure(figsize=(7, 5))
        sns.heatmap(confusion_matrix(y_test, preds), annot=True, fmt='.0f')
        plt.show()
        print('\t\tCLASSIFICATION REPORT OF DTREE!\n', '\t\t', '~'*40)
        print(classification_report(y_test, preds))

    # Final training
    model.fit(X, y)

    # Rows that lack the target but have every feature available
    predictable = missing_mask & df[feature_cols].notna().all(axis=1)
    print('Had to drop {} rows'.format(missing_mask.sum() - predictable.sum()))

    # predict() rejects an empty input, so only call it when rows are left
    if predictable.any():
        X_new = _encode_known_columns(df.loc[predictable, feature_cols], encoders)
        predictions = model.predict(X_new)
        # Map the encoded classes back to labels when the target was encoded
        if target_attribute in encoders:
            predictions = encoders[target_attribute].inverse_transform(predictions)
        # The boolean mask writes positionally, in the same row order as X_new
        df.loc[predictable, target_attribute] = predictions

    # For the rest of the missing attributes we're imputing with mode
    still_missing = df[target_attribute].isna()
    if still_missing.any():
        df.loc[still_missing, target_attribute] = df[target_attribute].mode().iloc[0]
    return df

In [49]:
# Columns the imputation model must not see
exclude_cols = [
    'PassengerId',
    'Cabin',
    'Name',
    'Transported',
]
target_attr = 'HomePlanet'

# The function returns the frame it was given, so `t` and `df` are the same object
t = impute_with_classification(
    df=df,
    target_attribute=target_attr,
    exclude_cols=exclude_cols,
    encoders=encoders,
    test=False,
)