In [1]:
# Built-in imports
import warnings
from typing import List, Tuple, Dict

warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
# Load the training split; later cells work on (and mutate) this frame in place.
df = pd.read_csv('../data/train.csv')
# (n_rows, n_columns) tuple of the raw frame.
# NOTE(review): `num` is not referenced in any visible cell -- confirm it is still needed.
num = df.shape

In [3]:
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [4]:
# Preprocessing: PassengerId looks like "<group>_<number>" (e.g. "0003_02").
# Split it once; the first part becomes an integer GroupId column and the
# second part replaces PassengerId.
id_parts = df['PassengerId'].str.split('_', expand=True)
df['GroupId'] = id_parts[0].astype('int64')
df['PassengerId'] = id_parts[1]

In [5]:
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,GroupId
0,1,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,1
1,1,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,2
2,1,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,3
3,2,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,3
4,1,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,4


In [6]:
df.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,GroupId
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0,8693.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791,4633.389624
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189,2671.028856
min,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,19.0,0.0,0.0,0.0,0.0,0.0,2319.0
50%,27.0,0.0,0.0,0.0,0.0,0.0,4630.0
75%,38.0,47.0,76.0,27.0,59.0,46.0,6883.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0,9280.0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
 14  GroupId       8693 non-null   int64  
dtypes: bool(1), float64(6), int64(1), object(7)
memory usage: 959.4+ KB


In [8]:
# Breaking Cabin ("deck/num/side", e.g. "F/1/S") into Deck, Num and Side.
# One vectorised split replaces the row-by-row loop. `reindex` pins exactly
# three part-columns, so a cabin string with fewer parts yields missing values
# and extra parts are ignored, instead of raising mid-loop.
cabin_parts = df['Cabin'].str.split('/', expand=True).reindex(columns=range(3))

df['Deck'] = cabin_parts[0]
df['Num'] = cabin_parts[1]
df['Side'] = cabin_parts[2]


# Typecasting num values to numbers wherever possible, else keeping nan
# (the column is float64 whenever at least one cabin is missing).
df['Num'] = pd.to_numeric(df['Num'])

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 18 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
 14  GroupId       8693 non-null   int64  
 15  Deck          8494 non-null   object 
 16  Num           8494 non-null   float64
 17  Side          8494 non-null   object 
dtypes: bool(1), float64(7), int6

In [10]:
def get_last_name(name: str) -> str:
    """Return the last whitespace-separated token of ``name``.

    Parameters
    ----------
    name : str
        Full passenger name; may be missing (``NaN`` / ``None``).

    Returns
    -------
    str
        The final token of the name, or ``''`` when the name is missing or
        contains only whitespace.
    """
    if pd.isna(name):
        return ''
    # split() without a separator collapses repeated whitespace, and taking the
    # last token avoids an IndexError on single-word names while also picking
    # the real surname when a name has more than two parts.
    tokens = name.split()
    return tokens[-1] if tokens else ''

# Derive a surname column from Name (missing names become '').
df['LastName'] = df['Name'].map(get_last_name)

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 19 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
 14  GroupId       8693 non-null   int64  
 15  Deck          8494 non-null   object 
 16  Num           8494 non-null   float64
 17  Side          8494 non-null   object 
 18  LastName      8693 non-null 

In [13]:
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,GroupId,Deck,Num,Side,LastName
0,1,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,1,B,0.0,P,Ofracculy
1,1,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,2,F,0.0,S,Vines
2,1,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,3,A,0.0,S,Susent
3,2,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,3,A,0.0,S,Susent
4,1,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,4,F,1.0,S,Santantines


In [14]:
import pandas as pd
from typing import List, Dict, Any
from sklearn.preprocessing import LabelEncoder


def get_encoder_dictionary(df: pd.DataFrame, encode_cols: List[str], **kwargs) -> Dict[str, LabelEncoder]:
    """Fit one ``LabelEncoder`` per requested column.

    Each encoder is fitted on the non-null values of its column only, so
    missing entries never become a class of their own.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the columns to encode.
    encode_cols : List[str]
        Names of the columns to build encoders for.
    **kwargs
        Forwarded unchanged to the ``LabelEncoder`` constructor.

    Returns
    -------
    Dict[str, LabelEncoder]
        Mapping of column name to its fitted encoder.
    """
    def _fitted_encoder(column: str) -> LabelEncoder:
        # Keep only the rows where this column actually has a value
        present_values = df.loc[df[column].notna(), column].tolist()
        encoder = LabelEncoder(**kwargs)
        encoder.fit(present_values)
        return encoder

    return {column: _fitted_encoder(column) for column in encode_cols}

In [15]:
# Columns that get a fitted LabelEncoder (see get_encoder_dictionary)
encoder_cols = [
    'HomePlanet', 'CryoSleep', 'Destination', 'VIP',
    'Transported', 'Deck', 'Side', 'LastName',
]

encoders = get_encoder_dictionary(df=df, encode_cols=encoder_cols)

#### Imputing missing categorical values with a classifier

In [16]:
def impute_with_classification(
    df: pd.DataFrame,
    target_attribute: str,
    exclude_cols: List[str],
    encoders: Dict[str, LabelEncoder],
    test: bool = True,
    random_state: int = 42,
) -> pd.DataFrame:
    """Fill missing values of a categorical column using a decision tree.

    A ``DecisionTreeClassifier`` is trained on the rows that have no missing
    value in any used column, then predicts the target for rows where the
    target is missing but every feature is present. Rows that still lack a
    value afterwards are filled with the column mode.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to impute. It is modified IN PLACE and also returned.
    target_attribute : str
        Column whose missing values are filled.
    exclude_cols : List[str]
        Columns that are neither features nor target.
    encoders : Dict[str, LabelEncoder]
        Fitted encoders keyed by column name; any used column with an entry
        is transformed before modelling. If the target has an entry, the
        predictions are mapped back through ``inverse_transform``.
    test : bool, default True
        When True, additionally fit on an 85/15 split and show a confusion
        matrix plus classification report for the held-out part.
    random_state : int, default 42
        Seed for the tree so repeated runs give the same imputations.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object that was passed in.
    """
    missing_mask = df[target_attribute].isna()

    # Exit condition
    if not missing_mask.any():
        return df

    # Columns that're to be used for this model
    include_cols = [c for c in df.columns if c not in exclude_cols]
    print(include_cols)
    feature_cols = [c for c in include_cols if c != target_attribute]

    # Training rows: an independent copy restricted to fully observed rows
    train = df[include_cols].dropna().copy()
    for column in train.columns:
        if column in encoders:
            train[column] = encoders[column].transform(train[column])

    # Without complete rows or features there is nothing to learn from, so the
    # model step is skipped and only the mode fallback below applies.
    if not train.empty and feature_cols:
        X, y = train[feature_cols], train[target_attribute]
        model = DecisionTreeClassifier(random_state=random_state)

        if test:
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
            model.fit(X_train, y_train)
            # Scores n stuff
            preds = model.predict(X_test)
            plt.figure(figsize=(7, 5))
            sns.heatmap(confusion_matrix(y_test, preds), annot=True, fmt='.0f')
            plt.show()
            print('\t\tCLASSIFICATION REPORT OF DTREE!\n', '\t\t', '~'*40)
            print(classification_report(y_test, preds))

        # Final training
        model.fit(X, y)

        # Only rows with a missing target AND every feature present can be predicted
        predictable = missing_mask & df[feature_cols].notna().all(axis=1)
        print('Had to drop {} rows'.format(int(missing_mask.sum()) - int(predictable.sum())))

        if predictable.any():
            # Explicit copy: encoding must never write through to `df`
            X_new = df.loc[predictable, feature_cols].copy()
            # Encoding values for prediction
            for column in feature_cols:
                if column in encoders:
                    X_new[column] = encoders[column].transform(X_new[column])
            predictions = model.predict(X_new)
            if target_attribute in encoders:
                labels = encoders[target_attribute].inverse_transform(predictions)
            else:
                labels = predictions
            # One masked assignment instead of a per-row scan of the index
            df.loc[predictable, target_attribute] = labels

    # For the rest of the missing attributes we're imputing with mode
    still_missing = df[target_attribute].isna()
    if still_missing.any():
        modes = df[target_attribute].mode()
        if not modes.empty:
            df.loc[still_missing, target_attribute] = modes.iloc[0]
    return df

In [17]:
# Columns left out of the imputation model
exclude_cols = ['PassengerId', 'Cabin', 'Name', 'Transported']
target_attr = 'HomePlanet'

# `df` is filled in place; `t` refers to the same frame
t = impute_with_classification(
    df=df,
    target_attribute=target_attr,
    exclude_cols=exclude_cols,
    encoders=encoders,
    test=False,
)

['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'GroupId', 'Deck', 'Num', 'Side', 'LastName']
Had to drop 33 rows


In [28]:
t.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 19 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8693 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
 14  GroupId       8693 non-null   int64  
 15  Deck          8494 non-null   object 
 16  Num           8494 non-null   float64
 17  Side          8494 non-null   object 
 18  LastName      8693 non-null 

#### Imputing missing numeric values with regression

In [34]:
from sklearn.linear_model import LinearRegression

def impute_with_regression(
    df: pd.DataFrame,
    target_attribute: str,
    exclude_cols: List[str],
    encoders: Dict[str, LabelEncoder],
    test: bool = True
) -> pd.DataFrame:
    """Fill missing values of a numeric column using linear regression.

    A ``LinearRegression`` is trained on the rows that have no missing value
    in any used column, then predicts the target for rows where the target is
    missing but every feature is present. Rows that still lack a value
    afterwards are filled with the column median.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to impute. It is modified IN PLACE and also returned.
    target_attribute : str
        Numeric column whose missing values are filled.
    exclude_cols : List[str]
        Columns that are neither features nor target.
    encoders : Dict[str, LabelEncoder]
        Fitted encoders keyed by column name; any used column with an entry
        is transformed before modelling.
    test : bool, default True
        When True, additionally fit on an 85/15 split and print the R^2
        score on the held-out part.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object that was passed in.
    """
    missing_mask = df[target_attribute].isna()

    # Exit condition
    if not missing_mask.any():
        return df

    # Columns that're to be used for this model
    include_cols = [c for c in df.columns if c not in exclude_cols]
    print(include_cols)
    feature_cols = [c for c in include_cols if c != target_attribute]

    # Training rows: an independent copy restricted to fully observed rows
    train = df[include_cols].dropna().copy()
    for column in train.columns:
        if column in encoders:
            train[column] = encoders[column].transform(train[column])

    # Without complete rows or features there is nothing to learn from, so the
    # model step is skipped and only the median fallback below applies.
    if not train.empty and feature_cols:
        X, y = train[feature_cols], train[target_attribute]
        model = LinearRegression()

        if test:
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
            model.fit(X_train, y_train)
            # score() takes (features, true targets) and returns R^2
            print('R^2 on held-out split: {:.4f}'.format(model.score(X_test, y_test)))

        # Final training
        model.fit(X, y)

        # Only rows with a missing target AND every feature present can be predicted
        predictable = missing_mask & df[feature_cols].notna().all(axis=1)
        print('Had to drop {} rows'.format(int(missing_mask.sum()) - int(predictable.sum())))

        if predictable.any():
            # Explicit copy: encoding must never write through to `df`
            X_new = df.loc[predictable, feature_cols].copy()
            # Encoding values for prediction
            for column in feature_cols:
                if column in encoders:
                    X_new[column] = encoders[column].transform(X_new[column])
            # NOTE(review): a linear model is unbounded, so predictions may fall
            # outside the observed range of the target (e.g. below zero) --
            # confirm whether clipping is wanted.
            df.loc[predictable, target_attribute] = model.predict(X_new)

    # For the rest of the missing attributes we're imputing with the median
    still_missing = df[target_attribute].isna()
    if still_missing.any():
        df.loc[still_missing, target_attribute] = df[target_attribute].median()
    return df

In [35]:
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,GroupId,Deck,Num,Side,LastName
0,1,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,1,B,0.0,P,Ofracculy
1,1,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,2,F,0.0,S,Vines
2,1,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,3,A,0.0,S,Susent
3,2,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,3,A,0.0,S,Susent
4,1,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,4,F,1.0,S,Santantines


In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 19 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8693 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8668 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
 14  GroupId       8693 non-null   int64  
 15  Deck          8494 non-null   object 
 16  Num           8494 non-null   float64
 17  Side          8494 non-null   object 
 18  LastName      8693 non-null 

In [39]:
# Regression imputation for RoomService (fills `df` in place)
exclude_cols = ['PassengerId', 'Cabin', 'Name', 'Transported']
target_attr = 'RoomService'
t = impute_with_regression(
    df=df,
    target_attribute=target_attr,
    exclude_cols=exclude_cols,
    encoders=encoders,
    test=False,
)
t.info()

In [42]:
# Regression imputation for FoodCourt (fills `df` in place)
exclude_cols = ['PassengerId', 'Cabin', 'Name', 'Transported']
target_attr = 'FoodCourt'
t = impute_with_regression(
    df=df,
    target_attribute=target_attr,
    exclude_cols=exclude_cols,
    encoders=encoders,
    test=False,
)
t.info()

In [45]:
# Regression imputation for ShoppingMall (fills `df` in place)
exclude_cols = ['PassengerId', 'Cabin', 'Name', 'Transported']
target_attr = 'ShoppingMall'
t = impute_with_regression(
    df=df,
    target_attribute=target_attr,
    exclude_cols=exclude_cols,
    encoders=encoders,
    test=False,
)
t.info()

['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'GroupId', 'Deck', 'Num', 'Side', 'LastName']
Had to drop 29 rows
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 19 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8693 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8668 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8693 non-null   float64
 8   FoodCourt     8693 non-null   float64
 9   ShoppingMall  8693 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
 14  GroupId       86

In [46]:
# Regression imputation for Spa (fills `df` in place)
exclude_cols = ['PassengerId', 'Cabin', 'Name', 'Transported']
target_attr = 'Spa'
t = impute_with_regression(
    df=df,
    target_attribute=target_attr,
    exclude_cols=exclude_cols,
    encoders=encoders,
    test=False,
)
t.info()

['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'GroupId', 'Deck', 'Num', 'Side', 'LastName']
Had to drop 14 rows
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 19 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8693 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8668 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8693 non-null   float64
 8   FoodCourt     8693 non-null   float64
 9   ShoppingMall  8693 non-null   float64
 10  Spa           8693 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
 14  GroupId       86

In [47]:

# Regression imputation for VRDeck (fills `df` in place)
exclude_cols = ['PassengerId', 'Cabin', 'Name', 'Transported']
target_attr = 'VRDeck'
t = impute_with_regression(
    df=df,
    target_attribute=target_attr,
    exclude_cols=exclude_cols,
    encoders=encoders,
    test=False,
)
t.info()

['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'GroupId', 'Deck', 'Num', 'Side', 'LastName']
Had to drop 17 rows
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 19 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8693 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8668 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8693 non-null   float64
 8   FoodCourt     8693 non-null   float64
 9   ShoppingMall  8693 non-null   float64
 10  Spa           8693 non-null   float64
 11  VRDeck        8693 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
 14  GroupId       86