## Imports

In [34]:
import pandas as pd
import os

# Base directory for all data paths: the parent of the current working directory
# (os.path.abspath('') resolves to the directory the kernel is running in).
_cwd = os.path.abspath('')
gen_dirname = os.path.dirname(_cwd)

## EDA

Only the labelled train file will be used to train our model. The unlabelled test file can be used for an inference test to demonstrate the model's capabilities.

In [35]:
# Load the raw CSV files from <gen_dirname>/data.
# Path components are passed separately so that os.path.join inserts the
# platform's own separator; a hard-coded backslash is only a separator on Windows.
labelled_train = pd.read_csv(os.path.join(gen_dirname, "data", "labelled_train.csv"))
unlabelled_test = pd.read_csv(os.path.join(gen_dirname, "data", "unlabelled_test.csv"))

In [36]:
# Preview the first five rows of the training frame.
labelled_train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [37]:
# Print column dtypes and non-null counts, training frame first, then test frame.
for frame in (labelled_train, unlabelled_test):
    frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass  

### Cleaning data

Some columns need to be dropped because they have no relation to the other variables, contain many NaNs that are not easy to process, or cannot easily be converted into numbers/categories.

In [38]:
# Columns removed in the "rough" variant (Age is dropped rather than imputed).
features_to_drop_rough = [
    "PassengerId",
    "Name",
    "Ticket",
    "Cabin",
    "Age",
    "Embarked",
]
# The "gentle" variant drops the same columns except Age, which is kept.
features_to_drop_gentle = [col for col in features_to_drop_rough if col != "Age"]

#### Rough data processing

In [39]:
# Build the "rough" frames: new DataFrames without the rough drop-list columns.
train_rough = labelled_train.drop(columns=features_to_drop_rough)
test_rough = unlabelled_test.drop(columns=features_to_drop_rough)

In [40]:
# Show the rows of the rough test frame whose original Fare value is missing.
missing_fare_mask = unlabelled_test["Fare"].isna()
test_rough[missing_fare_mask]

Unnamed: 0,Pclass,Sex,SibSp,Parch,Fare
152,3,male,0,0,


Arbitrary choice: the missing Fare value in the test set is filled with the mean Fare of the test set, so that the Fare column can be used for the test set as well.

In [41]:
# Replace missing Fare values in the rough test frame with the mean Fare
# computed over the original (unlabelled) test data.
mean_test_fare = unlabelled_test["Fare"].mean()
test_rough["Fare"] = test_rough["Fare"].fillna(mean_test_fare)

#### Gentle data processing

In [42]:
# Build the "gentle" frames: same as the rough ones, but the Age column is kept.
train_gentle = labelled_train.drop(columns=features_to_drop_gentle)
test_gentle = unlabelled_test.drop(columns=features_to_drop_gentle)

In [43]:
def _fill_age_with_group_mean(frame):
    """Return frame's Age column with NaNs replaced by the (Pclass, Sex) group mean.

    The group means are computed from the frame that is passed in.
    """
    group_mean_age = frame.groupby(["Pclass", "Sex"])['Age'].transform('mean')
    return frame['Age'].fillna(group_mean_age)


# Each frame is imputed with its own per-group mean ages.
train_gentle['Age'] = _fill_age_with_group_mean(train_gentle)
test_gentle['Age'] = _fill_age_with_group_mean(test_gentle)
    

In [44]:
# Same Fare imputation as for the rough frame: missing values get the mean Fare
# of the original (unlabelled) test data.
test_gentle["Fare"] = test_gentle["Fare"].fillna(value=unlabelled_test["Fare"].mean())

### Binarizing sex

In [45]:
def binarize_sex(x):
    """Encode a Sex value as an integer flag.

    Returns 0 when ``x`` equals the string "male" and 1 for every other value
    (the comparison is exact, so e.g. "Male" also yields 1).
    """
    return 0 if x == "male" else 1

In [46]:
# Encode the Sex column as 0/1 in all four frames.
# The numeric-dtype guard keeps this cell safe to re-run: applying binarize_sex
# to an already encoded column would turn every value into 1, because neither
# 0 nor 1 equals "male".
for frame in (train_gentle, test_gentle, train_rough, test_rough):
    if not pd.api.types.is_numeric_dtype(frame["Sex"]):
        frame["Sex"] = frame["Sex"].apply(binarize_sex)

### Saving data

In [47]:
# Write the "rough" frames to <gen_dirname>/data/rough.
# Path components are passed separately so the folder is nested correctly on
# every platform; with a literal backslash, non-Windows systems would create a
# single directory named "data\rough".
save_folder = os.path.join(gen_dirname, "data", "rough")
os.makedirs(save_folder, exist_ok=True)
# index=False: the row index is not written to the CSV files.
train_rough.to_csv(os.path.join(save_folder, "labelled.csv"), index=False)
test_rough.to_csv(os.path.join(save_folder, "unlabelled.csv"), index=False)

In [48]:
# Write the "gentle" frames to <gen_dirname>/data/gentle.
# Path components are passed separately so the folder is nested correctly on
# every platform; with a literal backslash, non-Windows systems would create a
# single directory named "data\gentle".
save_folder = os.path.join(gen_dirname, "data", "gentle")
os.makedirs(save_folder, exist_ok=True)
# index=False: the row index is not written to the CSV files.
train_gentle.to_csv(os.path.join(save_folder, "labelled.csv"), index=False)
test_gentle.to_csv(os.path.join(save_folder, "unlabelled.csv"), index=False)