# EDA Dataset Spaceship Titanic

## Descripción del Dataset

In [None]:
PassengerId is a unique identifier for each passenger. Each identifier has the form gggg_pp, where gggg indicates the group with which the passenger is traveling, and pp is their number in the group. People in a group are often family members, but not always.

HomePlanet - the planet from which the passenger departed, usually the planet of his permanent residence.

CryoSleep - indicates whether the passenger has decided to go into suspended animation for the duration of the flight. Passengers in cryosleep are locked in their cabins.

Cabin - the number of the cabin in which the passenger is located. Takes the form deck/num/side, where side can be either P for port or S for starboard.

Destination - the planet on which the passenger will disembark.

Age - the age of the passenger.

VIP - whether the passenger has paid for a special VIP service during the flight.

RoomService, FoodCourt, ShoppingMall, Spa, VRDeck - the amount that the passenger paid at each of the many luxury amenities of the Spaceship Titanic (room service, food court, shopping mall, spa and VR deck).

Name - the passenger's first and last name.

Transported - whether the passenger was transported to another dimension. This is the target, the column that you are trying to predict.


In [2]:
# Library imports

import pandas as pd
import numpy as np
import matplotlib 
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Plot styling shared by the whole notebook

# rcParams overrides applied on top of the selected style sheet
params = {
    'figure.figsize': (12, 6),
    'font.size': 12,
    'axes.titlesize': 20,
    'axes.linewidth': 2,
}

# Box drawn around each bar label (used as the bbox of the bar labels)
bbox = dict(
    boxstyle='round',
    pad=0.5,
    facecolor='white',
    edgecolor='black',
    linewidth=1,
    alpha=1,
)

# Two-colour palette used by the hue-based count plot
color = ['#E36149', '#49AF72']

# The style sheet is selected first so that the overrides in params win
plt.style.use('fivethirtyeight')
matplotlib.rcParams.update(params)

In [5]:
# Countplot based on a single column

def countplot_one_column(name_column, title, data=None):
    """Draw a count plot of one column with the count printed on every bar.

    Parameters
    ----------
    name_column : str
        Name of the column placed on the x axis.
    title : str
        Title of the plot (rendered in bold).
    data : pandas.DataFrame, optional
        Frame to plot. When omitted, the notebook-level ``df`` is used,
        which is what the function always did before this parameter existed.
    """
    if data is None:
        # Fall back to the global frame so existing calls keep working.
        data = df

    ax = sns.countplot(data=data,
                       x=name_column,
                       edgecolor='black',
                       linewidth=1.5)

    # Label every bar container instead of assuming exactly one exists.
    for container in ax.containers:
        ax.bar_label(container, bbox=bbox)

    ax.set_title(title, fontweight='bold')
    plt.show()

In [6]:
# Countplot based on multiple columns

def countplot_hue_column(name_column, hue_column, title, data=None):
    """Draw a count plot of one column split by a hue column, with bar labels.

    Parameters
    ----------
    name_column : str
        Name of the column placed on the x axis.
    hue_column : str
        Name of the column used to split each x category into coloured bars.
    title : str
        Title of the plot (rendered in bold).
    data : pandas.DataFrame, optional
        Frame to plot. When omitted, the notebook-level ``df`` is used,
        which is what the function always did before this parameter existed.

    Notes
    -----
    Bars are coloured with the notebook-level ``color`` palette.
    """
    if data is None:
        # Fall back to the global frame so existing calls keep working.
        data = df

    ax = sns.countplot(data=data,
                       x=name_column,
                       hue=hue_column,
                       linewidth=1,
                       edgecolor='black',
                       palette=color)

    # One container is created per hue level. Looping over all of them avoids
    # the IndexError the hard-coded [0]/[1] access raised for a single hue
    # level, and labels the bars of a third (or later) level as well.
    for container in ax.containers:
        ax.bar_label(container, bbox=bbox)

    ax.set_title(title, fontweight='bold')
    plt.show()


In [16]:
# Load the dataset from train.csv (path is relative to the working directory)
# and preview its first 10 rows

df = pd.read_csv('train.csv')
df.head(10)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
5,0005_01,Earth,False,F/0/P,PSO J318.5-22,44.0,False,0.0,483.0,0.0,291.0,0.0,Sandie Hinetthews,True
6,0006_01,Earth,False,F/2/S,TRAPPIST-1e,26.0,False,42.0,1539.0,3.0,0.0,0.0,Billex Jacostaffey,True
7,0006_02,Earth,True,G/0/S,TRAPPIST-1e,28.0,False,0.0,0.0,0.0,0.0,,Candra Jacostaffey,True
8,0007_01,Earth,False,F/3/S,TRAPPIST-1e,35.0,False,0.0,785.0,17.0,216.0,0.0,Andona Beston,True
9,0008_01,Europa,True,B/1/P,55 Cancri e,14.0,False,0.0,0.0,0.0,0.0,0.0,Erraiam Flatic,True


In [17]:
# Column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [10]:
# Check the number of rows x columns
df.shape

(8693, 14)

In [19]:
# Number of missing values per column
df.isna().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

## Limpieza de los datos

In [22]:
# Drop the Name column.
# df is rebound instead of mutated with inplace=True, and errors="ignore" is
# passed, so running this cell a second time no longer raises KeyError.
df = df.drop(columns="Name", errors="ignore")

## Transformación de los datos

In [29]:
# Build two new columns from PassengerId: split it once on "_" and keep
# the group part and the within-group number.
id_parts = df['PassengerId'].str.split('_', expand=True)
Group, NumberInGroup = id_parts[0], id_parts[1]

# The group part overwrites PassengerId, and the column is renamed to Group;
# casting to int removes the leading zeros.
df['PassengerId'] = Group
df = df.rename(columns={'PassengerId': 'Group'}).astype({'Group': 'int'})

# Add NumberInGroup right after Group, cast to int as well.
insert_position = 1
df.insert(insert_position, 'NumberInGroup', NumberInGroup.astype('int'))



In [33]:
# Build count_group: one row per Group value holding how many rows of df
# belong to that group, stored in a column named CountGroup.
count_group = (
    df.groupby('Group')['Group']
      .count()
      .rename_axis('Group')
      .to_frame('CountGroup')
)

count_group.head(10)


Unnamed: 0_level_0,CountGroup
Group,Unnamed: 1_level_1
1,1
2,1
3,2
4,1
5,1
6,2
7,1
8,3
9,1
10,1


In [34]:
# Creating CountGroup - number of people in the group.
# Each row looks up its group size in count_group in a single vectorised map.
# This replaces a Python loop that built a boolean mask over df for every
# group and used positional [0] indexing on a label-indexed Series.
group_sizes = df['Group'].map(count_group['CountGroup']).astype('int')

if 'CountGroup' in df.columns:
    # The cell was already executed: refresh the column instead of letting
    # df.insert fail on the duplicate column name.
    df['CountGroup'] = group_sizes
else:
    df.insert(2, 'CountGroup', group_sizes)

In [39]:
# Split Cabin on "/" once and keep its three parts (deck, number, side).
cabin_parts = df['Cabin'].str.split('/', expand=True)
deck, num, side = cabin_parts[0], cabin_parts[1], cabin_parts[2]

# The deck part overwrites Cabin, and the column is renamed to CabinType.
df['Cabin'] = deck
df = df.rename(columns={'Cabin': 'CabinType'})

# The remaining two parts are inserted at column positions 6 and 7.
for position, column_name, values in ((6, 'NumCabins', num), (7, 'Side', side)):
    df.insert(position, column_name, values)


In [44]:
# Summary statistics of the Age column
df['Age'].describe()

count    8514.000000
mean       28.827930
std        14.489021
min         0.000000
25%        19.000000
50%        27.000000
75%        38.000000
max        79.000000
Name: Age, dtype: float64

In [45]:
# Impute missing ages with the median age.
# fillna only touches the missing entries, so masking both sides of the
# assignment with isna() was redundant.
median = df['Age'].median()
df['Age'] = df['Age'].fillna(median)