# IO Operations and Useful Methods

| Format Type | Data Description      | Reader         | Writer       |
|-------------|-----------------------|----------------|--------------|
| text        | CSV                   | read_csv       | to_csv       |
| text        | JSON                  | read_json      | to_json      |
| text        | HTML                  | read_html      | to_html      |
| text        | Local clipboard       | read_clipboard | to_clipboard |
| binary      | MS Excel              | read_excel     | to_excel     |
| binary      | HDF5 Format           | read_hdf       | to_hdf       |
| SQL         | SQL                   | read_sql       | to_sql       |

In [1]:
import matplotlib.pyplot as plt
import numpy as np
np.random.seed(0)
import pandas as pd

In [2]:
def df_info(df: pd.DataFrame) -> None:
    return df.head(n=20).style

## IO Operations

### Titanic Dataset

- PassengerId is the unique id of the row and it doesn't have any effect on target
- Survived is the target variable we are trying to predict (0 or 1):
    - 1 = Survived
    - 0 = Not Survived
- Pclass (Passenger Class) is the socio-economic status of the passenger and it is a categorical ordinal feature which has 3 unique values (1, 2 or 3):
    - 1 = Upper Class
    - 2 = Middle Class
    - 3 = Lower Class
- Name
- Sex
- Age
- SibSp is the total number of the passengers' siblings and spouse
- Parch is the total number of the passengers' parents and children
- Ticket is the ticket number of the passenger
- Fare is the passenger fare
- Cabin is the cabin number of the passenger
- Embarked is port of embarkation and it is a categorical feature which has 3 unique values (C, Q or S):
    - C = Cherbourg
    - Q = Queenstown
    - S = Southampton

*Embarked: sich einschiffen

In [3]:
df = pd.read_csv("../data/titanic/dataset.csv")
print(df.columns)

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


In [4]:
cols = [
    'Survived', 'Pclass',
    'Sex', 'Age', 'SibSp',
    'Parch', 'Fare',
    'Cabin', 'Embarked'
]

index_name = 'PassengerId'

In [5]:
df = pd.DataFrame(df[cols], index=df[index_name])
df.index.name = 'ID'

In [6]:
df_info(df)

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,1.0,1.0,female,38.0,1.0,0.0,71.2833,C85,C
2,1.0,3.0,female,26.0,0.0,0.0,7.925,,S
3,1.0,1.0,female,35.0,1.0,0.0,53.1,C123,S
4,0.0,3.0,male,35.0,0.0,0.0,8.05,,S
5,0.0,3.0,male,,0.0,0.0,8.4583,,Q
6,0.0,1.0,male,54.0,0.0,0.0,51.8625,E46,S
7,0.0,3.0,male,2.0,3.0,1.0,21.075,,S
8,1.0,3.0,female,27.0,0.0,2.0,11.1333,,S
9,1.0,2.0,female,14.0,1.0,0.0,30.0708,,C
10,1.0,3.0,female,4.0,1.0,1.0,16.7,G6,S


In [7]:
df.iloc[:100].to_csv("../data/titanic/modified_dataset.csv")

In [8]:
df = pd.read_csv("../data/titanic/modified_dataset.csv")
df_info(df)

Unnamed: 0,ID,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,1,1.0,1.0,female,38.0,1.0,0.0,71.2833,C85,C
1,2,1.0,3.0,female,26.0,0.0,0.0,7.925,,S
2,3,1.0,1.0,female,35.0,1.0,0.0,53.1,C123,S
3,4,0.0,3.0,male,35.0,0.0,0.0,8.05,,S
4,5,0.0,3.0,male,,0.0,0.0,8.4583,,Q
5,6,0.0,1.0,male,54.0,0.0,0.0,51.8625,E46,S
6,7,0.0,3.0,male,2.0,3.0,1.0,21.075,,S
7,8,1.0,3.0,female,27.0,0.0,2.0,11.1333,,S
8,9,1.0,2.0,female,14.0,1.0,0.0,30.0708,,C
9,10,1.0,3.0,female,4.0,1.0,1.0,16.7,G6,S


In [9]:
df.iloc[:100].to_excel("../data/titanic/modified_dataset.xlsx")

In [10]:
df = pd.read_excel("../data/titanic/modified_dataset.xlsx")
df_info(df)

Unnamed: 0.1,Unnamed: 0,ID,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,1,1,1,female,38.0,1,0,71.2833,C85,C
1,1,2,1,3,female,26.0,0,0,7.925,,S
2,2,3,1,1,female,35.0,1,0,53.1,C123,S
3,3,4,0,3,male,35.0,0,0,8.05,,S
4,4,5,0,3,male,,0,0,8.4583,,Q
5,5,6,0,1,male,54.0,0,0,51.8625,E46,S
6,6,7,0,3,male,2.0,3,1,21.075,,S
7,7,8,1,3,female,27.0,0,2,11.1333,,S
8,8,9,1,2,female,14.0,1,0,30.0708,,C
9,9,10,1,3,female,4.0,1,1,16.7,G6,S


In [11]:
df.iloc[:100].to_json("../data/titanic/modified_dataset.json")

In [12]:
df = pd.read_json("../data/titanic/modified_dataset.json")
df_info(df)

Unnamed: 0.1,Unnamed: 0,ID,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,1,1,1,female,38.0,1,0,71.2833,C85,C
1,1,2,1,3,female,26.0,0,0,7.925,,S
2,2,3,1,1,female,35.0,1,0,53.1,C123,S
3,3,4,0,3,male,35.0,0,0,8.05,,S
4,4,5,0,3,male,,0,0,8.4583,,Q
5,5,6,0,1,male,54.0,0,0,51.8625,E46,S
6,6,7,0,3,male,2.0,3,1,21.075,,S
7,7,8,1,3,female,27.0,0,2,11.1333,,S
8,8,9,1,2,female,14.0,1,0,30.0708,,C
9,9,10,1,3,female,4.0,1,1,16.7,G6,S


In [13]:
# It is possibly that this fails, if so skip the two cells below
!conda install --yes pytables

Collecting package metadata (current_repodata.json): ...working... done

  current version: 4.8.3
  latest version: 4.10.1

Please update conda by running

    $ conda update -n base -c defaults conda



Solving environment: ...working... done

## Package Plan ##

  environment location: C:\Users\schaf\Anaconda3\envs\pyData

  added / updated specs:
    - pytables


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    blosc-1.21.0               |       h19a0ad4_0         139 KB
    hdf5-1.10.6                |       h7ebc959_0         7.9 MB
    intel-openmp-2021.2.0      |     haa95532_616         1.8 MB
    lz4-c-1.9.3                |       h2bbff1b_0         131 KB
    mkl-2021.2.0               |     haa95532_296       115.5 MB
    mkl-service-2.3.0          |   py39h2bbff1b_1          49 KB
    mkl_fft-1.3.0              |   py39h277e83a_2         137 KB
    mkl_random-1.2.1           |

In [14]:
df.iloc[:100].to_hdf(
    "../data/titanic/modified_dataset.h5",
    key='df',
    mode='w'
)

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block2_values] [items->Index(['Sex', 'Cabin', 'Embarked'], dtype='object')]

  pytables.to_hdf(


In [15]:
df = pd.read_hdf("../data/titanic/modified_dataset.h5")
df_info(df)

Unnamed: 0.1,Unnamed: 0,ID,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,1,1,1,female,38.0,1,0,71.2833,C85,C
1,1,2,1,3,female,26.0,0,0,7.925,,S
2,2,3,1,1,female,35.0,1,0,53.1,C123,S
3,3,4,0,3,male,35.0,0,0,8.05,,S
4,4,5,0,3,male,,0,0,8.4583,,Q
5,5,6,0,1,male,54.0,0,0,51.8625,E46,S
6,6,7,0,3,male,2.0,3,1,21.075,,S
7,7,8,1,3,female,27.0,0,2,11.1333,,S
8,8,9,1,2,female,14.0,1,0,30.0708,,C
9,9,10,1,3,female,4.0,1,1,16.7,G6,S


## Useful Methods

In [16]:
df_info(df.describe())

Unnamed: 0.1,Unnamed: 0,ID,Survived,Pclass,Age,SibSp,Parch,Fare
count,100.0,100.0,100.0,100.0,78.0,100.0,100.0,100.0
mean,49.5,50.5,0.41,2.4,27.542692,0.72,0.44,29.524083
std,29.011492,29.011492,0.494311,0.816497,15.266101,1.181336,0.967346,40.969411
min,0.0,1.0,0.0,1.0,0.83,0.0,0.0,7.225
25%,24.75,25.75,0.0,2.0,18.25,0.0,0.0,8.05
50%,49.5,50.5,0.0,3.0,26.5,0.0,0.0,15.675
75%,74.25,75.25,1.0,3.0,34.75,1.0,0.0,32.134375
max,99.0,100.0,1.0,3.0,71.0,5.0,5.0,263.0


In [17]:
df.agg(["min", "max", "median", "mean", "var", "std"])

Unnamed: 0.1,Unnamed: 0,ID,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
min,0.0,1.0,0.0,1.0,female,0.83,0.0,0.0,7.225
max,99.0,100.0,1.0,3.0,male,71.0,5.0,5.0,263.0
median,49.5,50.5,0.0,3.0,,26.5,0.0,0.0,15.675
mean,49.5,50.5,0.41,2.4,,27.542692,0.72,0.44,29.524083
var,841.666667,841.666667,0.244343,0.666667,,233.053854,1.395556,0.935758,1678.492615
std,29.011492,29.011492,0.494311,0.816497,,15.266101,1.181336,0.967346,40.969411


In [18]:
print(df["Survived"].value_counts())

0    59
1    41
Name: Survived, dtype: int64
