# IO Operations and Statistical Method

In [66]:
import matplotlib.pyplot as plt
import numpy as np
np.random.seed(0)
import pandas as pd

In [67]:
def df_info(df: pd.DataFrame) -> None:
    return df.head(n=20).style

## File Operations

In [68]:
cols = [
    'Survived', 'Pclass', 'Sex', 'Age', 'SibSp',
    'Parch', 'Fare', 'Cabin', 'Embarked'
]

df = pd.read_csv("../data/titanic/dataset.csv")
print(df.columns)

df = pd.DataFrame(df[cols], index=df["PassengerId"])
df.index.name = 'ID'

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


In [69]:
df_info(df)

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,1.0,1.0,female,38.0,1.0,0.0,71.2833,C85,C
2,1.0,3.0,female,26.0,0.0,0.0,7.925,,S
3,1.0,1.0,female,35.0,1.0,0.0,53.1,C123,S
4,0.0,3.0,male,35.0,0.0,0.0,8.05,,S
5,0.0,3.0,male,,0.0,0.0,8.4583,,Q
6,0.0,1.0,male,54.0,0.0,0.0,51.8625,E46,S
7,0.0,3.0,male,2.0,3.0,1.0,21.075,,S
8,1.0,3.0,female,27.0,0.0,2.0,11.1333,,S
9,1.0,2.0,female,14.0,1.0,0.0,30.0708,,C
10,1.0,3.0,female,4.0,1.0,1.0,16.7,G6,S


In [70]:
df.iloc[:100].to_csv("../data/titanic/modified_dataset.csv")

In [71]:
df.iloc[:100].to_excel("../data/titanic/modified_dataset.xlsx")

In [72]:
df.iloc[:100].to_json("../data/titanic/modified_dataset.json")

In [73]:
df.iloc[:100].to_markdown("../data/titanic/modified_dataset.md")

In [74]:
df = pd.read_excel("../data/titanic/modified_dataset.xlsx")

df_info(df)

Unnamed: 0,ID,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,1,1,1,female,38.0,1,0,71.2833,C85,C
1,2,1,3,female,26.0,0,0,7.925,,S
2,3,1,1,female,35.0,1,0,53.1,C123,S
3,4,0,3,male,35.0,0,0,8.05,,S
4,5,0,3,male,,0,0,8.4583,,Q
5,6,0,1,male,54.0,0,0,51.8625,E46,S
6,7,0,3,male,2.0,3,1,21.075,,S
7,8,1,3,female,27.0,0,2,11.1333,,S
8,9,1,2,female,14.0,1,0,30.0708,,C
9,10,1,3,female,4.0,1,1,16.7,G6,S


### Statistical Functions

In [75]:
print(df["Age"].cov(df["Survived"]))

-0.4674475524475526


In [76]:
print(df["Age"].corr(df["Survived"]))

-0.0621681056311732


In [77]:
df_info(df.describe())

Unnamed: 0,ID,Survived,Pclass,Age,SibSp,Parch,Fare
count,100.0,100.0,100.0,78.0,100.0,100.0,100.0
mean,50.5,0.41,2.4,27.542692,0.72,0.44,29.524083
std,29.011492,0.494311,0.816497,15.266101,1.181336,0.967346,40.969411
min,1.0,0.0,1.0,0.83,0.0,0.0,7.225
25%,25.75,0.0,2.0,18.25,0.0,0.0,8.05
50%,50.5,0.0,3.0,26.5,0.0,0.0,15.675
75%,75.25,1.0,3.0,34.75,1.0,0.0,32.134375
max,100.0,1.0,3.0,71.0,5.0,5.0,263.0


In [78]:
df.agg(["min", "max", "median", "mean", "var", "std"])

Unnamed: 0,ID,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
min,1.0,0.0,1.0,female,0.83,0.0,0.0,7.225
max,100.0,1.0,3.0,male,71.0,5.0,5.0,263.0
median,50.5,0.0,3.0,,26.5,0.0,0.0,15.675
mean,50.5,0.41,2.4,,27.542692,0.72,0.44,29.524083
var,841.666667,0.244343,0.666667,,233.053854,1.395556,0.935758,1678.492615
std,29.011492,0.494311,0.816497,,15.266101,1.181336,0.967346,40.969411


In [79]:
print(df["Survived"].value_counts())

0    59
1    41
Name: Survived, dtype: int64
