In [28]:
# Ignore SQLITE warnings related to Decimal numbers in the titanic database
import warnings
warnings.filterwarnings('ignore')

In [29]:
# Import Dependencies
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import sessionmaker
from ydata_profiling import ProfileReport

In [30]:
# Point SQLAlchemy at the local titanic.sqlite file, with SQL echoing switched off
engine = create_engine(
    "sqlite:///titanic.sqlite",
    echo=False,
)

In [31]:
# Build an automap base, reflect the tables reachable through `engine`,
# and show the names of the classes that were mapped
Base = automap_base()
Base.prepare(autoload_with=engine)
mapped_names = Base.classes.keys()
mapped_names

['passenger']

In [32]:
# Keep a handle on the mapped class for the `passenger` table
Passenger = Base.classes["passenger"]

In [33]:
# Make a session factory bound to the engine, then open one session from it
Session = sessionmaker(engine)
session = Session()

In [34]:
# Build a query over Passenger and keep its underlying statement
passenger_query = session.query(Passenger)
query = passenger_query.statement

In [35]:
# Run the statement through the session's bind and load the result into a DataFrame
df = pd.read_sql(sql=query, con=session.bind)

In [36]:
# Preview the first five rows; leaving the frame as the cell's last expression
# lets the notebook render it as a table instead of wrapped plain text
df.head()

   id                                           name pclass    age     sex  \
0   1                   Allen, Miss Elisabeth Walton    1st  29.00  female   
1   2                    Allison, Miss Helen Loraine    1st   2.00  female   
2   3            Allison, Mr Hudson Joshua Creighton    1st  30.00    male   
3   4  Allison, Mrs Hudson JC (Bessie Waldo Daniels)    1st  25.00  female   
4   5                  Allison, Master Hudson Trevor    1st   0.92    male   

   survived  
0         1  
1         0  
2         0  
3         0  
4         1  


In [37]:
# Dimensions of the frame as (rows, columns)
df.shape

(1313, 6)

In [38]:
# Data type pandas assigned to each column
df.dtypes

id            int64
name         object
pclass       object
age         float64
sex          object
survived      int64
dtype: object

In [39]:
# Show any row that repeats an earlier row exactly
duplicate_mask = df.duplicated()
df[duplicate_mask]

Unnamed: 0,id,name,pclass,age,sex,survived


In [40]:
# Count of missing values in each column
null_flags = df.isna()
null_flags.sum()

id            0
name          0
pclass        0
age         557
sex           0
survived      0
dtype: int64

In [41]:
# Total number of missing cells across the whole frame
missing_per_column = df.isna().sum()
missing_per_column.sum()

557

In [42]:
# Percentage of all cells that are missing, rounded to one decimal place
missing_cells = df.isna().sum().sum()
round(missing_cells / df.size * 100, 1)

7.1

In [43]:
# Summary statistics (count, unique, top, freq) for the object-typed columns
df.describe(include=['object'])

Unnamed: 0,name,pclass,sex
count,1313,1313,1313
unique,1310,3,2
top,"Connolly, Miss Kate",3rd,male
freq,2,711,851


In [44]:
# Print how many rows fall under each distinct value of the selected columns
cat_cols = ['name', 'age', 'sex', 'survived']
# NOTE(review): 'pclass' is not listed here and 'age' is float64 — confirm the column choice is intended

for column in cat_cols:
    print(df.groupby(column).size())

name
Abbing, Mr Anthony                              1
Abbott, Master Eugene Joseph                    1
Abbott, Mr Rossmore Edward                      1
Abbott, Mrs Stanton (Rosa)                      1
Abelseth, Miss Anna Karen                       1
                                               ..
Zimmerman, Leo                                  1
de Brito, Mr Jose Joaquim                       1
de Villiers, Madame Berthe                      1
del Carlo, Mr Sebastiano                        1
del Carlo, Mrs Sebastiano (Argenia Genovese)    1
Length: 1310, dtype: int64
age
0.17     1
0.33     1
0.80     1
0.83     2
0.92     1
        ..
65.00    2
67.00    1
69.00    1
70.00    1
71.00    3
Length: 75, dtype: int64
sex
female    462
male      851
dtype: int64
survived
0    863
1    450
dtype: int64


In [45]:
# Build the profiling report for the passenger frame, then display it inline
profile = ProfileReport(
    df,
    title="Titanic Profile",
)
profile

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [46]:
# Write the profiling report out as an HTML file
report_path = "Titanic_analysis.html"
profile.to_file(report_path)

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]