In [None]:
import pandas as pd
import seaborn as sns
#load the data from seaborn into a dataframe
df = sns.load_dataset("titanic") #titanic dataset comes with seaborn
df.head()




Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [None]:
#transform some columns for readability, renames them so that they are easier to read
df = df.rename(columns={
    "sibsp": "siblings_spouses",
    "parch": "parents_children"
})
df.columns

Index(['survived', 'pclass', 'sex', 'age', 'siblings_spouses',
       'parents_children', 'fare', 'embarked', 'class', 'who', 'adult_male',
       'deck', 'embark_town', 'alive', 'alone'],
      dtype='object')

In [None]:
#find out what data types they are
df.dtypes


survived               int64
pclass                 int64
sex                   object
age                  float64
siblings_spouses       int64
parents_children       int64
fare                 float64
embarked              object
class               category
who                   object
adult_male              bool
deck                category
embark_town           object
alive                 object
alone                   bool
dtype: object

In [7]:
'''
data types can be changed to to make them more suitable, i.e. take us less space in memory 
e.g. survived is a 0 or a 1 so it does not need to be an int64, could just be an int8
'''
#in to astype goes a python dict of the column name then what data type you want to change it to
df = df.astype({
    'survived':'Int8',
    'pclass': 'Int8',
    'age': 'Float32',
    'fare': 'Float32'
})
df.dtypes

survived                Int8
pclass                  Int8
sex                   object
age                  Float32
siblings_spouses       int64
parents_children       int64
fare                 Float32
embarked              object
class               category
who                   object
adult_male              bool
deck                category
embark_town           object
alive                 object
alone                   bool
dtype: object

In [None]:
#make categoricals explicit i.e. all things which are object type (have discreet values) changing them into category types helps you find if there are mistakes in the data
for col in ["sex", "embarked", "class", "who", "adult_male"]:
    if col in df.columns: #checks if the column name "col" exists in the dataframe
        df[col] = df[col].astype("category") #this replaces the thing in the dataframe with the same thing but now a category

'''
Internally Pandas stores a categorical column as:
a list of categories (unique values), and
an integer code for each row referencing which category it is.
Benefits:
Lower memory usage (especially useful for large datasets with repeated strings).
Faster groupby/aggregation/sorting in many cases.
Makes the columns set of possible values explicit (useful for data validation).
Preserves NaN values. Works for strings, booleans, numbers — anything hashable.
'''
df.dtypes



survived                Int8
pclass                  Int8
sex                 category
age                  Float32
siblings_spouses       int64
parents_children       int64
fare                 Float32
embarked            category
class               category
who                 category
adult_male          category
deck                category
embark_town           object
alive                 object
alone                   bool
dtype: object

In [13]:
#handling nulls
#suming the nulls in each col
df.isna().sum()

#change data or delete rows
#simple, defensive fixes 
df['age'] = df['age'].fillna(df['age'].median()) #fillna fills in the ages, in this case with the median age
df['embarked'] = df['embarked'].fillna(df['embarked'].mode()[0]) # categorical -> mode
df = df.dropna(subset=['fare']) #drop if missing fare (not missing fares)

df.isna().sum()

survived              0
pclass                0
sex                   0
age                   0
siblings_spouses      0
parents_children      0
fare                  0
embarked              0
class                 0
who                   0
adult_male            0
deck                688
embark_town           2
alive                 0
alone                 0
dtype: int64

In [None]:
#create a new column
# family_size = self + siblings/spouses + parent/children

df["family_size"] = df['siblings_spouses'] + df['parents_children'] + 1

#age bands (ordinal categorical) 
#cut puts things into bins
df['age_group'] = pd.cut(
    df['age'],
    bins=[0, 12, 18, 35, 55, 120],
    labels=['child', 'teen', 'young adult', 'adult', 'senior'],
    right=True
)

#fare per family using the family column I just made
df['fare_per_person'] = (df['fare'] / df['family_size']).round(2)
df.head(3)

Unnamed: 0,survived,pclass,sex,age,siblings_spouses,parents_children,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,family_size,age_group,fare_per_person
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False,2,young adult,3.62
1,1,1,female,38.0,1,0,71.283302,C,First,woman,False,C,Cherbourg,yes,False,2,adult,35.64
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True,1,young adult,7.93


In [22]:
#data quality checks duplicates (across a sensible subset of columns)
dupes = df.duplicated(subset=[
    'pclass', 'sex', 'age', 'fare', 'siblings_spouses', 'parents_children']).sum()
print('Duplicate rows: ', dupes)

#invalid ranges / values
invalid = {
    'negative_fare': int((df['fare'] < 0).sum()),
    'negative_age': int((df['age'] < 0).sum()),
    'bad_pclass': int((~df['pclass'].isin([1, 2, 3])).sum()),
    'bad_sex': int((~df['sex'].isin(["male", "female"])).sum())  
}
invalid

Duplicate rows:  136


{'negative_fare': 0, 'negative_age': 0, 'bad_pclass': 0, 'bad_sex': 0}