In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

*Exercise: Load the 'tips' dataset from seaborn library and perform the following data cleaning and preparation tasks:*
- Drop any missing values from the dataset.
- Convert the 'tip' column to integer values by rounding it to the nearest integer.
- Create a new column called 'tip_percentage' which represents the percentage of tip given by the customer, rounded to two decimal places.
- Group the data by 'day' and 'time' and calculate the average tip_percentage for each group.

### Drop Missing Values

In [2]:
tips = sns.load_dataset('tips')
tips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


**NOTE:** Dataset does not have any missing values

### Convert 'tip' column to integer by rounding to nearest integer

In [3]:
def nearest_integer(value: np.float64) -> np.int64:
    return round(value, 0)

tips['tip_int'] = tips['tip'].map(nearest_integer)
tips['tip_int'].dtype

# Another way
tips['tip'].round().astype(int).dtype

dtype('int32')

### Create a new column names 'tip_percentage' (percentage of tip given rounded to two decimal places)

In [4]:
tips['tip_percentage'] = tips['tip'].div(tips['total_bill']).round(2)
tips['tip_percentage']

0      0.06
1      0.16
2      0.17
3      0.14
4      0.15
       ... 
239    0.20
240    0.07
241    0.09
242    0.10
243    0.16
Name: tip_percentage, Length: 244, dtype: float64

### Group data by 'day' and 'time' and calculate the average tip_percentage for each group

In [5]:
tips.groupby(['day', 'time'])['tip_percentage'].mean()

day   time  
Thur  Lunch     0.161148
      Dinner    0.160000
Fri   Lunch     0.188571
      Dinner    0.158333
Sat   Lunch          NaN
      Dinner    0.153678
Sun   Lunch          NaN
      Dinner    0.166974
Name: tip_percentage, dtype: float64

## Exercise 2: Load the 'titanic' dataset from seaborn library and perform the following data cleaning and preparation tasks:

- Drop any missing values from the dataset.
- Create a new column called 'family_size' which represents the total number of family members (including the passenger) on board for each passenger.
- Replace the values in the 'sex' column with 0 for 'male' and 1 for 'female'.
- Create a new column called 'age_group' which represents the age group of each passenger. The age groups should be as follows:
    - Child: age < 18
    - Adult: 18 <= age < 60
    - Senior: age >= 60
- Group the data by 'sex', 'pclass', and 'age_group' and calculate the survival rate for each group.

### Drop any missing value from dataset

In [6]:
titanic = sns.load_dataset('titanic')
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [7]:
titanic = titanic.drop('deck', axis='columns')
titanic = titanic.dropna()

In [8]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 0 to 890
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     712 non-null    int64   
 1   pclass       712 non-null    int64   
 2   sex          712 non-null    object  
 3   age          712 non-null    float64 
 4   sibsp        712 non-null    int64   
 5   parch        712 non-null    int64   
 6   fare         712 non-null    float64 
 7   embarked     712 non-null    object  
 8   class        712 non-null    category
 9   who          712 non-null    object  
 10  adult_male   712 non-null    bool    
 11  embark_town  712 non-null    object  
 12  alive        712 non-null    object  
 13  alone        712 non-null    bool    
dtypes: bool(2), category(1), float64(2), int64(4), object(5)
memory usage: 69.0+ KB


### Create a new column called 'family_size' which represents the total number of family members (including the passenger) on board for each 

In [9]:
titanic['family_size'] = titanic['sibsp'] + titanic['parch'] + 1
titanic['family_size'].value_counts()


1    402
2    139
3     93
4     27
6     22
7     12
5     11
8      6
Name: family_size, dtype: int64

### Replace the values in the 'sex' column with 0 for 'male' and 1 for 'female'.

In [10]:
string_to_int_map = lambda sex: 1 if sex == 'female' else 0

titanic['sex'] = titanic['sex'].map(string_to_int_map)
titanic['sex']

# Another way
titanic['sex'] = titanic['sex'].replace({'male': 0, 'female': 1})

- Create a new column called 'age_group' which represents the age group of each passenger. The age groups should be as follows:
    - Child: age < 18
    - Adult: 18 <= age < 60
    - Senior: age >= 60

In [11]:
age_categorical_map = lambda age: 'Child' if age < 19 else 'Adult' if age < 60 else 'Senior'

titanic['age_group'] = titanic['age'].map(age_categorical_map)
titanic['age_group'].value_counts()

Adult     548
Child     139
Senior     25
Name: age_group, dtype: int64

### Group the data by 'sex', 'pclass', and 'age_group' and calculate the survival rate for each group.

In [12]:
from pandas.core.groupby.generic import DataFrameGroupBy


def survival_rate_map(grouped: DataFrameGroupBy):
    return grouped['survived'].count() / titanic['survived']


survival_rate = titanic.groupby(['sex', 'pclass', 'age_group'])['survived'].mean()
survival_rate.map(lambda survival_rate: round(survival_rate * 100, 2))


sex  pclass  age_group
0    1       Adult         41.46
             Child         80.00
             Senior        14.29
     2       Adult          6.25
             Child         60.00
             Senior        25.00
     3       Adult         13.64
             Child         21.57
             Senior         0.00
1    1       Adult         97.14
             Child         90.91
             Senior       100.00
     2       Adult         90.00
             Child        100.00
     3       Adult         41.38
             Child         51.16
             Senior       100.00
Name: survived, dtype: float64

## Exercise 3: Load the 'diamonds' dataset from seaborn library and perform the following data cleaning and preparation tasks:

- Drop any missing values from the dataset.
- Create a new column called 'price_per_carat' which represents the price per carat of each diamond.
- Replace the values in the 'cut' column with 1 for 'Fair', 2 for 'Good', 3 for 'Very Good', 4 for 'Premium', and 5 for 'Ideal'.
- Create a new column called 'carat_group' which represents the carat group of each diamond. The carat groups should be as follows:
    - Small: carat < 0.5
    - Medium: 0.5 <= carat < 1.0
    - Large: carat >= 1.0
- Group the data by 'cut', 'carat_group', and 'color' and calculate the average price per carat for each group.

### - Drop any missing values from the dataset.

In [13]:
diamonds = sns.load_dataset('diamonds')
diamonds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   carat    53940 non-null  float64 
 1   cut      53940 non-null  category
 2   color    53940 non-null  category
 3   clarity  53940 non-null  category
 4   depth    53940 non-null  float64 
 5   table    53940 non-null  float64 
 6   price    53940 non-null  int64   
 7   x        53940 non-null  float64 
 8   y        53940 non-null  float64 
 9   z        53940 non-null  float64 
dtypes: category(3), float64(6), int64(1)
memory usage: 3.0 MB


### Conclusion
    - No Null fields

### Create a new column called 'price_per_carat' which represents the price per carat of each diamond.

In [14]:
diamonds['price_per_carat'] = diamonds['price'] / diamonds['carat']
diamonds['price_per_carat'].head()

0    1417.391304
1    1552.380952
2    1421.739130
3    1151.724138
4    1080.645161
Name: price_per_carat, dtype: float64

### Replace the values in the 'cut' column with 1 for 'Fair', 2 for 'Good', 3 for 'Very Good', 4 for 'Premium', and 5 for 'Ideal'.

In [15]:
diamonds['cut'] = diamonds['cut'].rename({1: 'Fair', 2: 'Good', 3: 'Very Good', 4: 'Premium', 5: 'Ideal'})
diamonds['cut'].value_counts()

Ideal        21551
Premium      13789
Very Good    12081
Good          4904
Fair          1610
Name: cut, dtype: int64

### Create a new column called 'carat_group' which represents the carat group of each diamond. The carat groups should be as follows:
- Small: carat < 0.5
- Medium: 0.5 <= carat < 1.0
- Large: carat >= 1.0

In [28]:
diamonds['carat_group'] = pd.cut(diamonds['carat'], [-float('inf'), .5, 1., float('inf')], labels=['small', 'medium', 'large'], right=False, )
diamonds['carat_group'].value_counts()

# Another Solution
diamonds.loc[(diamonds['carat'] >= 0.5) & (diamonds['carat'] < 1.0), 'carat_group'] = 'Medium'
diamonds.loc[diamonds['carat'] >= 1.0, 'carat_group'] = 'Large'


large     19060
small     17674
medium    17206
Name: carat_group, dtype: int64

### Group the data by 'cut', 'carat_group', and 'color' and calculate the average price per carat for each group.

In [29]:
diamonds.groupby(['cut', 'carat_group', 'color'])['price_per_carat'].mean()

cut    carat_group  color
Ideal  small        D        2569.207086
                    E        2569.835337
                    F        2531.799035
                    G        2404.310577
                    H        2122.664894
                                ...     
Fair   large        F        4910.175870
                    G        4637.357089
                    H        4460.076019
                    I        4001.499980
                    J        3855.545718
Name: price_per_carat, Length: 105, dtype: float64

### Hard Exercise: Load the 'titanic' dataset from seaborn library and perform the following data cleaning and preparation tasks:

- Remove the 'deck' column and any rows that have missing values in the 'age' column.
- Replace the missing values in the 'embark_town' column with the mode (most frequent) value.
- Create a new column called 'family_size' which combines the 'sibsp' and 'parch' columns to give the total number of family members on board.
- Group the data by 'class' and 'sex' and calculate the following metrics for each group:
    - 'avg_age': the average age of passengers.
    - 'pct_survived': the percentage of passengers who survived.
    - 'pct_female': the percentage of female passengers.

### Convert the 'year', 'month', and 'day' columns into a single 'date' column with a datetime data type.

In [35]:
titanic = sns.load_dataset('titanic')
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [36]:
titanic = titanic.drop('deck', axis='columns')
titanic = titanic.dropna()
titanic.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 0 to 890
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     712 non-null    int64   
 1   pclass       712 non-null    int64   
 2   sex          712 non-null    object  
 3   age          712 non-null    float64 
 4   sibsp        712 non-null    int64   
 5   parch        712 non-null    int64   
 6   fare         712 non-null    float64 
 7   embarked     712 non-null    object  
 8   class        712 non-null    category
 9   who          712 non-null    object  
 10  adult_male   712 non-null    bool    
 11  embark_town  712 non-null    object  
 12  alive        712 non-null    object  
 13  alone        712 non-null    bool    
dtypes: bool(2), category(1), float64(2), int64(4), object(5)
memory usage: 69.0+ KB


### Replace the missing values in the 'embark_town' column with the mode (most frequent) value.

In [38]:
titanic['embark_town'].value_counts() 

(Southampton    554
 Cherbourg      130
 Queenstown      28
 Name: embark_town, dtype: int64,
 712)

### Conclusion
    - Since 554 + 130 + 28 = 712 we can observe there are no missing values to replace

### Create a new column called 'family_size' which combines the 'sibsp' and 'parch' columns to give the total number of family members on board.
    - Note: Already done earlier

### Group the data by 'class' and 'sex' and calculate the following metrics for each group:
    - 'avg_age': the average age of passengers.
    - 'pct_survived': the percentage of passengers who survived.
    - 'pct_female': the percentage of female passengers.

In [56]:
grouped = titanic.groupby(['class', 'sex']).agg(
    avg_age=('age', 'mean'),
    pct_survived=('survived', 'mean'),
    pct_female=('sex', lambda x: (x == 'female').mean())
).reset_index()
grouped

Unnamed: 0,class,sex,avg_age,pct_survived,pct_female
0,First,female,34.240964,0.963855,1.0
1,First,male,41.281386,0.39604,0.0
2,Second,female,28.722973,0.918919,1.0
3,Second,male,30.740707,0.151515,0.0
4,Third,female,21.75,0.460784,1.0
5,Third,male,26.507589,0.150198,0.0
