In [15]:
import numpy as np
import pandas as pd
import seaborn as sns

*Exercise: Load the 'tips' dataset from seaborn library and perform the following data cleaning and preparation tasks:*
- Drop any missing values from the dataset.
- Convert the 'tip' column to integer values by rounding it to the nearest integer.
- Create a new column called 'tip_percentage' which represents the percentage of tip given by the customer, rounded to two decimal places.
- Group the data by 'day' and 'time' and calculate the average tip_percentage for each group.

### Drop Missing Values

In [16]:
tips = sns.load_dataset('tips')
tips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


**NOTE:** Dataset does not have any missing values

### Convert 'tip' column to integer by rounding to nearest integer

In [24]:
def nearest_integer(value: np.float64) -> np.int64:
    return round(value, 0)

tips['tip_int'] = tips['tip'].map(nearest_integer)
tips['tip_int'].dtype

# Another way
tips['tip'].round().astype(int).dtype

dtype('int32')

### Create a new column names 'tip_percentage' (percentage of tip given rounded to two decimal places)

In [18]:
tips['tip_percentage'] = tips['tip'].div(tips['total_bill']).round(2)
tips['tip_percentage']

0      0.06
1      0.16
2      0.17
3      0.14
4      0.15
       ... 
239    0.20
240    0.07
241    0.09
242    0.10
243    0.16
Name: tip_percentage, Length: 244, dtype: float64

### Group data by 'day' and 'time' and calculate the average tip_percentage for each group

In [22]:
tips.groupby(['day', 'time'])['tip_percentage'].mean()

day   time  
Thur  Lunch     0.161148
      Dinner    0.160000
Fri   Lunch     0.188571
      Dinner    0.158333
Sat   Lunch          NaN
      Dinner    0.153678
Sun   Lunch          NaN
      Dinner    0.166974
Name: tip_percentage, dtype: float64

## Exercise 2: Load the 'titanic' dataset from seaborn library and perform the following data cleaning and preparation tasks:

- Drop any missing values from the dataset.
- Create a new column called 'family_size' which represents the total number of family members (including the passenger) on board for each passenger.
- Replace the values in the 'sex' column with 0 for 'male' and 1 for 'female'.
- Create a new column called 'age_group' which represents the age group of each passenger. The age groups should be as follows:
    - Child: age < 18
    - Adult: 18 <= age < 60
    - Senior: age >= 60
- Group the data by 'sex', 'pclass', and 'age_group' and calculate the survival rate for each group.

### Drop any missing value from dataset

In [32]:
titanic = sns.load_dataset('titanic')
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [33]:
titanic = titanic.drop('deck', axis='columns')
titanic = titanic.dropna()

In [34]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 0 to 890
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     712 non-null    int64   
 1   pclass       712 non-null    int64   
 2   sex          712 non-null    object  
 3   age          712 non-null    float64 
 4   sibsp        712 non-null    int64   
 5   parch        712 non-null    int64   
 6   fare         712 non-null    float64 
 7   embarked     712 non-null    object  
 8   class        712 non-null    category
 9   who          712 non-null    object  
 10  adult_male   712 non-null    bool    
 11  embark_town  712 non-null    object  
 12  alive        712 non-null    object  
 13  alone        712 non-null    bool    
dtypes: bool(2), category(1), float64(2), int64(4), object(5)
memory usage: 69.0+ KB
