In [1]:
import seaborn as sns
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 500)

# Task-1: Define the Titanic dataset from within the Seaborn library
df = sns.load_dataset("titanic")

# Task-2: Find the number of male and female passengers in the Titanic data set
print(df["sex"].value_counts())

male      577
female    314
Name: sex, dtype: int64


In [2]:
# Task-3: Find the number of unique values for each column
print(df.nunique())

survived         2
pclass           3
sex              2
age             88
sibsp            7
parch            7
fare           248
embarked         3
class            3
who              3
adult_male       2
deck             7
embark_town      3
alive            2
alone            2
dtype: int64


In [3]:
# Task-4: Find the number of unique values
print(df['pclass'].nunique())

3


In [4]:
# Task-5: Find the number of unique values for pclass and parch column
print(df[['pclass', 'parch']].nunique())

pclass    3
parch     7
dtype: int64


In [5]:
# Task-6: Check the type of the embarked column. Change the type of the embarked column to Category and check that again.
print(df['embarked'].dtype)
df['embarked'] = df['embarked'].astype('category')
print(df['embarked'].dtype)

object
category


In [6]:
# Task-7: Show all information for those with embarked value c.
print((df[df['embarked'] == 'C']))

     survived  pclass     sex   age  sibsp  parch     fare embarked   class    who  adult_male deck embark_town alive  alone
1           1       1  female  38.0      1      0  71.2833        C   First  woman       False    C   Cherbourg   yes  False
9           1       2  female  14.0      1      0  30.0708        C  Second  child       False  NaN   Cherbourg   yes  False
19          1       3  female   NaN      0      0   7.2250        C   Third  woman       False  NaN   Cherbourg   yes   True
26          0       3    male   NaN      0      0   7.2250        C   Third    man        True  NaN   Cherbourg    no   True
30          0       1    male  40.0      0      0  27.7208        C   First    man        True  NaN   Cherbourg    no   True
..        ...     ...     ...   ...    ...    ...      ...      ...     ...    ...         ...  ...         ...   ...    ...
866         1       2  female  27.0      1      0  13.8583        C  Second  woman       False  NaN   Cherbourg   yes  False


In [7]:
# Task-8: Show all information for those with no embarked value S.
print((df[df['embarked'] != 'S']))

     survived  pclass     sex   age  sibsp  parch     fare embarked   class    who  adult_male deck embark_town alive  alone
1           1       1  female  38.0      1      0  71.2833        C   First  woman       False    C   Cherbourg   yes  False
5           0       3    male   NaN      0      0   8.4583        Q   Third    man        True  NaN  Queenstown    no   True
9           1       2  female  14.0      1      0  30.0708        C  Second  child       False  NaN   Cherbourg   yes  False
16          0       3    male   2.0      4      1  29.1250        Q   Third  child       False  NaN  Queenstown    no  False
19          1       3  female   NaN      0      0   7.2250        C   Third  woman       False  NaN   Cherbourg   yes   True
..        ...     ...     ...   ...    ...    ...      ...      ...     ...    ...         ...  ...         ...   ...    ...
875         1       3  female  15.0      0      0   7.2250        C   Third  child       False  NaN   Cherbourg   yes   True


In [8]:
# Task-9: Show all information for passengers younger than 30 years old and female.
print(df[(df["age"] < 30) & (df['who'] == 'woman')])

     survived  pclass     sex   age  sibsp  parch     fare embarked   class    who  adult_male deck  embark_town alive  alone
2           1       3  female  26.0      0      0   7.9250        S   Third  woman       False  NaN  Southampton   yes   True
8           1       3  female  27.0      0      2  11.1333        S   Third  woman       False  NaN  Southampton   yes  False
38          0       3  female  18.0      2      0  18.0000        S   Third  woman       False  NaN  Southampton    no  False
41          0       2  female  27.0      1      0  21.0000        S  Second  woman       False  NaN  Southampton    no  False
44          1       3  female  19.0      0      0   7.8792        Q   Third  woman       False  NaN   Queenstown   yes   True
..        ...     ...     ...   ...    ...    ...      ...      ...     ...    ...         ...  ...          ...   ...    ...
866         1       2  female  27.0      1      0  13.8583        C  Second  woman       False  NaN    Cherbourg   yes

In [9]:
# Task-10: Show information for passengers whose Fare is over 500 or 70 years old.
print(df[(df["fare"] > 500) | (df['age'] > 70)])

     survived  pclass     sex   age  sibsp  parch      fare embarked  class    who  adult_male deck  embark_town alive  alone
96          0       1    male  71.0      0      0   34.6542        C  First    man        True    A    Cherbourg    no   True
116         0       3    male  70.5      0      0    7.7500        Q  Third    man        True  NaN   Queenstown    no   True
258         1       1  female  35.0      0      0  512.3292        C  First  woman       False  NaN    Cherbourg   yes   True
493         0       1    male  71.0      0      0   49.5042        C  First    man        True  NaN    Cherbourg    no   True
630         1       1    male  80.0      0      0   30.0000        S  First    man        True    A  Southampton   yes   True
679         1       1    male  36.0      0      1  512.3292        C  First    man        True    B    Cherbourg   yes  False
737         1       1    male  35.0      0      0  512.3292        C  First    man        True    B    Cherbourg   yes

In [10]:
# Task-11: Find the sum of the null values in each variable.
print(df.isnull().sum())

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64


In [11]:
# Task-12: Remove who variable from the data set.
df = df.drop("who", axis=1)
print(df)

     survived  pclass     sex   age  sibsp  parch     fare embarked   class  adult_male deck  embark_town alive  alone
0           0       3    male  22.0      1      0   7.2500        S   Third        True  NaN  Southampton    no  False
1           1       1  female  38.0      1      0  71.2833        C   First       False    C    Cherbourg   yes  False
2           1       3  female  26.0      0      0   7.9250        S   Third       False  NaN  Southampton   yes   True
3           1       1  female  35.0      1      0  53.1000        S   First       False    C  Southampton   yes  False
4           0       3    male  35.0      0      0   8.0500        S   Third        True  NaN  Southampton    no   True
..        ...     ...     ...   ...    ...    ...      ...      ...     ...         ...  ...          ...   ...    ...
886         0       2    male  27.0      0      0  13.0000        S  Second        True  NaN  Southampton    no   True
887         1       1  female  19.0      0      

In [12]:
# Task-13: Fill in the empty values in the deck variable with the most repeated value (mode) of the deck variable.
df['deck'].fillna(df['deck'].mode()[0], inplace=True)
print(df['deck'].isnull().sum())
print(df['deck'])

0
0      C
1      C
2      C
3      C
4      C
      ..
886    C
887    B
888    C
889    C
890    C
Name: deck, Length: 891, dtype: category
Categories (7, object): ['A', 'B', 'C', 'D', 'E', 'F', 'G']


In [13]:
# Task-14: Fill in the blank values in the age variable with the median of the age variable.
print(df['age'].median())
df["age"].fillna(df['age'].median(), inplace=True)
print(df['age'])

28.0
0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888    28.0
889    26.0
890    32.0
Name: age, Length: 891, dtype: float64


In [14]:
# Task-15: Find the sum, count, mean values of the pclass and gender variables of the survived variable.
print(df.groupby(['pclass', 'sex'])['survived'].agg(['sum', 'count', 'mean']))

               sum  count      mean
pclass sex                         
1      female   91     94  0.968085
       male     45    122  0.368852
2      female   70     76  0.921053
       male     17    108  0.157407
3      female   72    144  0.500000
       male     47    347  0.135447


In [15]:
# Task-16: Write a function that gives 1 to those under 30 and 0 to those above and equal to 30.
# Create a variable named age_flag in the titanic dataset using the function you wrote (use apply and lambda constructs).

df["age_flag"] = df["age"].apply(lambda age: 1 if age < 30 else 0)
print(df)

     survived  pclass     sex   age  sibsp  parch     fare embarked   class  adult_male deck  embark_town alive  alone  age_flag
0           0       3    male  22.0      1      0   7.2500        S   Third        True    C  Southampton    no  False         1
1           1       1  female  38.0      1      0  71.2833        C   First       False    C    Cherbourg   yes  False         0
2           1       3  female  26.0      0      0   7.9250        S   Third       False    C  Southampton   yes   True         1
3           1       1  female  35.0      1      0  53.1000        S   First       False    C  Southampton   yes  False         0
4           0       3    male  35.0      0      0   8.0500        S   Third        True    C  Southampton    no   True         0
..        ...     ...     ...   ...    ...    ...      ...      ...     ...         ...  ...          ...   ...    ...       ...
886         0       2    male  27.0      0      0  13.0000        S  Second        True    C  Sou

In [16]:
# Task-17: Define the tips data set from the Seaborn library.
df_tips = sns.load_dataset('tips')
print(df_tips)

     total_bill   tip     sex smoker   day    time  size
0         16.99  1.01  Female     No   Sun  Dinner     2
1         10.34  1.66    Male     No   Sun  Dinner     3
2         21.01  3.50    Male     No   Sun  Dinner     3
3         23.68  3.31    Male     No   Sun  Dinner     2
4         24.59  3.61  Female     No   Sun  Dinner     4
..          ...   ...     ...    ...   ...     ...   ...
239       29.03  5.92    Male     No   Sat  Dinner     3
240       27.18  2.00  Female    Yes   Sat  Dinner     2
241       22.67  2.00    Male    Yes   Sat  Dinner     2
242       17.82  1.75    Male     No   Sat  Dinner     2
243       18.78  3.00  Female     No  Thur  Dinner     2

[244 rows x 7 columns]


In [17]:
# Task-18: Find the sum, min, max and average of the total_bill values according to the categories (Dinner, Lunch) of the time variable.
print(df_tips.groupby(['time'])['total_bill'].agg(['min', 'max', 'mean']))

         min    max       mean
time                          
Lunch   7.51  43.11  17.168676
Dinner  3.07  50.81  20.797159


In [18]:
# Task-19: Find the sum, min, max and average of total_bill values according to day and time variables.
print(df_tips.groupby(['day', 'time']).agg({'total_bill': ['sum', 'min', 'max', 'mean']}))

            total_bill                         
                   sum    min    max       mean
day  time                                      
Thur Lunch     1077.55   7.51  43.11  17.664754
     Dinner      18.78  18.78  18.78  18.780000
Fri  Lunch       89.92   8.58  16.27  12.845714
     Dinner     235.96   5.75  40.17  19.663333
Sat  Lunch        0.00    NaN    NaN        NaN
     Dinner    1778.40   3.07  50.81  20.441379
Sun  Lunch        0.00    NaN    NaN        NaN
     Dinner    1627.16   7.25  48.17  21.410000


In [19]:
# Task-20: Find the sum, min, max and average of the total_bill and type values of the lunch time and female customers according to the day
print(df_tips[(df_tips['time'] == "Lunch") & (df_tips['sex'] == "Female")].groupby(['day']).agg({'total_bill':['min', 'max', 'mean'],
                                                                             'tip': ['min', 'max', 'mean']}))

     total_bill                    tip                
            min    max      mean   min   max      mean
day                                                   
Thur       8.35  43.11  16.64871  1.25  5.17  2.561935
Fri       10.09  16.27  13.94000  2.00  3.48  2.745000
Sat         NaN    NaN       NaN   NaN   NaN       NaN
Sun         NaN    NaN       NaN   NaN   NaN       NaN


In [20]:
# Task-21: Use loc to find the average of orders with a size value less than 3 and a total_bill value greater than 10.
print(df_tips.loc[(df_tips['size'] < 3) & (df_tips['total_bill'] > 10), :].mean(numeric_only=True))

total_bill    17.184965
tip            2.638811
size           1.993007
dtype: float64


In [21]:
# Task-22: total_bill_tip_sum entities are your new variable creator. Returns the total of the total bill and tip paid for each of his lifetimes.
df_tips['total_bill_tip_sum'] = df_tips['total_bill'] + df_tips['tip']
print(df_tips)

     total_bill   tip     sex smoker   day    time  size  total_bill_tip_sum
0         16.99  1.01  Female     No   Sun  Dinner     2               18.00
1         10.34  1.66    Male     No   Sun  Dinner     3               12.00
2         21.01  3.50    Male     No   Sun  Dinner     3               24.51
3         23.68  3.31    Male     No   Sun  Dinner     2               26.99
4         24.59  3.61  Female     No   Sun  Dinner     4               28.20
..          ...   ...     ...    ...   ...     ...   ...                 ...
239       29.03  5.92    Male     No   Sat  Dinner     3               34.95
240       27.18  2.00  Female    Yes   Sat  Dinner     2               29.18
241       22.67  2.00    Male    Yes   Sat  Dinner     2               24.67
242       17.82  1.75    Male     No   Sat  Dinner     2               19.57
243       18.78  3.00  Female     No  Thur  Dinner     2               21.78

[244 rows x 8 columns]


In [22]:
# Task-23: Sort the data set from largest to smallest according to the total_bill_tip_sum variable and assign the first 30
df_t = df_tips.sort_values(by='total_bill_tip_sum', ascending=False).head(30)
print(df_t)

     total_bill    tip     sex smoker   day    time  size  total_bill_tip_sum
170       50.81  10.00    Male    Yes   Sat  Dinner     3               60.81
212       48.33   9.00    Male     No   Sat  Dinner     4               57.33
59        48.27   6.73    Male     No   Sat  Dinner     4               55.00
156       48.17   5.00    Male     No   Sun  Dinner     6               53.17
182       45.35   3.50    Male    Yes   Sun  Dinner     3               48.85
197       43.11   5.00  Female    Yes  Thur   Lunch     4               48.11
23        39.42   7.58    Male     No   Sat  Dinner     4               47.00
102       44.30   2.50  Female    Yes   Sat  Dinner     3               46.80
142       41.19   5.00    Male     No  Thur   Lunch     5               46.19
95        40.17   4.73    Male    Yes   Fri  Dinner     4               44.90
184       40.55   3.00    Male    Yes   Sun  Dinner     2               43.55
112       38.07   4.00    Male     No   Sun  Dinner     3       