In [None]:
# Import libraries
import pandas as pd
import numpy as np

# Exploratory walk-through of the Titanic passenger dataset: load the CSV,
# preview it, summarise a few columns, fill missing ages, and run some basic
# aggregation / NumPy / sorting / filtering examples.

# Load dataset (fetched over the network on every run)
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(url)

# Preview the data
print("First 5 rows:")
print(df.head())

# Shape and info
print("\nDataset shape:", df.shape)
print("\nDataset Info:")
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line afterwards.
df.info()

# Select specific columns
selected = df[['Survived', 'Pclass', 'Sex', 'Age', 'Fare']]
print("\nSelected Columns Preview:")
print(selected.head())

# Descriptive statistics (computed before the Age imputation below, so the
# Age figures here describe only the non-missing values)
print("\nDescriptive Statistics:")
print(selected.describe())

# Value counts (categorical data)
print("\nCount of Survived (0 = No, 1 = Yes):")
print(df['Survived'].value_counts())

print("\nPassenger Class Counts:")
print(df['Pclass'].value_counts())

print("\nGender Counts:")
print(df['Sex'].value_counts())

# Data cleaning: Fill missing Age values with mean
mean_age = df['Age'].mean()
# Assign the filled column back instead of calling
# df['Age'].fillna(..., inplace=True): that chained in-place call operates on
# an intermediate object, raises a FutureWarning, and will no longer update
# df in pandas 3.0.
df['Age'] = df['Age'].fillna(mean_age)

# Aggregation Example: Mean Fare by Class
mean_fare = df.groupby('Pclass')['Fare'].mean()
print("\nAverage Fare by Class:")
print(mean_fare)

# NumPy Calculations
fare_array = df['Fare'].to_numpy()
print("\nFare Stats with NumPy:")
print("Mean Fare:", np.mean(fare_array))
print("Max Fare:", np.max(fare_array))
# np.std uses ddof=0 (population standard deviation) by default, so this can
# differ slightly from the sample "std" reported by describe().
print("Standard Deviation:", np.std(fare_array))

# Sorting: Top 5 fares
print("\nTop 5 Passengers by Fare:")
print(df[['Name', 'Fare']].sort_values(by='Fare', ascending=False).head())

# Boolean Indexing: Passengers under 18
# Runs after the imputation above, so rows whose Age was missing are compared
# using the mean age rather than being dropped as NaN.
kids = df[df['Age'] < 18]
print("\nNumber of Passengers under 18:", kids.shape[0])
print(kids[['Name', 'Age', 'Survived']].head())


First 5 rows:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(mean_age, inplace=True)
