# Step-by-Step Pandas Tutorial Using the Titanic Dataset
Let's create a basic tutorial on using pandas for data manipulation and matplotlib for visualization with the Titanic dataset.

## Step 1: Install Required Libraries
Ensure you have pandas and matplotlib installed. You can install them using pip if you haven't already:

In [None]:
!pip install pandas matplotlib

## Step 2: Import Necessary Libraries
First, we need to import the required libraries:

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

## Step 3: Load the Dataset
We'll download the dataset from the internet and load it into a `pandas` DataFrame.

In [None]:
# Location of the Titanic CSV file; read_csv is given the URL directly,
# so running this cell requires network access.
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
# Parse the CSV into the DataFrame used by the rest of the tutorial.
titanic = pd.read_csv(url)

In [None]:
# Display the dataset (a bare expression on the last line of a notebook cell
# is rendered as that cell's output)
titanic

## Step 4: Explore the Dataset
Let's take a quick look at the dataset:

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None,
# so it is called directly; wrapping it in print() adds a stray "None" line.
titanic.info()

In [None]:
# Compute the descriptive statistics first, then print the resulting table.
summary_stats = titanic.describe()
print(summary_stats)

## Step 5: Data Cleaning
Before analysis, we need to clean the data. We'll handle missing values and convert necessary columns to the correct data types:

In [None]:
# Fill missing age values with the median age.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on titanic['Age']: that chained in-place form
# raises a FutureWarning in pandas 2.x and leaves `titanic` unchanged once
# Copy-on-Write is enabled (the default behaviour from pandas 3.0).
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].median())

In [None]:
# Fill missing embarked values with the most common port.
# mode() returns a Series (there can be ties), so [0] picks the first mode.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on titanic['Embarked']: that chained in-place form
# raises a FutureWarning in pandas 2.x and leaves `titanic` unchanged once
# Copy-on-Write is enabled (the default behaviour from pandas 3.0).
titanic['Embarked'] = titanic['Embarked'].fillna(titanic['Embarked'].mode()[0])

In [None]:
# 'Cabin' has too many missing values to be useful, so remove that column
# from the DataFrame in place.
titanic.drop(labels='Cabin', axis='columns', inplace=True)

In [None]:
# Store 'Survived' with pandas' categorical dtype (categories are inferred
# from the values present in the column).
titanic['Survived'] = titanic['Survived'].astype(pd.CategoricalDtype())

## Step 6: Data Analysis
Now, let's perform some basic data analysis.

### 6.1 Survival Rate by Groups


In [None]:
# Make sure the 'Survived' column is stored as integers: it was converted to a
# categorical type during data cleaning, and the group means computed below
# need numeric values.
titanic['Survived'] = titanic['Survived'].astype(int)

In [None]:
# Average of the integer 'Survived' column within each 'Sex' group,
# i.e. the survival rate per gender.
survival_rate_by_gender = titanic['Survived'].groupby(titanic['Sex']).mean()
print(survival_rate_by_gender)

In [None]:
# Average of the integer 'Survived' column within each 'Pclass' group,
# i.e. the survival rate per passenger class.
survival_rate_by_class = titanic['Survived'].groupby(titanic['Pclass']).mean()
print(survival_rate_by_class)

## Step 7: Data Visualization

We'll use matplotlib to create some basic visualizations.

In [None]:
# Age histogram: 30 bins, black bar outlines, slightly transparent fill
age_values = titanic['Age']
plt.hist(age_values, bins=30, alpha=0.7, edgecolor='k')
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.title('Distribution of Ages on the Titanic')
plt.show()

In [None]:
# Bar chart of the survival rate per gender, one colour per bar
survival_rate_by_gender.plot.bar(color=['blue', 'orange'])
plt.xlabel('Gender')
plt.ylabel('Survival Rate')
plt.title('Survival Rate by Gender')
plt.xticks(rotation=0)  # keep the category labels horizontal
plt.show()

In [None]:
# Bar chart of the survival rate per passenger class
survival_rate_by_class.plot.bar(color='green')
plt.xlabel('Class')
plt.ylabel('Survival Rate')
plt.title('Survival Rate by Class')
plt.xticks(rotation=0)  # keep the category labels horizontal
plt.show()

In [None]:
# Box plot of 'Age', with one box per value of 'Survived'
titanic.boxplot(by='Survived', column='Age', grid=False)
plt.suptitle('')  # clear the default figure-level title so only the custom one shows
plt.title('Age Distribution by Survival Status')
plt.xlabel('Survived')
plt.ylabel('Age')
plt.show()

## Step 8: Age Prediction

We'll use a statistical model to predict a passenger's age.

'Dummy variables' have to be calculated for all categorical variables: each category must be converted to a `(1, 0)` indicator column.

In [None]:
# One-hot encode the categorical columns. drop_first=True omits the first
# level of each encoded column, so it is implied when all the others are 0.
titantic_dummified = pd.get_dummies(
    data=titanic,
    columns=['Sex', 'Embarked', 'Pclass'],
    drop_first=True,
)

# Show the encoded dataset
titantic_dummified

In [None]:
# Split the encoded data into model inputs (X) and the prediction target (Y).

# X holds the feature data, i.e. the independent variables used as predictors.
features = ['Pclass_2', 'Pclass_3', 'SibSp', 'Parch', 'Fare', 'Sex_male', 'Embarked_Q', 'Embarked_S']
X = titantic_dummified.loc[:, features]

# Y is the single 'Age' column: the target variable the model should predict.
Y = titantic_dummified.loc[:, 'Age']

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Hold out 20% of the rows for testing and train on the remaining 80%.
# A fixed random_state makes the shuffle, and therefore the split, reproducible.
split_parts = train_test_split(X, Y, test_size=0.2, random_state=42)
X_train, X_test, Y_train, Y_test = split_parts

In [None]:
# Load up the Linear Regression model
# (constructed with its default settings; nothing is fitted at this point)
model = LinearRegression()

In [None]:
# Fit the regression on the training features and their known ages
model.fit(X=X_train, y=Y_train)

In [None]:
# With the model fitted, predict ages for the held-out test features
yhat = model.predict(X=X_test)

In [None]:
# Score the predictions against the true ages of the test set
r2 = r2_score(y_true=Y_test, y_pred=yhat)
mse = mean_squared_error(y_true=Y_test, y_pred=yhat)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

In [None]:
# Scatter of true vs. predicted ages for the test set
plt.scatter(x=Y_test, y=yhat, alpha=0.7)
plt.title("Actual vs Predicted Age")
plt.xlabel("Actual Age")
plt.ylabel("Predicted Age")
plt.show()

In [None]:
def predict_age(model, passenger_details):
    """
    Predict the age of an arbitrary passenger using the trained model.

    Parameters:
    - model: Trained regression model exposing ``predict``. If the model also
      exposes ``feature_names_in_`` (scikit-learn sets it when a model is
      fitted on a DataFrame), those names define the expected columns and
      their order; otherwise the tutorial's default feature list is used.
    - passenger_details: Dictionary mapping feature names to values. Features
      missing from the dictionary are filled with 0; keys that are not
      model features are ignored.

    Returns:
    - Predicted age (the first element of the model's prediction).
    """
    # Fallback column list, matching the features used to train the tutorial model
    default_columns = ['Pclass_2', 'Pclass_3', 'SibSp', 'Parch', 'Fare', 'Sex_male', 'Embarked_Q', 'Embarked_S']

    # Prefer the column names recorded by the fitted model so this helper
    # cannot drift out of sync with the features the model was trained on.
    fitted_names = getattr(model, 'feature_names_in_', None)
    required_columns = list(fitted_names) if fitted_names is not None else default_columns

    # Build a single row with exactly the required columns, in training order;
    # anything the caller did not supply defaults to 0.
    row = {col: passenger_details.get(col, 0) for col in required_columns}
    passenger_df = pd.DataFrame([row], columns=required_columns)

    # Predict age
    predicted_age = model.predict(passenger_df)
    return predicted_age[0]


In [None]:
# Example passenger, described with the same encoded feature names the model uses
sample_passenger = dict(
    Pclass_2=True,
    Pclass_3=False,
    SibSp=0,
    Parch=0,
    Fare=7.25,
    Sex_male=1,
    Embarked_Q=0,
    Embarked_S=1,
)

# Run the example passenger through the trained model and report the result
predicted_age = predict_age(model, sample_passenger)
print(f"Predicted Age: {predicted_age}")