AIM #1: Loading the dataset and printing basic information 
1. Import the Titanic dataset using pandas
2. Create a Dataframe from the dataset
3. Print the first 10 rows of the dataset
4. Print the last 20 rows of the dataset
5. Print dataset's information
6. Describe the dataset
7. Make sure all the information returned by the different functions is displayed in a single table and not split across multiple lines

In [None]:
import pandas as pd
import io

# 1-2. Import the Titanic dataset from a local CSV file into a DataFrame.
# A plain relative file name is used instead of '.\\titanic.csv': the
# backslash is only a path separator on Windows, so the old form could not
# be opened on other platforms. This also matches the later cells.
csv_file_path = 'titanic.csv'
df = pd.read_csv(csv_file_path)

# 3. Print the first 10 rows of the dataset.
# to_string() renders the frame as one table (task item 7) rather than
# relying on the default repr.
first_10_rows = df.head(10)
print("=== First 10 Rows ===")
print(first_10_rows.to_string())

# 4. Print the last 20 rows of the dataset.
last_20_rows = df.tail(20)
print("\n=== Last 20 Rows ===")
print(last_20_rows.to_string())

# 5. Print the dataset's information.
# DataFrame.info() writes its report to a stream, so it is captured in an
# in-memory buffer and printed under the section heading.
info_buffer = io.StringIO()
df.info(buf=info_buffer)
info_str = info_buffer.getvalue()
print("\n=== Dataset Information ===")
print(info_str)

# 6. Describe the dataset; include='all' also summarises non-numeric columns.
description = df.describe(include='all')
print("=== Dataset Description ===")
print(description.to_string())


AIM #2: Finding issues (empty, NAs, incorrect value, incorrect format, outliers, etc.) 
1. Find out how many missing values there are in the dataset
2. For the 'Age' column, find the best way to handle the missing values
    2.1. Use an appropriate plot to study the nature of the 'Age' column
    2.2. Figure out what is the best way to calculate the central tendency of the 'Age' column based on the above plot
    2.3. Using the most suitable central tendency measure, fill the missing values in the age column
3. Decide what is the best way to handle the missing values in the 'Cabin' column
4. Similarly, decide what is the best way to handle the missing values in the 'Embarked' column
5. Handle the incorrect data under the 'Survived' column using an appropriate measure
6. Handle the incorrectly formatted data under the 'Fare' column


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Load a fresh copy of the dataset for the cleaning steps.
# A plain relative file name is used instead of '.\\titanic.csv': the
# backslash is only a path separator on Windows.
csv_file_path = 'titanic.csv'
df = pd.read_csv(csv_file_path)

# 1. Find missing values (count of nulls per column).
missing_values = df.isnull().sum()
print("=== Missing Values in Each Column ===")
print(missing_values)

# 2. Handle missing 'Age' values
# 2.1 Plot 'Age' distribution: histogram + KDE on the left, box plot on the
# right, sharing one figure.
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
sns.histplot(df['Age'], kde=True, bins=30, color='skyblue')
plt.title('Age Distribution')
plt.xlabel('Age')
plt.ylabel('Frequency')

plt.subplot(1, 2, 2)
sns.boxplot(x=df['Age'], color='lightgreen')
plt.title('Age Boxplot')
plt.xlabel('Age')

plt.tight_layout()
plt.show()

# 2.2 Determine central tendency: the skewness is reported so the reader can
# judge how symmetric the distribution is.
skewness = df['Age'].skew()
print(f"\nSkewness of Age: {skewness:.2f}")

# 2.3 Fill missing 'Age' with mean
# NOTE(review): the mean is used regardless of the skewness printed above;
# confirm this is the intended measure for this dataset.
mean_age = df['Age'].mean()
print(f"Mean Age: {mean_age:.2f}")

# Assign the result back instead of calling fillna(inplace=True) on the
# column selection: the chained in-place form is deprecated and does not
# modify `df` when pandas Copy-on-Write is active.
df['Age'] = df['Age'].fillna(mean_age)

# 3. Handle missing 'Cabin' values: report how much is missing, then drop
# the column.
cabin_missing_percentage = df['Cabin'].isnull().sum() / len(df) * 100
print(f"\nMissing Percentage in 'Cabin': {cabin_missing_percentage:.2f}%")

df = df.drop(columns='Cabin')
print("\nDropped 'Cabin' column.")

# 4. Handle missing 'Embarked' values by filling with the most frequent value.
embarked_missing = df['Embarked'].isnull().sum()
print(f"\nMissing Values in 'Embarked': {embarked_missing}")

mode_embarked = df['Embarked'].mode()[0]
df['Embarked'] = df['Embarked'].fillna(mode_embarked)
print(f"Filled missing 'Embarked' values with mode: {mode_embarked}")

# 5. Verify 'Survived' column
# NOTE(review): this only lists the distinct values; nothing is corrected
# here. Decide how unexpected values should be handled once they are known.
unique_survived = df['Survived'].unique()
print(f"\nUnique values in 'Survived': {unique_survived}")

# 6. Ensure 'Fare' is numeric; entries that cannot be parsed become NaN.
df['Fare'] = pd.to_numeric(df['Fare'], errors='coerce')

fare_missing = df['Fare'].isnull().sum()
print(f"\nMissing values in 'Fare' after coercion: {fare_missing}")

if fare_missing > 0:
    # Replace the values lost to coercion with the median fare.
    median_fare = df['Fare'].median()
    df['Fare'] = df['Fare'].fillna(median_fare)
    print(f"Filled missing 'Fare' values with median: {median_fare:.2f}")
else:
    print("All 'Fare' values are correctly formatted.")

# Final verification
print("\n=== Remaining Missing Values ===")
print(df.isnull().sum())


AIM #3: Grouping 
1. Find out the average fare grouped by Pclass
    1.1. Plot the above using a suitable plot
2. Find out the average fare grouped by Sex
    2.1. Plot the above using a suitable plot

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

def _show_fare_bars(averages, bar_color, chart_title, x_label):
    """Draw one bar per group for a Series of average fares and show it."""
    averages.plot(kind='bar', color=bar_color)
    plt.title(chart_title)
    plt.xlabel(x_label)
    plt.ylabel('Average Fare')
    plt.xticks(rotation=0)  # keep the group labels horizontal
    plt.show()


# Start from a fresh copy of the data for the grouping exercise.
df = pd.read_csv('titanic.csv')

# Show the 'Fare' column exactly as loaded.
print("Original data type of 'Fare' column:", df['Fare'].dtype)
print("First 10 rows of the original 'Fare' column:")
print(df['Fare'].head(10))

# Convert 'Fare' to numbers; entries that cannot be parsed become NaN.
df['Fare'] = pd.to_numeric(df['Fare'], errors='coerce')

# Show the same column after conversion.
print("Data type of 'Fare' column after conversion:", df['Fare'].dtype)
print("First 10 rows of the 'Fare' column after conversion:")
print(df['Fare'].head(10))

# Keep only the rows that still have a fare.
df = df[df['Fare'].notna()]

# 1. Average fare per passenger class.
average_fare_by_pclass = df.groupby('Pclass')['Fare'].agg('mean')
print("\nAverage fare by Pclass:")
print(average_fare_by_pclass)

# 1.1 Bar chart of the per-class averages.
_show_fare_bars(
    average_fare_by_pclass,
    'skyblue',
    'Average Fare by Passenger Class',
    'Passenger Class',
)

# 2. Average fare per sex.
average_fare_by_sex = df.groupby('Sex')['Fare'].agg('mean')
print("\nAverage fare by Sex:")
print(average_fare_by_sex)

# 2.1 Bar chart of the per-sex averages.
_show_fare_bars(
    average_fare_by_sex,
    'salmon',
    'Average Fare by Gender',
    'Gender',
)


AIM #4: Dataset visualization using pandas

1. Plot the distribution of 'Age' using a suitable plot
2. Plot the distribution of 'Fare' using a suitable plot
3. Plot the distribution of 'Pclass' using a suitable plot
4. Plot the distribution of 'Survived' using a suitable plot
5. Plot the distribution of 'Embarked' using a suitable plot
6. Plot the distribution of 'Fare' grouped by 'Survived'
7. Plot the distribution of 'Fare' grouped by 'Pclass'
8. Plot the distribution of 'Age' grouped by 'Survived'
9. Plot the distribution of 'Age' grouped by 'Pclass'
10. Combine the 'SibSp' and 'Parch' and plot its distribution grouped by 'Survived'
11. Combine the 'SibSp' and 'Parch' and plot its distribution grouped by 'Pclass'
12. Plot a distribution between 'Age' and 'Fare' to see if there's any relationship
13. Are there any other possibilities to show relationships?

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load a fresh copy of the dataset for the visualisations.
df = pd.read_csv('titanic.csv')

# 'Fare' is treated as badly formatted elsewhere in this notebook, so coerce
# it to numbers here as well; otherwise the fare plots and the correlation
# heatmap would work on text. Unparseable entries become NaN.
df['Fare'] = pd.to_numeric(df['Fare'], errors='coerce')

# 1. Plot the distribution of 'Age'
plt.figure(figsize=(10, 6))
sns.histplot(df['Age'].dropna(), kde=True, bins=30)
plt.title('Distribution of Age')
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.show()

# 2. Plot the distribution of 'Fare'
plt.figure(figsize=(10, 6))
sns.histplot(df['Fare'].dropna(), kde=True, bins=30)
plt.title('Distribution of Fare')
plt.xlabel('Fare')
plt.ylabel('Frequency')
plt.show()

# 3. Plot the distribution of 'Pclass'
# The column is passed by keyword: a positional Series is interpreted as
# `data` by current seaborn releases, which does not give per-class counts.
plt.figure(figsize=(10, 6))
sns.countplot(x='Pclass', data=df)
plt.title('Distribution of Pclass')
plt.xlabel('Pclass')
plt.ylabel('Count')
plt.show()

# 4. Plot the distribution of 'Survived'
plt.figure(figsize=(10, 6))
sns.countplot(x='Survived', data=df)
plt.title('Distribution of Survived')
plt.xlabel('Survived')
plt.ylabel('Count')
plt.show()

# 5. Plot the distribution of 'Embarked' (rows without a port are left out)
plt.figure(figsize=(10, 6))
sns.countplot(x='Embarked', data=df.dropna(subset=['Embarked']))
plt.title('Distribution of Embarked')
plt.xlabel('Embarked')
plt.ylabel('Count')
plt.show()

# 6. Plot the distribution of 'Fare' grouped by 'Survived'
plt.figure(figsize=(10, 6))
sns.boxplot(x='Survived', y='Fare', data=df)
plt.title('Distribution of Fare grouped by Survived')
plt.xlabel('Survived')
plt.ylabel('Fare')
plt.show()

# 7. Plot the distribution of 'Fare' grouped by 'Pclass'
plt.figure(figsize=(10, 6))
sns.boxplot(x='Pclass', y='Fare', data=df)
plt.title('Distribution of Fare grouped by Pclass')
plt.xlabel('Pclass')
plt.ylabel('Fare')
plt.show()

# 8. Plot the distribution of 'Age' grouped by 'Survived'
plt.figure(figsize=(10, 6))
sns.boxplot(x='Survived', y='Age', data=df)
plt.title('Distribution of Age grouped by Survived')
plt.xlabel('Survived')
plt.ylabel('Age')
plt.show()

# 9. Plot the distribution of 'Age' grouped by 'Pclass'
plt.figure(figsize=(10, 6))
sns.boxplot(x='Pclass', y='Age', data=df)
plt.title('Distribution of Age grouped by Pclass')
plt.xlabel('Pclass')
plt.ylabel('Age')
plt.show()

# 10. Combine the 'SibSp' and 'Parch' and plot its distribution grouped by 'Survived'
# The combined column is the plain sum of the two counts.
df['Family_Size'] = df['SibSp'] + df['Parch']

plt.figure(figsize=(10, 6))
sns.boxplot(x='Survived', y='Family_Size', data=df)
plt.title('Family Size grouped by Survived')
plt.xlabel('Survived')
plt.ylabel('Family Size (SibSp + Parch)')
plt.show()

# 11. Combine the 'SibSp' and 'Parch' and plot its distribution grouped by 'Pclass'
plt.figure(figsize=(10, 6))
sns.boxplot(x='Pclass', y='Family_Size', data=df)
plt.title('Family Size grouped by Pclass')
plt.xlabel('Pclass')
plt.ylabel('Family Size (SibSp + Parch)')
plt.show()

# 12. Plot a distribution between 'Age' and 'Fare' to see if there's any relationship
plt.figure(figsize=(10, 6))
sns.scatterplot(x='Age', y='Fare', data=df)
plt.title('Scatter plot of Age vs Fare')
plt.xlabel('Age')
plt.ylabel('Fare')
plt.show()

# 13. Additional Possibilities: Correlation heatmap to see relationships between variables
# Only numeric columns can be correlated; 'number' selects every numeric
# dtype rather than just float64/int64.
numeric_df = df.select_dtypes(include='number')
plt.figure(figsize=(10, 6))
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()


AIM #5: Correlation

1. Generate a correlation matrix for the entire dataset
2. Find correlation between 'Age' and 'Fare'
3. What other possible correlations can be found in the dataset?

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# NOTE: this cell does not load the CSV itself; it uses whatever `df` an
# earlier cell left in scope, including any columns added there.
print("Columns in the dataset:", df.columns)
print("Data types of columns:\n", df.dtypes)

# Make sure the two columns of interest are numeric before correlating;
# entries that cannot be parsed become NaN.
if 'Fare' in df.columns and 'Age' in df.columns:
    df['Fare'] = pd.to_numeric(df['Fare'], errors='coerce')
    df['Age'] = pd.to_numeric(df['Age'], errors='coerce')
else:
    print("Columns 'Fare' or 'Age' not found in the dataset.")

# Correlation is only defined for numeric columns.
numeric_df = df.select_dtypes(include=['float64', 'int64'])

# 1. Correlation matrix for the numeric part of the dataset, as a heatmap.
correlation_matrix = numeric_df.corr()
plt.figure(figsize=(10, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix for Numeric Features')
plt.show()

# 2. Correlation between 'Age' and 'Fare'.
if 'Fare' in numeric_df.columns and 'Age' in numeric_df.columns:
    age_fare_correlation = numeric_df['Age'].corr(numeric_df['Fare'])
    print(f"Correlation between Age and Fare: {age_fare_correlation}")
else:
    print("Either 'Age' or 'Fare' is not present in the numeric DataFrame.")

# 3. Other correlations: rank every distinct pair of columns by strength.
# The matrix is symmetric with a diagonal of self-correlations, so keep only
# entries whose first column name sorts before the second. That removes the
# diagonal and the mirrored duplicates without comparing float values.
correlation_pairs = correlation_matrix.unstack().dropna()
first_column = correlation_pairs.index.get_level_values(0)
second_column = correlation_pairs.index.get_level_values(1)
correlation_pairs = correlation_pairs[first_column < second_column]

# Strength is the magnitude of the coefficient, so rank by absolute value;
# the signed value is what gets printed, so strong negative relationships
# are reported too.
strongest_index = correlation_pairs.abs().nlargest(5).index
strongest_correlations = correlation_pairs.reindex(strongest_index)
print("Top 5 strongest correlations in the dataset:")
print(strongest_correlations)
