AIM #1: Loading the dataset and printing basic information 
1. Import the Titanic dataset using pandas
2. Create a Dataframe from the dataset
3. Print the first 10 rows of the dataset
4. Print the last 20 rows of the dataset
5. Print dataset's information
6. Describe the dataset
7. Make sure all the information returned by the different functions is displayed in a single table and not split across multiple lines

In [None]:
import pandas as pd

# Render wide frames as a single table instead of wrapping the columns onto
# several lines (task item 7).
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)

# Load the dataset using pandas
titanic_data = pd.read_csv('/path/to/titanic.csv')  # Update the path if necessary

# Create a DataFrame
df = pd.DataFrame(titanic_data)

# Collect the head, tail and summary statistics up front
first_10_rows = df.head(10)
last_20_rows = df.tail(20)
dataset_description = df.describe()

# Displaying everything in a single format
print("First 10 rows:\n", first_10_rows)
print("\nLast 20 rows:\n", last_20_rows)

# DataFrame.info() writes its report straight to stdout and returns None, so
# it is called right under its header instead of being stored and printed
# (which would emit the report early and then print "None" here).
print("\nDataset information:")
df.info()

print("\nDataset description:\n", dataset_description)


AIM #2: Finding issues (empty, NAs, incorrect value, incorrect format, outliers, etc.) 
1. Find out how many missing values there are in the dataset
2. For the 'Age' column, find the best way to handle the missing values
    2.1. Use an appropriate plot to study the nature of the 'Age' column
    2.2. Figure out what is the best way to calculate the central tendency of the 'Age' column based on the above plot
    2.3. Using the most suitable central tendency measure, fill the missing values in the age column
3. Decide what is the best way to handle the missing values in the 'Cabin' column
4. Similarly, decide what is the best way to handle the missing values in the 'Embarked' column
5. Handle the incorrect data under the 'Survived' column using an appropriate measure
6. Handle the incorrectly formatted data under the 'Fare' column


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the Titanic CSV into a DataFrame
df = pd.read_csv('/path/to/titanic.csv')

# 1. Count the missing entries in every column
missing_values = df.isna().sum()

# 2.1. Histogram with a KDE overlay of the known ages, used to judge the
#      shape of the 'Age' distribution
known_ages = df['Age'].dropna()
plt.figure(figsize=(10, 6))
age_axes = sns.histplot(known_ages, kde=True, bins=30)
age_axes.set_title('Age Distribution with KDE')
age_axes.set_xlabel('Age')
age_axes.set_ylabel('Frequency')
plt.show()


# Report the per-column missing counts computed above
print("Missing values before handling:\n", missing_values)


In [None]:
# NOTE: every fill below assigns the result back to the column. The chained
# form `df[col].fillna(value, inplace=True)` operates on a temporary Series;
# it warns on pandas 2.x and leaves `df` unchanged under Copy-on-Write.

# 2.2. Central tendency for 'Age': the median, which is less sensitive to
#      skew and outliers than the mean (confirm against the histogram).
age_median = df['Age'].median()

# 2.3. Fill missing values in the 'Age' column with the median
df['Age'] = df['Age'].fillna(age_median)

# 3. 'Cabin': replace missing values with an explicit 'Unknown' label
df['Cabin'] = df['Cabin'].fillna('Unknown')

# 4. 'Embarked': fill missing values with the most frequent value
embarked_mode = df['Embarked'].mode()[0]
df['Embarked'] = df['Embarked'].fillna(embarked_mode)

# 5. 'Survived': anything that is not numeric, or is numeric but not 0/1, is
#    treated as invalid and replaced with the most frequent valid value.
survived = pd.to_numeric(df['Survived'], errors='coerce')
survived = survived.where(survived.isin([0, 1]))  # invalid codes -> NaN
df['Survived'] = survived.fillna(survived.mode()[0]).astype(int)

# 6. 'Fare': values that cannot be parsed as numbers become NaN and are then
#    filled with the mean of the parsable fares.
df['Fare'] = pd.to_numeric(df['Fare'], errors='coerce')
fare_mean = df['Fare'].mean()
df['Fare'] = df['Fare'].fillna(fare_mean)

print("\nMissing values after handling:\n", df.isnull().sum())


AIM #3: Grouping 
1. Find out the average fare grouped by Pclass
    1.1. Plot the above using a suitable plot
2. Find out the average fare grouped by Sex
    2.1. Plot the above using a suitable plot

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def _plot_average_fare(avg_fare, group_name, palette):
    """Show a bar chart of the mean fare for each value of ``group_name``."""
    plt.figure(figsize=(8, 6))
    sns.barplot(x=avg_fare.index, y=avg_fare.values, palette=palette)
    plt.title(f'Average Fare by {group_name}')
    plt.xlabel(group_name)
    plt.ylabel('Average Fare')
    plt.show()


# 1. Average fare grouped by Pclass, 1.1. plotted as a bar chart
avg_fare_by_pclass = df.groupby('Pclass')['Fare'].mean()
_plot_average_fare(avg_fare_by_pclass, 'Pclass', 'viridis')
print("Average Fare by Pclass:\n", avg_fare_by_pclass)

# 2. Average fare grouped by Sex, 2.1. plotted as a bar chart
avg_fare_by_sex = df.groupby('Sex')['Fare'].mean()
_plot_average_fare(avg_fare_by_sex, 'Sex', 'coolwarm')
print("Average Fare by Sex:\n", avg_fare_by_sex)


AIM #4: Dataset visualization using pandas

1. Plot the distribution of 'Age' using a suitable plot
2. Plot the distribution of 'Fare' using a suitable plot
3. Plot the distribution of 'Pclass' using a suitable plot
4. Plot the distribution of 'Survived' using a suitable plot
5. Plot the distribution of 'Embarked' using a suitable plot
6. Plot the distribution of 'Fare' grouped by 'Survived'
7. Plot the distribution of 'Fare' grouped by 'Pclass'
8. Plot the distribution of 'Age' grouped by 'Survived'
9. Plot the distribution of 'Age' grouped by 'Pclass'
10. Combine the 'SibSp' and 'Parch' columns and plot the distribution of the combined value grouped by 'Survived'
11. Combine the 'SibSp' and 'Parch' columns and plot the distribution of the combined value grouped by 'Pclass'
12. Plot a distribution between 'Age' and 'Fare' to see if there's any relationship
13. Are there any other possibilities to show relationships?

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the dataset if not already loaded
# df = pd.read_csv('/path/to/titanic.csv')

# Apply the seaborn grid theme for the plots in this cell. No figure is
# created here: a figure opened at this point is never drawn on and would be
# rendered as an empty output by the first plt.show().
sns.set(style="whitegrid")

# 1-2. Continuous columns ('Age', 'Fare'): histogram with a KDE overlay.
#      Missing ages are dropped before plotting; 'Fare' is plotted as is.
for column, values in (('Age', df['Age'].dropna()), ('Fare', df['Fare'])):
    plt.figure(figsize=(8, 6))
    sns.histplot(values, kde=True, bins=30)
    plt.title(f'{column} Distribution')
    plt.xlabel(column)
    plt.ylabel('Frequency')
    plt.show()

# 3-5. Discrete columns ('Pclass', 'Survived', 'Embarked'): one bar per value
for column in ('Pclass', 'Survived', 'Embarked'):
    plt.figure(figsize=(8, 6))
    sns.countplot(x=column, data=df)
    plt.title(f'{column} Distribution')
    plt.xlabel(column)
    plt.ylabel('Count')
    plt.show()

def _boxplot_by_group(group_col, value_col, value_title, value_label):
    """Show a box plot of ``value_col`` for each value of ``group_col``."""
    plt.figure(figsize=(8, 6))
    sns.boxplot(x=group_col, y=value_col, data=df)
    plt.title(f'{value_title} Distribution Grouped by {group_col}')
    plt.xlabel(group_col)
    plt.ylabel(value_label)
    plt.show()


# 6-9. 'Fare' and then 'Age', each grouped by 'Survived' and by 'Pclass'
for value_col in ('Fare', 'Age'):
    for group_col in ('Survived', 'Pclass'):
        _boxplot_by_group(group_col, value_col, value_col, value_col)

# 10-11. Family size is the sum of the 'SibSp' and 'Parch' columns; plot it
#        grouped by 'Survived' and by 'Pclass'
df['FamilySize'] = df['SibSp'] + df['Parch']
for group_col in ('Survived', 'Pclass'):
    _boxplot_by_group(
        group_col,
        'FamilySize',
        'Family Size',
        'Family Size (SibSp + Parch)',
    )

# 12. Scatter plot of 'Age' against 'Fare' to see if there's any relationship
plt.figure(figsize=(8, 6))
sns.scatterplot(x='Age', y='Fare', data=df)
plt.title('Relationship Between Age and Fare')
plt.xlabel('Age')
plt.ylabel('Fare')
plt.show()

# 13. Are there any other possibilities to show relationships?
# For example, a heatmap of the pairwise correlations between all numerical
# variables. Only numeric columns are passed to corr(): with text columns in
# the frame, DataFrame.corr() raises on pandas >= 2.0 instead of silently
# skipping them.
numeric_columns = df.select_dtypes(include='number')
plt.figure(figsize=(10, 8))
sns.heatmap(numeric_columns.corr(), annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Between Numerical Variables')
plt.show()


AIM #5: Correlation

1. Generate a correlation matrix for the entire dataset
2. Find correlation between 'Age' and 'Fare'
3. What other possible correlations can be found in the dataset?

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the dataset if not already loaded
# df = pd.read_csv('/path/to/titanic.csv')

# 1. Generate a correlation matrix for the entire dataset.
#    Correlation is only defined for numeric data, so the text columns are
#    excluded first; calling DataFrame.corr() on a frame that still contains
#    them raises on pandas >= 2.0.
correlation_matrix = df.select_dtypes(include='number').corr()

# Plot the correlation matrix using a heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix for Titanic Dataset')
plt.show()

# 2. Find correlation between 'Age' and 'Fare' (2x2 matrix for the two columns)
age_fare_correlation = df[['Age', 'Fare']].corr()

# Print the correlation between 'Age' and 'Fare'
print("Correlation between 'Age' and 'Fare':\n", age_fare_correlation)

# 3. What other possible correlations can be found in the dataset?
# Focus on how every numeric variable correlates with 'Survived', strongest
# positive first. The matrix from step 1 is reused rather than recomputed.
survived_correlations = correlation_matrix['Survived'].sort_values(ascending=False)

# Print correlations with 'Survived'
print("\nCorrelations with 'Survived':\n", survived_correlations)
