In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# --- Step 1: Data Acquisition ---
# Load seaborn's bundled Titanic sample into `df`; every later step reads or
# modifies this frame.
print("--- Step 1: Data Acquisition ---")
df = sns.load_dataset('titanic')

# Preview the leading rows, then the per-column summary.
print("First 5 rows of the dataset:", df.head(), sep="\n")
print("\nDataset Summary (info()):")
df.info()  # info() writes its report to stdout itself, so it is not wrapped in print()
print("-" * 50)


# --- Step 2: Handling Missing Values ---
print("--- Step 2: Handling Missing Values ---")

# Per-column null counts before imputation, for comparison with the report
# printed at the end of this step.
print("\nMissing values before handling:")
print(df.isnull().sum())

# Age: impute with the column median.
df['age'] = df['age'].fillna(df['age'].median())

# Port of embarkation is stored twice: 'embark_town' (full town name) and
# 'embarked' (letter code). They are independent columns, so filling one does
# NOT fill the other; each is imputed with its own most frequent value.
mode_embark_town = df['embark_town'].mode()[0]
df['embark_town'] = df['embark_town'].fillna(mode_embark_town)
if 'embarked' in df.columns:
    df['embarked'] = df['embarked'].fillna(df['embarked'].mode()[0])

# Drop the Cabin information (the column is named 'deck' in the seaborn dataset).
# The membership test keeps this cell re-runnable: a second run reports the
# column as already gone instead of raising KeyError.
if 'deck' in df.columns:
    df = df.drop(columns='deck')
else:
    print("\n'deck' (Cabin) column not found or already dropped.")

print("\nMissing values after Step 2:")
print(df.isnull().sum())
print("-" * 50)


# --- Step 3: Handling Outliers in Fare ---
print("--- Step 3: Handling Outliers in Fare ---")

# IQR rule: fares further than 1.5 * IQR below Q1 or above Q3 are treated as extreme.
Q1, Q3 = df['fare'].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Cap (winsorize) rather than remove: values outside the fences are pulled
# back to the nearest fence, so no rows are lost.
df['fare'] = df['fare'].clip(lower=lower_bound, upper=upper_bound)

print("Fare statistics after outlier handling (IQR method applied):")
print(df['fare'].describe())
print("-" * 50)


# --- Step 4: Feature Engineering ---
print("--- Step 4: Feature Engineering ---")

# No 'title' feature is derived here: the frame previewed in Step 1 has no
# 'name' column to extract one from.

# familysize = sibsp + parch + 1
df['familysize'] = df['sibsp'].add(df['parch']).add(1)

# isalone = 1 when familysize is exactly 1, otherwise 0
df['isalone'] = (df['familysize'] == 1).astype(int)

# Show only the two columns created in this step.
print("Engineered features (first 5 rows):")
print(df.loc[:, ['familysize', 'isalone']].head())
print("-" * 50)


# --- Step 5: Categorical Encoding and Column Dropping ---


--- Step 1: Data Acquisition ---
First 5 rows of the dataset:
   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  

Dataset Summary (info()):
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Nul

In [None]:
print("--- Step 5: Categorical Encoding and Column Dropping ---")

# Columns removed before modelling. Several of these names (e.g. 'name',
# 'ticket', 'title') may not exist in this frame, hence errors='ignore'.
# NOTE(review): the others look like alternate encodings of columns that are
# kept ('class' vs 'pclass', 'alive' vs 'survived', 'embarked' vs
# 'embark_town') -- confirm before relying on that.
columns_to_drop = ['name', 'embarked', 'class', 'who', 'adult_male', 'alive', 'alone', 'ticket', 'title']
df = df.drop(columns=columns_to_drop, errors='ignore')

# One-hot encode the remaining string-valued columns so the feature matrix is
# fully numeric/boolean. drop_first=True drops one level per column to avoid a
# redundant (perfectly collinear) indicator. 'embark_town' is given the
# shorter 'embarked' prefix.
# Only columns still present are encoded, so re-running this cell is a no-op
# instead of a KeyError.
encoding_prefixes = {'sex': 'sex', 'embark_town': 'embarked'}
columns_to_encode = [col for col in encoding_prefixes if col in df.columns]
if columns_to_encode:
    df = pd.get_dummies(
        df,
        columns=columns_to_encode,
        prefix=[encoding_prefixes[col] for col in columns_to_encode],
        drop_first=True,
    )

print("DataFrame columns after encoding and dropping:")
print(df.columns.tolist())
print("-" * 50)

--- Step 5: Categorical Encoding and Column Dropping ---
DataFrame columns after encoding and dropping:
['survived', 'pclass', 'age', 'sibsp', 'parch', 'fare', 'familysize', 'isalone', 'sex_male', 'embarked_Queenstown', 'embarked_Southampton']
--------------------------------------------------


In [None]:
print("--- Step 6: Dimensionality Reduction (PCA) ---")

# Continuous features that are compressed into two principal components.
pca_features = ['age', 'fare', 'familysize']

# Standardise first: PCA is variance-driven, so unscaled features with large
# ranges would dominate the components.
X_pca = df[pca_features].copy()
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_pca)

pca = PCA(n_components=2)
principal_components = pca.fit_transform(X_scaled)

# Carry df's index onto the components so each row stays tied to its source
# row by label, not merely by position.
pca_df = pd.DataFrame(
    principal_components,
    index=df.index,
    columns=['principal_component_1', 'principal_component_2'],
)

# The printed figure is the SUM of the two components' explained-variance ratios.
print(f"PCA Explained Variance Ratio: {pca.explained_variance_ratio_.sum():.2f}")
print("New PCA feature structure (first 5 rows):")
print(pca_df.head())

# The raw inputs are now represented by the components; remove them from df.
df = df.drop(columns=pca_features)
print("-" * 50)

--- Step 6: Dimensionality Reduction (PCA) ---
PCA Explained Variance Ratio: 0.84
New PCA feature structure (first 5 rows):
   principal_component_1  principal_component_2
0              -0.332650              -0.904528
1               1.152268               1.582672
2              -0.847205              -0.532609
3               0.825513               1.077084
4              -1.008120               0.057714
--------------------------------------------------


In [None]:
print("--- Step 7: Data Cleaning ---")

initial_rows = len(df)

# By this point the continuous features live in `pca_df`, not `df`. Judging
# duplicates on `df` alone would discard passengers who differ only in those
# features, and would leave `pca_df` with rows that no longer match `df`.
# So: detect duplicates on the complete feature set, then filter BOTH frames
# with the same mask and give them matching fresh indexes.
assert df.index.equals(pca_df.index), "df and pca_df must describe the same rows"
keep_mask = ~df.join(pca_df).duplicated().to_numpy()
df = df.loc[keep_mask].reset_index(drop=True)
pca_df = pca_df.loc[keep_mask].reset_index(drop=True)
print(f"Removed {initial_rows - len(df)} duplicate rows.")

# Final sanity check: no nulls should survive the earlier imputation steps.
missing_total = int(df.isnull().sum().sum())
if missing_total == 0:
    print("Data Cleaning: No missing values remain in the processed DataFrame.")
else:
    print(f"WARNING: {missing_total} missing values found.")

print("Column types (dtypes):")
print(df.dtypes)
print("-" * 50)

--- Step 7: Data Cleaning ---
Removed 731 duplicate rows.
Data Cleaning: No missing values remain in the processed DataFrame.
Column types (dtypes):
survived                int64
pclass                  int64
sibsp                   int64
parch                   int64
isalone                 int64
sex_male                 bool
embarked_Queenstown      bool
embarked_Southampton     bool
dtype: object
--------------------------------------------------


In [None]:
print("--- Step 8: Final Dataset ---")

# Attach the principal components by index LABEL, then renumber the result.
# A positional concat after resetting df's index is only correct when df and
# pca_df have the same rows in the same order; if rows were removed from df in
# the meantime it pairs passengers with the wrong components and pads the
# frame with NaN rows.
final_df = df.join(pca_df, how='left').reset_index(drop=True)

# Every row kept in df must have found its components.
assert not final_df[list(pca_df.columns)].isnull().any().any(), \
    "some rows of df have no matching row in pca_df"

# Split into target vector and feature matrix.
y = final_df['survived']
X = final_df.drop(columns='survived')

print("Final Feature Matrix (X) - first 5 rows:")
print(X.head())
print(f"\nShape of the final feature matrix X: {X.shape}")
print(f"Shape of the target vector y: {y.shape}")
print("-" * 50)

--- Step 8: Final Dataset ---
Final Feature Matrix (X) - first 5 rows:
   pclass  sibsp  parch  isalone sex_male embarked_Queenstown  \
0     3.0    1.0    0.0      0.0     True               False   
1     1.0    1.0    0.0      0.0    False               False   
2     3.0    0.0    0.0      1.0    False               False   
3     1.0    1.0    0.0      0.0    False               False   
4     3.0    0.0    0.0      1.0     True               False   

  embarked_Southampton  principal_component_1  principal_component_2  
0                 True              -0.332650              -0.904528  
1                False               1.152268               1.582672  
2                 True              -0.847205              -0.532609  
3                 True               0.825513               1.077084  
4                 True              -1.008120               0.057714  

Shape of the final feature matrix X: (891, 9)
Shape of the target vector y: (891,)
----------------------------

In [None]:
import numpy as np
from scipy.linalg import lu

# Pivoted LU demo: factor A into a permutation P, a lower-triangular L and an
# upper-triangular U, then multiply them back together to confirm A = P @ L @ U.
print("LU Decomposition ")

A = np.array([[4, 3], [6, 3]])
P, L, U = lu(A)
A_reconstructed = P @ L @ U

# Each label is printed together with its matrix, in the order of the walkthrough.
for heading, matrix in (
    ("Matrix A:\n", A),
    ("\nPermutation matrix (P):\n", P),
    ("\nLower triangular matrix (L):\n", L),
    ("\nUpper triangular matrix (U):\n", U),
    ("\nReconstructed A (P @ L @ U):\n", A_reconstructed),
):
    print(heading, matrix)


LU Decomposition 
Matrix A:
 [[4 3]
 [6 3]]

Permutation matrix (P):
 [[0. 1.]
 [1. 0.]]

Lower triangular matrix (L):
 [[1.         0.        ]
 [0.66666667 1.        ]]

Upper triangular matrix (U):
 [[6. 3.]
 [0. 1.]]

Reconstructed A (P @ L @ U):
 [[4. 3.]
 [6. 3.]]


In [None]:
import numpy as np

# QR demo: factor A into Q and an upper-triangular R, then multiply them back
# together to confirm A = Q @ R (up to floating-point round-off).
print("\n QR Decomposition ")

A = np.array(
    [[1, 1, 0],
     [1, 0, 1],
     [0, 1, 1]],
    dtype=float,
)
Q, R = np.linalg.qr(A)
A_reconstructed = Q @ R

# Each label is printed together with its matrix, in the order of the walkthrough.
for heading, matrix in (
    ("Matrix A:\n", A),
    ("\nOrthogonal matrix (Q):\n", Q),
    ("\nUpper triangular matrix (R):\n", R),
    ("\nReconstructed A (Q @ R):\n", A_reconstructed),
):
    print(heading, matrix)



 QR Decomposition 
Matrix A:
 [[1. 1. 0.]
 [1. 0. 1.]
 [0. 1. 1.]]

Orthogonal matrix (Q):
 [[-0.70710678  0.40824829 -0.57735027]
 [-0.70710678 -0.40824829  0.57735027]
 [-0.          0.81649658  0.57735027]]

Upper triangular matrix (R):
 [[-1.41421356 -0.70710678 -0.70710678]
 [ 0.          1.22474487  0.40824829]
 [ 0.          0.          1.15470054]]

Reconstructed A (Q @ R):
 [[ 1.00000000e+00  1.00000000e+00 -1.98977135e-16]
 [ 1.00000000e+00 -1.78835871e-16  1.00000000e+00]
 [ 0.00000000e+00  1.00000000e+00  1.00000000e+00]]
