In [12]:
import os 
import sys
import pandas as pd

In [13]:
import pandas as pd
import seaborn as sns
import os

def load_titanic(local_csv=None):
    """
    Load the Titanic dataset, preferring seaborn and falling back to a CSV.

    The function first calls ``sns.load_dataset('titanic')``. If that raises
    any exception, the error is printed and the CSV at ``local_csv`` is read
    with ``pd.read_csv`` instead.

    Parameters
    ----------
    local_csv : str or path-like, optional
        Fallback CSV file to read when the seaborn load fails. When omitted,
        the notebook's original hard-coded location is used, so existing
        ``load_titanic()`` calls behave as before.

    Returns
    -------
    pandas.DataFrame
        The dataset returned by seaborn, or the contents of ``local_csv``.

    Raises
    ------
    FileNotFoundError
        If the seaborn load fails and ``local_csv`` is not an existing file.
        The seaborn failure is attached as the exception's ``__cause__``.
    """
    if local_csv is None:
        # Original default kept for backward compatibility; pass ``local_csv``
        # explicitly to run this on a machine where this path does not exist.
        local_csv = "/Users/vidhanmanihar/Desktop/untitled folder/DSML/Practical/one/tested.csv"

    try:
        print("Trying seaborn Titanic dataset...")
        df = sns.load_dataset('titanic')
        print("Loaded Titanic via seaborn!")
        return df

    except Exception as e:
        # Deliberately broad: any seaborn failure should trigger the fallback.
        print("Seaborn load failed:", str(e))

        # isfile (not exists) so a directory at this path is reported as a
        # missing CSV here instead of failing later inside read_csv.
        if os.path.isfile(local_csv):
            print(f"Reading from local file: {local_csv}")
            return pd.read_csv(local_csv)

        # Chain the seaborn error so the traceback shows why the primary
        # source failed as well as where the fallback was expected.
        raise FileNotFoundError(
            "Could not load Titanic dataset from seaborn or local CSV.\n"
            "Check internet OR save Titanic CSV at:\n"
            f"{local_csv}"
        ) from e


In [14]:
# Load the dataset into the notebook-level name `df`, then print its first
# five rows as plain text.
df = load_titanic()
print(df.head(n=5))


Trying seaborn Titanic dataset...
Loaded Titanic via seaborn!
   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  


In [23]:
def _save_copies(df, out_dir):
    """Write ``df`` into ``out_dir`` as CSV and, best-effort, as Excel.

    Returns ``(csv_path, xlsx_path)``; ``xlsx_path`` is ``None`` when the
    Excel write raised.
    """
    csv_path = os.path.join(out_dir, 'titanic_saved.csv')
    xlsx_path = os.path.join(out_dir, 'titanic_saved.xlsx')
    df.to_csv(csv_path, index=False)
    # For writing Excel ensure openpyxl is installed: pip install openpyxl
    try:
        df.to_excel(xlsx_path, index=False)
    except Exception as e:
        # Best-effort: the Excel copy is optional, so keep going, but show
        # the real error because a missing openpyxl is only one possible cause.
        print("Excel write failed (openpyxl missing?). You can pip install openpyxl to enable Excel output.")
        print(f"Underlying error: {type(e).__name__}: {e}")
        xlsx_path = None
    return csv_path, xlsx_path


def _print_overview(df):
    """Print shape, column names, missing counts, dtypes and describe() tables."""
    print("\n=== Basic Info ===")
    print("Shape (rows, columns):", df.shape)
    print("Columns:", df.columns.tolist())
    print("Number of missing entries per column:")
    print(df.isna().sum())

    print("\n=== Data types ===")
    print(df.dtypes)

    print("\n=== Describe (numeric columns) ===")
    print(df.describe())

    print("\n=== Describe (all columns) ===")
    print(df.describe(include='all'))


def _print_selection_examples(df):
    """Print examples of column, iloc, boolean-mask and loc selection."""
    print("\n=== Indexing & selection examples ===")
    # Single column (Series)
    print("survived column sample:")
    print(df['survived'].head())

    # Multiple columns (DataFrame)
    print("\nSubset columns (survived, pclass, sex, age) sample:")
    print(df[['survived', 'pclass', 'sex', 'age']].head())

    # Row selection by iloc
    print("\nFirst row (iloc[0]):")
    print(df.iloc[0])

    # Slicing rows
    print("\nRows 0:5 (iloc):")
    print(df.iloc[0:5])

    # Boolean indexing: survived == 1
    survivors = df[df['survived'] == 1]
    print("\nNumber of survivors (survived == 1):", survivors.shape[0])

    # loc with condition and selected columns
    over_60 = df.loc[df['age'] > 60, ['survived', 'age', 'sex', 'pclass']]
    print("\nPassengers with age > 60 (loc selection):")
    print(over_60)


def _print_sorting_examples(df):
    """Print the top 10 rows under three different sort orders."""
    print("\n=== Sorting examples ===")
    print("Top 10 oldest passengers (age desc):")
    print(df.sort_values(by='age', ascending=False).head(10)[['age','survived','sex','pclass']])

    print("\nTop 10 highest fares (fare desc):")
    print(df.sort_values(by='fare', ascending=False).head(10)[['fare','survived','pclass','sex']])

    print("\nSorted by pclass ascending then fare descending (top 10):")
    print(df.sort_values(by=['pclass','fare'], ascending=[True, False]).head(10)[['pclass','fare','survived']])


def _save_summary(df, out_dir):
    """Write a metric/value summary CSV into ``out_dir`` and return its path."""
    # Count missing values per column once and derive both metrics from it.
    missing_per_column = df.isna().sum()
    summary = {
        'rows': df.shape[0],
        'columns': df.shape[1],
        'total_missing_values': int(missing_per_column.sum()),
        'columns_with_missing_values': int((missing_per_column > 0).sum())
    }
    summary_df = pd.DataFrame(list(summary.items()), columns=['metric', 'value'])
    summary_path = os.path.join(out_dir, 'titanic_summary.csv')
    summary_df.to_csv(summary_path, index=False)
    return summary_path


def main(out_dir='./output'):
    """Load the Titanic dataset, save copies of it, and print a tour of it.

    Steps, in order: load via ``load_titanic()``; create ``out_dir``; save
    CSV and (best-effort) Excel copies; print basic info, dtypes and
    descriptive statistics; print indexing/selection and sorting examples;
    write a small summary CSV; print the saved paths and the first 20 rows.

    Parameters
    ----------
    out_dir : str, optional
        Directory that receives ``titanic_saved.csv``, ``titanic_saved.xlsx``
        and ``titanic_summary.csv``. Created if it does not exist. Defaults
        to ``'./output'``, the location the function always used before.
    """
    # Load dataset
    df = load_titanic()

    # Create output directory
    os.makedirs(out_dir, exist_ok=True)

    # Save copies (CSV and Excel)
    csv_path, xlsx_path = _save_copies(df, out_dir)

    _print_overview(df)
    _print_selection_examples(df)
    _print_sorting_examples(df)

    # Summary CSV
    summary_path = _save_summary(df, out_dir)
    print("\nSummary saved to:", summary_path)
    print("CSV saved to:", csv_path)
    if xlsx_path:
        print("Excel saved to:", xlsx_path)

    # Example: select a sample for display / teaching
    sample_display = df.head(20)
    print("\nSample (first 20 rows):")
    print(sample_display)

# Entry-point guard: call main() only when this code runs as the top-level
# program (__name__ == '__main__'), not when it is imported as a module.
if __name__ == '__main__':
    main()

Trying seaborn Titanic dataset...
Loaded Titanic via seaborn!

=== Basic Info ===
Shape (rows, columns): (891, 15)
Columns: ['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town', 'alive', 'alone']
Number of missing entries per column:
survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

=== Data types ===
survived          int64
pclass            int64
sex              object
age             float64
sibsp             int64
parch             int64
fare            float64
embarked         object
class          category
who              object
adult_male         bool
deck           category
embark_town      object
alive            object
alone              bool
dtype: object
