### Importing Libraries

In [67]:
import pandas as pd
import numpy as np
import sqlite3
import matplotlib.pyplot as plt
import seaborn as sns

### Reading Dataset

In [68]:
import sqlite3
import pandas as pd

# Load the `cruise_pre` table into df1. The try/finally guarantees the
# SQLite connection is closed even when the query itself raises (missing
# table, corrupt file, ...), so a failed cell does not leave the file open.
conn1 = sqlite3.connect('Data/cruise_pre.db')
try:
    df1 = pd.read_sql_query("SELECT * FROM cruise_pre", conn1)
finally:
    conn1.close()

# Load the `cruise_post` table into df2 with the same close-on-error guard.
conn2 = sqlite3.connect('Data/cruise_post.db')
try:
    df2 = pd.read_sql_query("SELECT * FROM cruise_post", conn2)
finally:
    conn2.close()


### Data Cleaning for EDA

In [69]:
# Place df1 and df2 side by side. axis=1 concatenation aligns on the row
# index, so this presumes both frames hold the same rows in the same order
# -- TODO confirm. Every column labelled 'index' is then discarded (a label
# only needs to be listed once to drop all columns carrying it).
combined_df = pd.concat([df1, df2], axis=1).drop(columns=['index'])

In [70]:
# Persist the combined frame as CSV for later reuse. Note that this snapshot
# is written before the cleaning steps below; the row index is not written.
combined_df.to_csv(path_or_buf='Data/ShipSail_Pandas.csv', index=False)

In [71]:
# Count the missing entries in every column.
combined_df.isna().sum()

Gender                                        13456
Date of Birth                                 14684
Source of Traffic                                 0
Onboard Wifi Service                          19492
Embarkation/Disembarkation time convenient    15643
Ease of Online booking                        18355
Gate location                                 17140
Logging                                           0
Onboard Dining Service                        16809
Online Check-in                               15687
Cabin Comfort                                 20252
Onboard Entertainment                         15913
Cabin service                                 13832
Baggage handling                              20293
Port Check-in Service                         14347
Onboard Service                               18327
Cleanliness                                   18402
Ext_Intcode                                       0
Cruise Name                                   15931
Ticket Type 

In [72]:
# Remove every column labelled 'Ext_Intcode'; naming the label once is
# enough to drop all columns that carry it.
combined_df = combined_df.drop(columns=['Ext_Intcode'])

In [73]:
# Parsing Distance Column
import re
def clean_cruise_distance(distance):
    """Extract the numeric magnitude from a raw distance entry.

    Parameters
    ----------
    distance : str, number or None
        Raw cell value. For text, the first number found (integer or
        decimal) is used; any sign and any surrounding text are ignored.

    Returns
    -------
    float or None
        The non-negative magnitude as a float, or None when the entry is
        missing (None / NaN), is not a string or number, or contains no digits.

    Notes
    -----
    NOTE(review): any unit text around the number is discarded without
    conversion; if the column mixes units the resulting values are not
    directly comparable -- confirm against the raw data.
    """
    if isinstance(distance, str):
        # Accept "123", "123.45" and ".5"; the previous pattern (\d+)
        # silently truncated the fractional part.
        found = re.search(r'\d+(?:\.\d+)?|\.\d+', distance)
        return float(found.group()) if found else None

    # Booleans are technically numbers in Python but never a valid distance.
    if distance is None or isinstance(distance, bool):
        return None

    try:
        value = float(distance)
    except (TypeError, ValueError):
        return None

    # NaN is the only float that differs from itself: treat it as missing.
    if value != value:
        return None
    # Keep already-numeric entries instead of discarding them, using the
    # magnitude for consistency with the text branch (which ignores the sign).
    return abs(value)
# Convert every raw 'Cruise Distance' entry to a float via
# clean_cruise_distance, then take the absolute value of the result.
combined_df['Cruise Distance'] = (
    combined_df['Cruise Distance']
    .apply(clean_cruise_distance)
    .abs()
)


In [74]:
# Names of the columns whose dtype is int64 or float64; these are the ones
# later filled with their mean.
numeric_part = combined_df.select_dtypes(include=['int64', 'float64'])
Num_cols = numeric_part.columns.tolist()

In [75]:
# Null Values Imputation
def impute_null_with_mean(df, columns):
    """Fill missing values in the given columns with each column's mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; it is updated in place and nothing is returned.
    columns : iterable of column labels
        Numeric columns to impute.

    Notes
    -----
    The filled Series is assigned back to the frame explicitly.
    ``df[column].fillna(..., inplace=True)`` acts on an intermediate object:
    it triggers a chained-assignment warning on recent pandas and does not
    update ``df`` at all when Copy-on-Write is enabled.
    """
    for column in columns:
        mean_value = df[column].mean()
        df[column] = df[column].fillna(mean_value)

def impute_null_with_mode(df):
    """Fill missing values in every object-dtype column with that column's mode.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; it is updated in place and nothing is returned.

    Notes
    -----
    * When several values tie for most frequent, the first one returned by
      ``Series.mode()`` is used.
    * A column with no non-null values has no mode; it is left untouched
      instead of raising on the empty ``mode()`` result.
    * The filled Series is assigned back explicitly rather than calling
      ``fillna(..., inplace=True)`` on ``df[column]``, which acts on an
      intermediate object and does not update ``df`` under Copy-on-Write.
    """
    for column in df.select_dtypes(include=['object']).columns:
        modes = df[column].mode()
        if modes.empty:
            # All values are null: nothing sensible to impute with.
            continue
        df[column] = df[column].fillna(modes.iloc[0])

# Fill the object-dtype columns with their mode first, then the numeric
# columns collected in Num_cols with their mean.
impute_null_with_mode(df=combined_df)
impute_null_with_mean(df=combined_df, columns=Num_cols)

# Re-count the missing entries per column after imputation.
combined_df.isna().sum()


Gender                                        0
Date of Birth                                 0
Source of Traffic                             0
Onboard Wifi Service                          0
Embarkation/Disembarkation time convenient    0
Ease of Online booking                        0
Gate location                                 0
Logging                                       0
Onboard Dining Service                        0
Online Check-in                               0
Cabin Comfort                                 0
Onboard Entertainment                         0
Cabin service                                 0
Baggage handling                              0
Port Check-in Service                         0
Onboard Service                               0
Cleanliness                                   0
Cruise Name                                   0
Ticket Type                                   0
Cruise Distance                               0
WiFi                                    

### Exploratory Data Analysis