### Importing Libraries

In [82]:
import pandas as pd
import numpy as np
import sqlite3
import matplotlib.pyplot as plt
import seaborn as sns

### Reading Dataset

In [83]:
import sqlite3
import pandas as pd

# Load the pre-cruise table. try/finally guarantees the connection is closed
# even when the query raises (for example if the table is missing).
conn1 = sqlite3.connect('Data/cruise_pre.db')
try:
    df1 = pd.read_sql_query("SELECT * FROM cruise_pre", conn1)
finally:
    conn1.close()

# Load the post-cruise table with the same close-on-error guarantee.
conn2 = sqlite3.connect('Data/cruise_post.db')
try:
    df2 = pd.read_sql_query("SELECT * FROM cruise_post", conn2)
finally:
    conn2.close()


### Data Cleaning for EDA

In [84]:
# Put the two tables side by side (column-wise concat aligns rows on the
# DataFrame index), then discard every column labelled 'index'.
# NOTE(review): this assumes df1 and df2 list the same records in the same
# order - confirm, otherwise rows from different records get paired up.
combined_df = pd.concat([df1, df2], axis=1).drop(columns='index')

In [85]:
# Persist the combined table as CSV for later reuse; index=False keeps the
# DataFrame row index out of the written file.
combined_df.to_csv('Data/ShipSail_Pandas.csv', index=False)

In [86]:
# Count the missing (null) values in each column of the combined table.
combined_df.isnull().sum()

Gender                                        13456
Date of Birth                                 14684
Source of Traffic                                 0
Onboard Wifi Service                          19492
Embarkation/Disembarkation time convenient    15643
Ease of Online booking                        18355
Gate location                                 17140
Logging                                           0
Onboard Dining Service                        16809
Online Check-in                               15687
Cabin Comfort                                 20252
Onboard Entertainment                         15913
Cabin service                                 13832
Baggage handling                              20293
Port Check-in Service                         14347
Onboard Service                               18327
Cleanliness                                   18402
Ext_Intcode                                       0
Cruise Name                                   15931
Ticket Type 

In [87]:
# Ext_Intcode is not used anywhere further down, so remove it. Naming the
# label once is enough: drop() removes every column carrying that label.
combined_df = combined_df.drop(columns='Ext_Intcode')

In [88]:
# Parsing Distance Column
import re
def clean_cruise_distance(distance):
    """Extract the first number found in a raw cruise-distance value.

    Parameters
    ----------
    distance : str, int, float or None
        Raw cell value, e.g. ``"3567 KM"``.

    Returns
    -------
    float or None
        The first signed number in the text (decimals preserved), the value
        itself as a float when it is already numeric, or ``None`` when no
        number can be extracted.
    """
    # Values that are already numeric carry usable information; keep them
    # instead of discarding them as "not a string". bool is excluded because
    # it is a subclass of int but is not a distance.
    if isinstance(distance, (int, float)) and not isinstance(distance, bool):
        return float(distance)
    if isinstance(distance, str):
        # Optional sign plus optional fractional part, so "12.5" is not
        # truncated to 12 and a leading minus is kept for the caller.
        # NOTE(review): any unit text after the number is ignored - confirm
        # that all values are expressed in the same unit.
        numeric_value = re.search(r'[-+]?\d+(?:\.\d+)?', distance)
        if numeric_value:
            return float(numeric_value.group())
    return None
# Parse every raw distance to a number, then keep only its magnitude.
combined_df['Cruise Distance'] = (
    combined_df['Cruise Distance']
    .apply(clean_cruise_distance)
    .abs()
)


In [89]:
# Names of the int64/float64 columns; these are the ones imputed with the mean.
Num_cols = combined_df.select_dtypes(include=['int64', 'float64']).columns.tolist()

In [90]:
# Null Values Imputation
def impute_null_with_mean(df, columns):
    """Fill missing values in the given columns with each column's mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; the listed columns are replaced on ``df`` itself.
    columns : iterable of str
        Labels of the numeric columns to impute.

    Returns
    -------
    None
    """
    for column in columns:
        mean_value = df[column].mean()
        # Assign the filled column back instead of calling
        # fillna(inplace=True) on df[column]: that form operates on an
        # intermediate object and does not update df under Copy-on-Write.
        df[column] = df[column].fillna(mean_value)

def impute_null_with_mode(df):
    """Fill missing values in every object-dtype column with its mode.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; object columns are replaced on ``df`` itself.
        Columns of any other dtype are left untouched.

    Returns
    -------
    None
    """
    for column in df.select_dtypes(include=['object']):
        modes = df[column].mode()
        # mode() is empty when the column holds no non-null value; there is
        # nothing to impute with, so leave the column as it is.
        if modes.empty:
            continue
        # Assign the filled column back instead of calling
        # fillna(inplace=True) on df[column]: that form operates on an
        # intermediate object and does not update df under Copy-on-Write.
        df[column] = df[column].fillna(modes.iloc[0])

# Fill object columns with their most frequent value, then fill the columns
# listed in Num_cols with their mean.
# NOTE(review): the mode step covers every object column, which presumably
# includes 'Ticket Type' (used later as the model target) - confirm that
# imputing the label is intended.
impute_null_with_mode(combined_df)
impute_null_with_mean(combined_df, Num_cols)

# Re-count the nulls per column to check the result of the imputation.
combined_df.isnull().sum()


Gender                                        0
Date of Birth                                 0
Source of Traffic                             0
Onboard Wifi Service                          0
Embarkation/Disembarkation time convenient    0
Ease of Online booking                        0
Gate location                                 0
Logging                                       0
Onboard Dining Service                        0
Online Check-in                               0
Cabin Comfort                                 0
Onboard Entertainment                         0
Cabin service                                 0
Baggage handling                              0
Port Check-in Service                         0
Onboard Service                               0
Cleanliness                                   0
Cruise Name                                   0
Ticket Type                                   0
Cruise Distance                               0
WiFi                                    

### Feature Encoding and Modelling

In [92]:
# 'Logging' and 'Date of Birth' are not used as model inputs below.
combined_df = combined_df.drop(columns=['Logging', 'Date of Birth'])

In [95]:
# Names of the remaining object-dtype columns (the ones to be label-encoded).
Obj_cols = combined_df.select_dtypes(include='object').columns.tolist()

In [99]:
from sklearn.preprocessing import LabelEncoder
# Replace each object column with integer codes. A separate fitted encoder
# is kept per column: refitting a single shared encoder discards the class
# mapping of every column but the last, which makes it impossible to decode
# the codes (or predictions of an encoded target) back to the original labels.
# NOTE(review): the integer codes imply an ordering between categories -
# confirm that is acceptable for the models trained on these columns.
label_encoders = {}
for col in Obj_cols:
    le = LabelEncoder()
    combined_df[col] = le.fit_transform(combined_df[col])
    label_encoders[col] = le

combined_df

Unnamed: 0,Gender,Source of Traffic,Onboard Wifi Service,Embarkation/Disembarkation time convenient,Ease of Online booking,Gate location,Onboard Dining Service,Online Check-in,Cabin Comfort,Onboard Entertainment,...,Baggage handling,Port Check-in Service,Onboard Service,Cleanliness,Cruise Name,Ticket Type,Cruise Distance,WiFi,Dining,Entertainment
0,0,0,0,3.00000,5.00000,3.000000,4,2.000000,2.000000,0,...,2.000000,4.00000,2.0,3.000000,0,1,3567.0,1.000000,1,1.000000
1,0,3,2,4.00000,1.00000,2.978526,4,3.249884,4.000000,4,...,3.000000,4.00000,4.0,4.000000,0,0,672.0,0.500578,0,1.000000
2,0,2,0,3.00000,0.00000,5.000000,4,3.249884,5.000000,1,...,1.000000,2.00000,3.0,3.285745,1,0,1167.0,0.500578,0,0.000000
3,0,0,4,4.00000,4.00000,4.000000,3,4.000000,4.000000,4,...,3.633416,3.00000,2.0,4.000000,2,0,280.0,0.500578,0,1.000000
4,1,0,3,4.00000,2.00000,2.978526,2,2.000000,3.439574,2,...,3.633416,5.00000,2.0,3.285745,2,2,1145.0,0.500578,1,0.503142
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133741,0,2,0,1.00000,1.00000,4.000000,4,2.000000,3.439574,4,...,3.633416,3.00000,5.0,3.285745,0,2,1506.0,0.500578,1,0.503142
133742,0,1,0,2.00000,2.75817,2.000000,3,3.000000,2.000000,0,...,2.000000,3.00000,2.0,3.285745,0,1,240.0,1.000000,0,1.000000
133743,1,1,1,5.00000,5.00000,5.000000,1,5.000000,5.000000,1,...,5.000000,4.00000,4.0,5.000000,0,1,1947.0,1.000000,0,1.000000
133744,0,2,0,1.00000,1.00000,4.000000,4,2.000000,4.000000,4,...,4.000000,3.00000,5.0,4.000000,0,2,1506.0,0.500578,1,0.503142


In [102]:
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Target is the encoded 'Ticket Type'; every other column is a predictor.
y = combined_df['Ticket Type']
X = combined_df.drop(columns=['Ticket Type'])

# Hold out 20% of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply the same scaling to
# both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Two hidden layers (100 and 50 units); verbose=True prints the loss after
# each iteration.
mlp = MLPClassifier(
    hidden_layer_sizes=(100, 50),
    max_iter=500,
    random_state=42,
    verbose=True,
).fit(X_train_scaled, y_train)

# Score the held-out rows.
y_pred = mlp.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy score:", accuracy)


Iteration 1, loss = 0.39195171
Iteration 2, loss = 0.22576099
Iteration 3, loss = 0.22256426
Iteration 4, loss = 0.22178216
Iteration 5, loss = 0.22067221
Iteration 6, loss = 0.22032885
Iteration 7, loss = 0.21938605
Iteration 8, loss = 0.21909831
Iteration 9, loss = 0.21839847
Iteration 10, loss = 0.21780663
Iteration 11, loss = 0.21695202
Iteration 12, loss = 0.21643401
Iteration 13, loss = 0.21596244
Iteration 14, loss = 0.21537731
Iteration 15, loss = 0.21475791
Iteration 16, loss = 0.21405536
Iteration 17, loss = 0.21385599
Iteration 18, loss = 0.21288267
Iteration 19, loss = 0.21255071
Iteration 20, loss = 0.21179610
Iteration 21, loss = 0.21118233
Iteration 22, loss = 0.21072292
Iteration 23, loss = 0.21000658
Iteration 24, loss = 0.20936679
Iteration 25, loss = 0.20894632
Iteration 26, loss = 0.20807333
Iteration 27, loss = 0.20780389
Iteration 28, loss = 0.20706823
Iteration 29, loss = 0.20656676
Iteration 30, loss = 0.20613978
Iteration 31, loss = 0.20557195
Iteration 32, los



In [100]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# combined_df holds the encoded features together with the target column,
# which is named 'Ticket Type'.

# Splitting the data into features (X) and target (y)
X = combined_df.drop(columns=['Ticket Type'])  # Features
y = combined_df['Ticket Type']  # Target

# Splitting the data into train and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Training the decision tree classifier. random_state is fixed so that the
# fitted tree, and therefore the reported accuracy, is the same on every run.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Predicting on the test set
y_pred = clf.predict(X_test)

# Calculating the accuracy score
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy score:", accuracy)


Accuracy score: 0.8559626168224299
