In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import plotly.express as px
from ydata_profiling import ProfileReport
from scipy import stats

In [None]:
# Load data/train.csv; the first CSV column is used as the DataFrame index.
data = pd.read_csv('data/train.csv', index_col=[0])

In [5]:
# Fraction of missing values in each column (mean of the boolean NaN mask).
data.isna().mean()

HomePlanet      0.023122
CryoSleep       0.024963
Cabin           0.022892
Destination     0.020936
Age             0.020591
VIP             0.023352
RoomService     0.020821
FoodCourt       0.021051
ShoppingMall    0.023927
Spa             0.021051
VRDeck          0.021627
Name            0.023007
Transported     0.000000
dtype: float64

In [None]:
def UniquevaluesAnalysis(data, column):
    """Draw a two-panel summary of the values found in ``data[column]``.

    Left panel: pie chart of the value counts of the column, labelled with
    each value and its percentage share.
    Right panel: count plot of the column with bars split by ``Transported``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``column`` and a ``Transported`` column.
    column : str
        Name of the column to summarise.
    """
    fig, (pie_ax, bar_ax) = plt.subplots(1, 2, figsize=(12, 5))

    # value_counts() leaves missing values out, so the pie covers observed values only.
    value_counts = data[column].value_counts()
    pie_ax.pie(value_counts, labels=value_counts.index, autopct='%.1f%%', shadow=True)

    sns.countplot(data=data, x=column, palette='pastel', hue='Transported', ax=bar_ax)
    bar_ax.set_title(f'Bar Graph of {column} Counts')

    fig.tight_layout()
    plt.show()

In [None]:
# Columns that are plotted one by one with UniquevaluesAnalysis.
univalue_columns = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']

In [None]:
# One pie chart + count plot figure per listed column.
for column in univalue_columns :
    UniquevaluesAnalysis(data, column)

In [11]:
def OutlierDetection(data, column, bins=50):
    """Show a box plot and a histogram (with KDE curve) of one column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``column``.
    column : str
        Name of the column to plot.
    bins : int or any value accepted by ``seaborn.histplot(bins=...)``, default 50
        Histogram binning. A fixed count is used by default; the recorded run
        of this cell ended in a KeyboardInterrupt, presumably because
        automatic binning on long-tailed columns produces a very large number
        of bars (assumption - confirm on the real data).
    """
    # Exactly two panels: the earlier 1x3 grid left its third slot empty.
    fig, (box_ax, hist_ax) = plt.subplots(1, 2, figsize=(16, 4))

    # Box Plot
    box_ax.set_title(f"Box Plot of {column}")
    sns.boxplot(data[column], ax=box_ax)

    # Histogram
    hist_ax.set_title(f"Histogram of {column}")
    sns.histplot(data[column], kde=True, bins=bins, ax=hist_ax)

    fig.tight_layout()
    plt.show()


# Run OutlierDetection on every numeric column of the frame.
num_col = data.select_dtypes(np.number).columns
for column in num_col :
    OutlierDetection(data, column)

KeyboardInterrupt: 

In [None]:
def BarGraphs(data, column):
    """Display a Plotly histogram of ``column`` with bars coloured by ``Transported``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``column`` and a ``Transported`` column.
    column : str
        Name of the column placed on the x axis.
    """
    px.histogram(data, x=column, color='Transported').show()

In [None]:
# Histogram of Age, coloured by Transported.
BarGraphs(data, 'Age')

In [1]:
# Histogram of Cabin, coloured by Transported.
# NOTE(review): the recorded output of this cell is a NameError because it was
# executed (execution count 1) before the cell that defines BarGraphs; it needs
# a Restart & Run All to produce a real output.
BarGraphs(data, 'Cabin')

NameError: name 'BarGraphs' is not defined

In [None]:
# Summary statistics produced by DataFrame.describe().
data.describe()

In [None]:
# Remove the Name column. Rebinding with errors='ignore' keeps this cell
# re-runnable: an in-place drop raises KeyError on a second run because
# 'Name' is already gone.
data = data.drop(columns=['Name'], errors='ignore')

In [None]:
# Number of missing values in each column.
data.isna().sum()

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder, StandardScaler

def covertToProcessedData(data):
    """Turn a raw passenger frame into a fully numeric feature matrix.

    Steps, in order:

    1. Split ``Cabin`` (``"deck/number/side"``) into ``Cabin_deck``,
       ``Cabin_number`` and ``Cabin_side``. Missing cabins use the placeholder
       ``'Z/9999/Z'``. Deck and side are label-encoded to integers.
    2. Bucket ``Age`` into ``Age_Groups`` (left-closed bins).
    3. Derive ``NoSpending`` and ``Expenditure`` from the five spending columns.
    4. Derive ``Group`` and ``Solo`` from the index.
    5. Mean-impute, then standardise, every numeric column.
    6. One-hot encode every object/category column and drop the ``*_nan``
       indicator columns the encoder creates for missing values.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``Cabin``, ``Age``, ``RoomService``, ``FoodCourt``,
        ``ShoppingMall``, ``Spa`` and ``VRDeck`` and have a string index whose
        values contain ``'_'``. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Scaled numeric columns followed by one-hot columns, carrying the same
        index as ``data``.

    Notes
    -----
    Every encoder, imputer and scaler is fit on the frame passed in. Calling
    this separately on two frames (e.g. train and test) therefore uses
    different statistics and possibly different label codes / one-hot columns.
    NOTE(review): consider fitting once and reusing the fitted transformers.
    """
    # Work on a private copy so the caller's frame keeps its original columns
    # and a cell that calls this function can be re-run safely.
    data = data.copy()

    # --- Cabin -------------------------------------------------------------
    # Plain assignment instead of data['Cabin'].fillna(..., inplace=True):
    # the in-place call on a column selection is chained assignment, which
    # pandas warns about and which does not update the frame under
    # Copy-on-Write.
    cabin_parts = data['Cabin'].fillna('Z/9999/Z').str.split('/')
    data['Cabin_deck'] = cabin_parts.str[0]
    data['Cabin_number'] = cabin_parts.str[1].astype(int)
    data['Cabin_side'] = cabin_parts.str[2]
    data = data.drop(columns=['Cabin'])

    label_encoder = LabelEncoder()
    data['Cabin_deck'] = label_encoder.fit_transform(data['Cabin_deck'])
    data['Cabin_side'] = label_encoder.fit_transform(data['Cabin_side'])

    # --- Age ---------------------------------------------------------------
    age_bins = [0, 12, 19, 35, 60, float('inf')]
    age_labels = ['Child', 'Teenager', 'Young Adult', 'Middle-Aged', 'Senior']
    # right=False -> bins are [0, 12), [12, 19), ...; a missing Age stays missing.
    data['Age_Groups'] = pd.cut(data['Age'], bins=age_bins, labels=age_labels, right=False)

    # --- Spending ----------------------------------------------------------
    spend_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    # One NaN-skipping total feeds both features. Previously NoSpending used
    # `+`, so a single missing amount turned the comparison False while
    # Expenditure treated the same missing amount as 0.
    total_spend = data[spend_columns].sum(axis=1)
    data['NoSpending'] = (total_spend == 0).astype(int)
    data['Expenditure'] = total_spend

    # --- Travel group ------------------------------------------------------
    # Ids are expected to look like '<group>_<member number>' (per the dataset
    # description - confirm). The group is the FIRST segment; the last segment
    # is only the position inside the group, so testing it against 1 flagged
    # the first member of every group rather than people travelling alone.
    data['Group'] = data.index.str.split('_').str[0].astype(int)
    # Group sizes are counted within the frame passed in.
    group_sizes = data['Group'].map(data['Group'].value_counts())
    data['Solo'] = (group_sizes == 1).astype(int)

    # --- Numeric columns: impute with the mean, then standardise -----------
    numerical_data = data.select_dtypes(np.number)
    categorical_data = data.select_dtypes(include=['object', 'category'])

    imputed_numerical_data = SimpleImputer(strategy='mean').fit_transform(numerical_data)
    scaled_numerical_data = StandardScaler().fit_transform(imputed_numerical_data)
    # Keep the original index so the features stay aligned with the caller's rows.
    scaled_numerical_df = pd.DataFrame(
        scaled_numerical_data, columns=numerical_data.columns, index=data.index
    )

    # --- Categorical columns: one-hot encode -------------------------------
    encoder = OneHotEncoder()
    encoded_categorical_data = encoder.fit_transform(categorical_data).toarray()
    encoded_categorical_df = pd.DataFrame(
        encoded_categorical_data,
        columns=encoder.get_feature_names_out(categorical_data.columns),
        index=data.index,
    )

    # Missing values get their own '<column>_nan' indicator; drop those so a
    # missing category is represented by all-zero indicators.
    columns_to_keep = [col for col in encoded_categorical_df.columns if not col.endswith('_nan')]
    encoded_categorical_df = encoded_categorical_df[columns_to_keep]

    processed_data = pd.concat([scaled_numerical_df, encoded_categorical_df], axis=1)

    return processed_data


In [None]:


# Separate the label from the features, then run the feature pipeline on the
# features only.
targets = data['Transported']
dataForInput = data.drop(columns=['Transported'])
inputs = covertToProcessedData(dataForInput)
inputs

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(inputs, targets, test_size=0.25, random_state=42)

In [None]:
from sklearn.metrics import r2_score

In [None]:
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

In [None]:
import warnings
# Suppress every warning for the rest of the session.
warnings.filterwarnings("ignore")

# CatBoost: only the training log verbosity is set.
cat_params = dict(verbose=0)

# LightGBM settings.
lgb_params = dict(
    boosting_type='gbdt',
    learning_rate=0.1,
    n_estimators=200,
    objective='binary',
    class_weight='balanced',
    colsample_bytree=1,
    subsample=1,
    min_child_samples=10,
    random_state=42,
    n_jobs=-1,
)

# XGBoost settings.
xgb_params = dict(
    objective='binary:logistic',
    max_depth=6,
    learning_rate=0.2,
    n_estimators=200,
    subsample=1,
    colsample_bytree=1,
    gamma=0,
    random_state=42,
    n_jobs=-1,
)

# Candidate classifiers, keyed by a short name used in the evaluation printout.
models = dict(
    xgb=XGBClassifier(**xgb_params),
    lgb=LGBMClassifier(**lgb_params),
    cat=CatBoostClassifier(**cat_params),
    logic=LogisticRegression(random_state=42, max_iter=1000),
)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import warnings

# ANSI escape codes used to colour the metric printout.
# NOTE(review): 'orange' and 'yellow' map to the same code.
colors = {
    'bold': '\033[1m',
    'red': '\033[91m',
    'green': '\033[92m',
    'purple': '\033[95m',
    'orange': '\033[93m',
    'yellow': '\033[93m',
    'reset': '\033[0m'  
}

# Loop-invariant: configure warning suppression once instead of per model.
warnings.filterwarnings("ignore")

# Train and score on 0/1 labels so every estimator returns numeric class
# labels. Casting predictions with astype(bool) is fragile: an estimator that
# hands back labels in another representation (e.g. strings) can end up all True.
y_train_int = y_train.astype(int)
y_test_int = y_test.astype(int)

for model_name, model in models.items():
    model.fit(X_train, y_train_int)

    pred = np.asarray(model.predict(X_test)).ravel().astype(int)
    # ROC AUC ranks continuous scores; feeding it hard 0/1 predictions
    # collapses the curve to a single operating point, so use the predicted
    # probability of the positive class instead.
    positive_proba = model.predict_proba(X_test)[:, 1]

    accuracy = accuracy_score(y_test_int, pred)
    precision = precision_score(y_test_int, pred)
    recall = recall_score(y_test_int, pred)
    f1 = f1_score(y_test_int, pred)
    roc_auc = roc_auc_score(y_test_int, positive_proba)

    print(f"{colors['bold']}Model: {model_name}{colors['reset']}")
    print(f"{colors['green']}Accuracy: {accuracy:.2f}{colors['reset']}")
    print(f"{colors['red']}Precision: {precision:.2f}{colors['reset']}")
    print(f"{colors['yellow']}Recall: {recall:.2f}{colors['reset']}")
    print(f"{colors['purple']}F1 Score: {f1:.2f}{colors['reset']}")
    print(f"{colors['orange']}ROC AUC Score: {roc_auc:.2f}{colors['reset']}")
    print("\n")


In [None]:
# Load data/test.csv (first column as index) and discard the Name column.
test_data = pd.read_csv('data/test.csv', index_col=[0]).drop(columns=['Name'])


In [None]:
# Hand the pipeline its own copy so test_data is guaranteed to stay untouched
# and this cell can be re-run without reloading the CSV.
input_test = covertToProcessedData(test_data.copy())

# Give the test matrix exactly the training columns, in the training order.
# A one-hot column that is absent from the test frame is filled with 0; a
# column the model never saw is dropped.
# NOTE(review): the pipeline refits its scaler/encoders on the test frame, so
# the values are not on exactly the same scale as the training features.
input_test = input_test.reindex(columns=inputs.columns, fill_value=0)

# Refit on all labelled rows before predicting the submission.
models['xgb'].fit(inputs, targets)
pred = models['xgb'].predict(input_test)
print(pred)
# Convert class labels to booleans for the submission file.
pred = (pred == 1)
print(pred)

In [None]:
# Empty frame with a single PassengerId column; filled in by the next cell.
submission_df = pd.DataFrame(columns=['PassengerId'])

In [None]:

# Ids come from the test frame's index; predictions are assigned by position.
submission_df['PassengerId'] = test_data.index
submission_df['Transported'] = pred

In [None]:
# Display the submission frame.
submission_df

In [None]:
# Write the submission to prediction.csv without the row index.
submission_df.to_csv('prediction.csv', index=False)

In [None]:
# Rows whose Age is exactly 0.
data_age0 = data[data.Age == 0]

In [None]:
# Display the Age == 0 rows.
data_age0

In [None]:
# Every row in data_age0 has Age == 0, so this effectively shows how the
# Transported values are distributed at that single age.
sns.histplot(data=data_age0, x='Age', hue='Transported')

In [None]:
# Same bins and labels as the ones used inside covertToProcessedData.
# right=False makes each bin left-closed: [0, 12), [12, 19), ...
# Note: this adds an 'Age Groups' column (with a space) to `data` itself.
age_bins = [0, 12, 19, 35, 60, float('inf')]
age_labels = ['Child', 'Teenager', 'Young Adult', 'Middle-Aged', 'Senior']
data['Age Groups'] = pd.cut(data['Age'], bins=age_bins, labels=age_labels, right=False)

In [None]:
# Display the frame, now including the 'Age Groups' column.
data