In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import itertools
import plotly.graph_objects as go

from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn import metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier, StackingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from catboost import CatBoostRegressor
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier


import warnings
warnings.simplefilter(action="ignore")

In [None]:
# Notebook-wide pandas display settings.
pd.options.display.max_columns = None   # never elide columns
# pd.options.display.max_rows = None    # left disabled: keep pandas' default row limit
pd.options.display.width = None         # let pandas detect the output width
pd.options.display.float_format = lambda value: '%.3f' % value  # three decimals

In [None]:
# Read test.csv, print its (rows, columns) shape and preview the first two rows.
test_data = pd.read_csv('test.csv')
print(test_data.shape)
test_data.head(2)

In [None]:
# Read train.csv, print its (rows, columns) shape and preview the first two rows.
train_data = pd.read_csv('train.csv')
print(train_data.shape)
train_data.head(2)

In [None]:
# Read gender_submission.csv and preview it.
# NOTE(review): gen_sub is not referenced again in the visible notebook — presumably
# loaded only to inspect the expected submission layout; confirm.
gen_sub = pd.read_csv("gender_submission.csv")
print(gen_sub.shape)
gen_sub.head(2)

In [None]:
# Keep the test PassengerId column aside; it is reused when the submission frame is built.
sub_id=test_data["PassengerId"]
sub_id.head(1)

In [None]:
# Row count of the training file; used later to slice the combined frame back
# into its train and test parts.
first_train = train_data.shape[0]
first_train

In [None]:
# Stack train rows on top of test rows so preprocessing is applied to both at once.
# reset_index() without drop=True keeps the old row labels as an 'index' column.
# NOTE(review): presumably test.csv has no 'Survived' column, so those rows hold NaN there — verify.
df = pd.concat([train_data, test_data]).reset_index()

In [None]:
# Preview the combined frame.
df.head(2)

In [None]:
# Preliminary examination of the data set

def check_df(dataframe, head=5):
    """Print a quick overview of ``dataframe``.

    Emits, in order: shape, column dtypes, the first and last ``head`` rows,
    per-column null counts, and ``describe`` statistics at the
    0 / 5 / 50 / 95 / 99 / 100 percentiles (transposed, one row per column).

    ``display`` is neither defined nor imported here; it must already be
    available in the running environment.

    Args:
        dataframe: DataFrame to inspect.
        head: Number of rows shown for both the head and the tail preview.
    """
    n_rows = head

    print('##################### Shape #####################')
    print(dataframe.shape)

    print('##################### Types #####################')
    print(dataframe.dtypes)

    print('##################### Head #####################')
    display(dataframe.head(n_rows))

    print('##################### Tail #####################')
    display(dataframe.tail(n_rows))

    print('##################### NA #####################')
    print(dataframe.isna().sum())

    print('##################### Quantiles #####################')
    percentile_points = [0, 0.05, 0.50, 0.95, 0.99, 1]
    display(dataframe.describe(percentiles=percentile_points).transpose())

In [None]:
# Overview of the combined frame with the default five-row head/tail preview.
check_df(df)

In [None]:
# Columns removed before modelling; 'index' is the column created by reset_index()
# when the combined frame was built.
drop_list = ["Name","Ticket","Cabin", 'PassengerId', 'index']

In [None]:
# Rebind instead of mutating in place, and tolerate columns that are already gone,
# so re-running this cell does not raise KeyError.
df = df.drop(columns=drop_list, errors="ignore")

In [None]:
# Preview the frame after the column drop.
df.head(2)

In [None]:
def grab_col_names(dataframe, cat_th=10, car_th=20):
    """Split the columns of ``dataframe`` into categorical / numerical groups.

    A column counts as "object" when its dtype compares equal to ``"O"``.

    Args:
        dataframe: DataFrame whose columns are classified.
        cat_th: Non-object columns with fewer than ``cat_th`` distinct values
            are treated as categorical ("numeric but categorical").
        car_th: Object columns with more than ``car_th`` distinct values are
            treated as high-cardinality ("categorical but cardinal").

    Returns:
        Tuple ``(cat_cols, num_cols, cat_but_car, num_but_cat)`` where
        ``cat_cols`` is the object columns followed by the numeric-but-categorical
        ones, minus the high-cardinality columns, and ``num_cols`` is the
        non-object columns that are not numeric-but-categorical.
        A short size summary is printed as a side effect.
    """
    object_cols, num_but_cat, cat_but_car, num_cols = [], [], [], []

    # Single pass: every column lands in exactly one of object / low-cardinality
    # numeric / plain numeric; object columns may additionally be high-cardinality.
    for name in dataframe.columns:
        series = dataframe[name]
        distinct = series.nunique()
        if series.dtypes == "O":
            object_cols.append(name)
            if distinct > car_th:
                cat_but_car.append(name)
        elif distinct < cat_th:
            num_but_cat.append(name)
        else:
            num_cols.append(name)

    cat_cols = [name for name in object_cols + num_but_cat if name not in cat_but_car]

    print(f"Observations: {dataframe.shape[0]}")
    print(f"Variables: {dataframe.shape[1]}")
    print(f'cat_cols: {len(cat_cols)}')
    print(f'num_cols: {len(num_cols)}')
    print(f'cat_but_car: {len(cat_but_car)}')
    print(f'num_but_cat: {len(num_but_cat)}')

    return cat_cols, num_cols, cat_but_car, num_but_cat

In [None]:
# Classify the columns of the combined frame with the default thresholds.
cat_cols, num_cols, cat_but_car,  num_but_cat = grab_col_names(df)

In [None]:
# Inspect the detected categorical columns.
cat_cols

In [None]:
# Exclude the target from the categorical feature list. Filtering (instead of
# list.remove) keeps this cell re-runnable: remove() raises ValueError once the
# entry is already gone.
cat_cols = [col for col in cat_cols if col != "Survived"]
cat_cols

In [None]:
# Inspect the detected numerical columns.
num_cols

In [None]:
# Inspect the high-cardinality object columns.
cat_but_car

In [None]:
# Inspect the low-cardinality non-object columns.
num_but_cat

In [None]:
def cat_summary(dataframe, col_name, plot=False):
    """Print value counts and their percentage share for one column.

    Args:
        dataframe: Source DataFrame.
        col_name: Column to summarise.
        plot: When true, also draw a seaborn count plot of the column.
    """
    counts = dataframe[col_name].value_counts()
    summary = pd.DataFrame({col_name: counts,
                            'Ratio': 100 * counts / len(dataframe)})
    print(summary)
    print('##########################################')

    if not plot:
        return

    plt.figure(figsize=(12,6))
    sns.countplot(x=dataframe[col_name], data=dataframe)
    plt.show(block=True)

In [None]:
# Counts, ratios and a count plot for every categorical feature.
for col in cat_cols:
    cat_summary(df, col, plot=True)

In [None]:
# Bar chart of how many rows fall into each 'Survived' value, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 4))
sns.countplot(x='Survived', data=df, palette='Set1', ax=ax)
ax.grid(color='gray', alpha=0.25)  # faint grid lines
ax.set_title("Count OF Survived ")
plt.show()

In [None]:
def graph(feature):
    """Draw a stacked bar chart of ``feature`` value counts by survival outcome.

    Reads the notebook-level ``df``: one bar for rows with ``Survived == 1``
    and one for ``Survived == 0``, each stacked by the values of ``feature``.

    Args:
        feature: Name of the column of ``df`` to break the bars down by.
    """
    survived_mask = df['Survived'] == 1
    dead_mask = df['Survived'] == 0

    survived = df.loc[survived_mask, feature].value_counts()
    dead = df.loc[dead_mask, feature].value_counts()

    df1 = pd.DataFrame([survived, dead])
    df1.index = ['Survived','Dead']
    df1.plot.bar(stacked=True, figsize=(10,5))

In [None]:
# Survival outcome broken down by 'Pclass'.
graph('Pclass')

In [None]:
# Survival outcome broken down by 'Sex'.
graph("Sex")

In [None]:
# Survival outcome broken down by 'Embarked'.
graph('Embarked')

In [None]:
def num_summary(dataframe, numerical_col, plot=False):
    """Display descriptive statistics for one numerical column.

    The summary uses the 5/10/20/.../90/95/99 percentiles. ``display`` is
    neither defined nor imported here; it must already be available in the
    running environment.

    Args:
        dataframe: Source DataFrame.
        numerical_col: Column to describe.
        plot: When true, also draw a 20-bin histogram of the column.
    """
    quantiles = [0.05, 0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 0.95, 0.99]
    stats = dataframe[numerical_col].describe(percentiles=quantiles)
    display(stats.T)

    if not plot:
        return

    dataframe[numerical_col].hist(bins=20)
    plt.xlabel(numerical_col)
    plt.title(numerical_col)
    plt.show(block=True)

In [None]:
# Descriptive statistics and a histogram for every numerical feature.
for col in num_cols:
    num_summary(df, col, plot=True)

In [None]:
def target_summary_with_cat(dataframe, target, categorical_col, plot=False):
    """Print the mean of ``target`` for each level of ``categorical_col``.

    Args:
        dataframe: Source DataFrame.
        target: Column whose mean is reported.
        categorical_col: Column to group by.
        plot: When true, also draw a seaborn bar plot of target vs. category.
    """
    group_means = dataframe.groupby(categorical_col)[target].mean()
    print(group_means.to_frame('TARGET_MEAN'), end='\n\n\n')

    if not plot:
        return

    sns.barplot(x=categorical_col, y=target, data=dataframe)
    plt.show(block=True)

In [None]:
# Mean of 'Survived' per level of every categorical feature, with a bar plot.
for col in cat_cols:
    target_summary_with_cat(df, 'Survived', col, plot=True)

In [None]:
def target_summary_with_num(dataframe, target, numerical_col, plot=False):
    """Print the mean of ``numerical_col`` for each value of ``target``.

    Args:
        dataframe: Source DataFrame.
        target: Column to group by.
        numerical_col: Column whose per-group mean is reported.
        plot: When true, also draw a seaborn bar plot of the column vs. target.
    """
    per_target_mean = dataframe.groupby(target)[numerical_col].mean()
    print(per_target_mean.to_frame(numerical_col+'_mean'), end='\n\n\n')

    if not plot:
        return

    sns.barplot(x=target, y=numerical_col, data=dataframe)
    plt.show(block=True)

In [None]:
# Mean of each numerical feature per 'Survived' value. Numerical columns go through
# target_summary_with_num; grouping by a continuous column (as the categorical
# helper does) would yield one group per distinct value instead.
for col in num_cols:
    target_summary_with_num(df, 'Survived', col, plot=False)

In [None]:
# Histogram (50 bins) of log(1 + Survived).
# NOTE(review): presumably 'Survived' is binary, in which case the log transform only
# relocates the two bars — confirm whether this cell is still needed.
np.log1p(df["Survived"]).hist(bins=50)
plt.show(block=True)

In [None]:
def high_correlated_cols(dataframe, plot=False, corr_th=0.70):
    """List columns whose absolute correlation with an earlier column exceeds ``corr_th``.

    Only numeric and boolean columns take part; other dtypes (e.g. strings) are
    skipped so that ``corr()`` does not fail on them.

    Args:
        dataframe: Source DataFrame.
        plot: When true, draw an annotated heatmap of the (signed) correlation matrix.
        corr_th: Absolute-correlation threshold above which a column is reported.

    Returns:
        List of column names having at least one entry above ``corr_th`` in the
        strict upper triangle of the absolute correlation matrix.
    """
    numeric_frame = dataframe.select_dtypes(include=["number", "bool"])
    corr = numeric_frame.corr()
    cor_matrix = corr.abs()

    # Keep only the strict upper triangle so each pair is examined once and the
    # diagonal (always 1.0) is ignored. Built-in ``bool`` replaces the removed
    # ``np.bool`` alias.
    upper_mask = np.triu(np.ones(cor_matrix.shape, dtype=bool), k=1)
    upper_triangle_matrix = cor_matrix.where(upper_mask)

    drop_list = [col for col in upper_triangle_matrix.columns
                 if (upper_triangle_matrix[col] > corr_th).any()]

    if plot:
        sns.set(rc={'figure.figsize': (12, 8)})
        sns.heatmap(corr, cmap="RdBu", annot=True, fmt=".2f")
        plt.show()
    return drop_list

In [None]:
# Draw the correlation heatmap (the returned list of highly correlated columns is not kept),
# then show the correlation matrix restricted to the numerical columns.
high_correlated_cols(df, plot=True)
corr = df[num_cols].corr()
corr

In [None]:
# Display names for the two outcome values.
outcome_names = {0: 'Not Survived', 1: 'Survived'}

# value_counts() orders by frequency; sorting by outcome value pins the wedge
# order to 0, 1 so labels and colors cannot end up on the wrong wedge.
outcome_counts = df['Survived'].value_counts().sort_index()

# Calculate the total number of passengers with a known outcome
total_passengers = outcome_counts.sum()

# Calculate the percentages
percentages = outcome_counts / total_passengers * 100

# Create labels with both quantity and percentage, in the same order as the counts
labels = [
    f'{int(value)} - {outcome_names[int(value)]}\n({count} / {percentages[value]:.1f}%)'
    for value, count in outcome_counts.items()
]

# Plot the pie chart with labels and percentages
plt.figure(figsize=(8, 6))
plt.pie(outcome_counts, labels=labels, autopct='%1.1f%%', colors=['purple', 'lightgray'])
plt.title('Distribution of the Outcome Variable')
plt.show()

In [None]:
# Missing values per column before imputation.
df.isnull().sum()

In [None]:
# Impute by assigning the filled column back. Calling fillna(inplace=True) on
# df[col] operates on a temporary selection and leaves df untouched under pandas
# Copy-on-Write; because warnings are silenced in this notebook that would go unnoticed.
df['Age'] = df['Age'].fillna(df['Age'].median())                 # median age
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])  # most frequent value
df['Fare'] = df['Fare'].fillna(df['Fare'].median())               # median fare
df.isnull().sum()

In [None]:
def outlier_thresholds(dataframe, col_name, q1=0.05, q3=0.95):
    """Compute lower/upper outlier fences for one column.

    The fences sit 1.5 inter-quantile ranges outside the ``q1`` and ``q3``
    quantiles of the column.

    Args:
        dataframe: Source DataFrame.
        col_name: Column to analyse.
        q1: Lower quantile (fraction in [0, 1]).
        q3: Upper quantile (fraction in [0, 1]).

    Returns:
        Tuple ``(low_limit, up_limit)``.
    """
    column = dataframe[col_name]
    lower_quantile = column.quantile(q1)
    upper_quantile = column.quantile(q3)
    margin = 1.5 * (upper_quantile - lower_quantile)
    return lower_quantile - margin, upper_quantile + margin

In [None]:
def check_outlier(dataframe, col_name):
    """Report whether ``col_name`` has any value outside its outlier fences.

    Note: this notebook defines ``check_outlier`` again in a later cell (with an
    extra ``plot`` option); once that cell runs, it replaces this definition.

    Args:
        dataframe: Source DataFrame.
        col_name: Column to test.

    Returns:
        ``True`` if at least one value lies below ``low_limit`` or above
        ``up_limit`` as returned by ``outlier_thresholds``, else ``False``.
    """
    low_limit, up_limit = outlier_thresholds(dataframe, col_name)
    column = dataframe[col_name]
    # Test the mask itself: checking the truthiness of the selected rows would
    # miss an outlier row whose values are all zero/NaN.
    return bool(((column > up_limit) | (column < low_limit)).any())

In [None]:
def check_outlier(dataframe, col_name, plot=False):
    """Report whether ``col_name`` has any value outside its outlier fences.

    Args:
        dataframe: Source DataFrame.
        col_name: Column to test.
        plot: When true and outliers exist, draw a box plot of the column.

    Returns:
        ``True`` if at least one value lies below ``low_limit`` or above
        ``up_limit`` as returned by ``outlier_thresholds``, else ``False``.
    """
    low_limit, up_limit = outlier_thresholds(dataframe, col_name)
    column = dataframe[col_name]
    # Test the mask itself: checking the truthiness of the selected rows would
    # miss an outlier row whose values are all zero/NaN.
    has_outliers = bool(((column > up_limit) | (column < low_limit)).any())

    if has_outliers and plot:
        plt.figure(figsize=(8, 6))
        sns.boxplot(x=column)
        plt.title(f'Outliers in {col_name}')
        plt.show()
    return has_outliers

In [None]:
def replace_with_thresholds(dataframe, variable, q1=0.05, q3=0.95):
    """Clip ``variable`` in place to its outlier fences.

    Values below the lower fence are set to the lower fence and values above
    the upper fence are set to the upper fence. ``dataframe`` is modified in
    place; nothing is returned.

    Args:
        dataframe: DataFrame to modify.
        variable: Column to clip.
        q1: Lower quantile forwarded to ``outlier_thresholds``.
        q3: Upper quantile forwarded to ``outlier_thresholds``.
    """
    # Forward the caller's quantiles; previously 0.05/0.95 were hard-coded here,
    # so the q1/q3 arguments had no effect.
    low_limit, up_limit = outlier_thresholds(dataframe, variable, q1=q1, q3=q3)
    dataframe.loc[(dataframe[variable] < low_limit), variable] = low_limit
    dataframe.loc[(dataframe[variable] > up_limit), variable] = up_limit

In [None]:
# Check each column once (a second check_outlier(plot=True) call would draw every
# box plot twice), report the result, and cap the values when outliers exist.
for col in ['Age', 'Fare']:
    has_outliers = check_outlier(df, col, plot=True)
    print(col, has_outliers)
    if has_outliers:
        replace_with_thresholds(df, col)

In [None]:
# Re-check both columns after capping.
for col in ['Age', 'Fare']:
    print(col, check_outlier(df, col))

In [None]:
# Family-size features: total accompanying relatives, and a flag for passengers
# with neither siblings/spouses nor parents/children recorded.
df["TotalFamily"] = df["SibSp"].add(df["Parch"])
df['Alone'] = df['SibSp'].eq(0) & df['Parch'].eq(0)

In [None]:

# Left-closed age bands (right=False): [0, 2) Baby, [2, 17) Child, [17, 65) Adult, [65, 100) Elderly.
bins= [0,2,17,65,100]
labels = ['Baby','Child','Adult','Elderly']
df['AgeGroup'] = pd.cut(df['Age'], bins=bins, labels=labels, right=False)

In [None]:
# Right-closed fare bands (right=True): (-1, 130] Low, (130, 260] Medium,
# (260, 390] High, (390, 520] Very High. `bins` and `labels` are reassigned here.
bins= [-1,130,260,390,520]
labels = ['Low','Medium','High','Very High']
df['FareGroup'] = pd.cut(df['Fare'], bins=bins, labels=labels, right=True)
df.head(2)

In [None]:
# Re-classify the columns now that the engineered features exist.
cat_cols, num_cols, cat_but_car,  num_but_cat = grab_col_names(df)

In [None]:
# Inspect the refreshed categorical column list.
cat_cols

In [None]:
# Columns to one-hot encode. The list is set by hand and replaces the auto-detected
# one; the target 'Survived' is deliberately absent. Assigning the list directly
# (with no list.remove beforehand) keeps this cell re-runnable.
cat_cols = ['Pclass', 'SibSp', 'Parch', 'TotalFamily', 'Alone', 'AgeGroup', 'FareGroup', 'Embarked']

In [None]:

def label_encoder(dataframe, binary_col):
    """Replace ``binary_col`` with integer codes from a fresh ``LabelEncoder``.

    The column is overwritten on the passed-in DataFrame, which is also returned.

    Args:
        dataframe: DataFrame to modify.
        binary_col: Name of the column to encode.

    Returns:
        The same ``dataframe`` object with ``binary_col`` encoded.
    """
    encoded_values = LabelEncoder().fit_transform(dataframe[binary_col])
    dataframe[binary_col] = encoded_values
    return dataframe

In [None]:
# Object-dtype columns with exactly two distinct values: candidates for label encoding.
binary_cols = [col for col in df.columns if df[col].dtypes == "O" and df[col].nunique() == 2]
binary_cols

In [None]:
# Label-encode every binary column, then preview one row.
for col in binary_cols:
    df = label_encoder(df, col)
df.head(1)

In [None]:
def one_hot_encoder(dataframe, categorical_cols, drop_first=False):
    """Return a copy of ``dataframe`` with ``categorical_cols`` one-hot encoded.

    Thin wrapper around ``pd.get_dummies``; the input frame is not modified.

    Args:
        dataframe: Source DataFrame.
        categorical_cols: Columns to expand into indicator columns.
        drop_first: When true, omit the first level of each encoded column.

    Returns:
        New DataFrame with indicator columns replacing ``categorical_cols``.
    """
    return pd.get_dummies(dataframe, columns=categorical_cols, drop_first=drop_first)

In [None]:
# One-hot encode the categorical features, dropping the first level of each.
df = one_hot_encoder(df, cat_cols, drop_first=True)

In [None]:
# Preview the encoded frame.
df.head(10)

In [None]:
# Columns to standardise; set by hand, replacing the auto-detected numerical list.
num_cols = ['Age', 'Fare']
num_cols

In [None]:
# Standardise the numerical columns.
# NOTE(review): the scaler is fitted on the combined frame, which is split back into
# train/test only in a later cell, so test rows influence the scaling statistics —
# confirm this is intended.
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])
df.head(2)

In [None]:
# Undo the earlier concatenation: the first `first_train` rows came from the
# training file, the remaining rows from the test file.
train = df.iloc[:first_train]

test_1 = df.iloc[first_train:]

In [None]:
# Target vector and feature matrix taken from the labelled rows.
y = train['Survived']
X = train.drop(columns='Survived')

# Hold out 30% of the labelled rows for evaluation (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=17,
)

In [None]:
# Candidate classifiers, each paired with the name shown in the results table.
models = [
    ('LogisticRegression', LogisticRegression(max_iter=1000)),
    ('KNeighborsClassifier', KNeighborsClassifier()),
    ('GaussianNB', GaussianNB()),
    ('DecisionTreeClassifier', DecisionTreeClassifier()),
    ('RandomForestClassifier', RandomForestClassifier()),
    ('AdaBoostClassifier', AdaBoostClassifier()),
    ('BaggingClassifier', BaggingClassifier()),
    ('GradientBoostingClassifier', GradientBoostingClassifier()),
    ('XGBClassifier', XGBClassifier(use_label_encoder=False, eval_metric='logloss')),
    ('SVC', SVC(probability=True))
]

# Fit every candidate on the training split and record its accuracy on both splits.
model_names = []
accuracies = []
train_accuracies = []
for name, model in models:
    model.fit(X_train, y_train)
    model_names.append(name)
    train_accuracies.append(accuracy_score(y_train, model.predict(X_train)))
    accuracies.append(accuracy_score(y_test, model.predict(X_test)))

# One row per model: train vs. held-out accuracy.
model_performance = pd.DataFrame({
    'Model': model_names,
    'Train Accuracy': train_accuracies,
    'Test Accuracy': accuracies,
})
model_performance


In [None]:
# Logistic regression with default settings. `model` is rebound here, so this is the
# estimator later cells see under that name.
model = LogisticRegression().fit(X_train, y_train)
Y_pred = model.predict(X_test)
# Accuracy on the training split, as a percentage rounded to two decimals.
acc_log = round(100 * model.score(X_train, y_train), 2)
acc_log

In [None]:
# Fit a CatBoost regressor (RMSE loss, silent logging) and predict on the held-out split.
# NOTE(review): a regressor is fitted on y_train, which the other cells treat as a class
# label — confirm this is intentional.
cb_model = CatBoostRegressor(loss_function='RMSE', logging_level='Silent')
cb_model.fit(X_train, y_train)
preds = cb_model.predict(X_test) 

In [None]:
# Regression error metrics for the CatBoost predictions on the held-out split,
# followed by a one-row preview of the labelled frame.
print('MAE:', metrics.mean_absolute_error(y_test, preds))
print('MSE:', metrics.mean_squared_error(y_test, preds))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, preds)))
train.head(1)

In [None]:
# One-row preview of the rows that came from the test file.
test_1.head(1)

In [None]:
rf_model = RandomForestClassifier(random_state=46).fit(X_train, y_train)

# Prediction using Random Forest Classifier Model
y_pred = rf_model.predict(X_test)
# Probability of the positive class, used as the ranking score for ROC AUC.
y_proba = rf_model.predict_proba(X_test)[:, 1]

# scikit-learn metrics take (y_true, y_pred). With the arguments reversed, precision
# and recall swap places and AUC treats the predictions as ground truth.
print("RandomForestClassifier:")
print(f"Accuracy: {round(accuracy_score(y_test, y_pred), 4)}")
print(f"Recall: {round(recall_score(y_test, y_pred), 4)}")
print(f"Precision: {round(precision_score(y_test, y_pred), 4)}")
print(f"F1: {round(f1_score(y_test, y_pred), 4)}")
print(f"Auc: {round(roc_auc_score(y_test, y_proba), 4)}")

In [None]:
def plot_importance(model, features, num=None, save=False):
    """Plot a fitted model's feature importances as a horizontal bar chart.

    Args:
        model: Fitted estimator exposing ``feature_importances_``.
        features: DataFrame whose columns name the features, in the same order
            as ``model.feature_importances_``.
        num: Number of top features to show; ``None`` (default) shows all.
            The default no longer depends on a global ``X`` existing when this
            function is defined.
        save: When true, write the figure to ``importances.png``.
    """
    feature_imp = pd.DataFrame({'Value': model.feature_importances_, 'Feature': features.columns})
    ranked = feature_imp.sort_values(by="Value", ascending=False)
    if num is not None:
        ranked = ranked[0:num]

    plt.figure(figsize=(10, 10))
    sns.set(font_scale=1)
    sns.barplot(x="Value", y="Feature", data=ranked)
    plt.title(f'Feature Importance - {model.__class__.__name__}')
    plt.tight_layout()
    # Save before showing: after show() returns, the current figure may be a new,
    # empty one, so saving afterwards can write a blank image.
    if save:
        plt.savefig('importances.png')
    plt.show(block=True)

In [None]:
# Only rf_model is fitted earlier in the notebook, so the other tree-based models are
# fitted here (same seed as rf_model); otherwise this cell raises NameError on a fresh kernel.
dt_model = DecisionTreeClassifier(random_state=46).fit(X_train, y_train)
xgb_model = XGBClassifier(eval_metric='logloss', random_state=46).fit(X_train, y_train)
lgbm_model = LGBMClassifier(random_state=46).fit(X_train, y_train)

model_name = [rf_model, dt_model, xgb_model, lgbm_model]
for i in model_name:
    plot_importance(i, X)

In [None]:
# Remove the target column from the test rows. errors="ignore" keeps the cell
# re-runnable (a second plain drop would raise KeyError); preview a few rows
# instead of rendering the whole frame.
test_1 = test_1.drop(columns="Survived", errors="ignore")
test_1.head()

In [None]:
# Predict with whatever estimator `model` currently refers to.
# NOTE(review): on a top-to-bottom run that is the LogisticRegression fitted earlier — confirm
# this is the model intended for the submission.
# The predicted labels are cast to int so the submission holds 0/1 instead of 0.0/1.0
# when the target column is stored as float.
Sub = model.predict(test_1).astype(int)
Sub

In [None]:
# Pair the test PassengerId values saved earlier with the predictions.
submission = pd.DataFrame({
        "PassengerId": sub_id,
        "Survived": Sub
    })

submission.head()

In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv('submission.csv',index=False)

In [None]:
import zipfile
import os

def create_zip(source_path, zip_file_path):
    """Write ``source_path`` into a new deflate-compressed zip archive.

    A file is stored under its base name. A directory is walked recursively and
    each file is stored under its path relative to ``source_path``. If
    ``source_path`` is neither, the archive is still created but left empty.

    Args:
        source_path: File or directory to archive.
        zip_file_path: Destination archive path (overwritten if it exists).
    """
    with zipfile.ZipFile(zip_file_path, mode='w', compression=zipfile.ZIP_DEFLATED) as archive:
        if os.path.isfile(source_path):
            archive.write(source_path, arcname=os.path.basename(source_path))
            return

        if not os.path.isdir(source_path):
            return

        for folder, _subdirs, filenames in os.walk(source_path):
            for filename in filenames:
                full_path = os.path.join(folder, filename)
                archive.write(full_path, arcname=os.path.relpath(full_path, source_path))

# Archive the 'catboost_info' path into 'catboost_info.zip'.
# NOTE(review): presumably this is the log directory written by the CatBoost fit above — confirm.
# If the path does not exist, create_zip still produces an (empty) archive.
source_path = 'catboost_info' 
zip_file_path = 'catboost_info.zip'  
create_zip(source_path, zip_file_path)