<h1>1. Raw Exploratory Data Analysis :</h1>

In [None]:
#exploring the data set before making any changes

* Importing necessary libraries :

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

import numpy as np 
import pandas as pd 
import os
import matplotlib.pyplot as plt
import seaborn as sns

# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

* Loading Datasets :


In [None]:
# Read the three competition CSVs from one shared input directory.
DATA_DIR = "/kaggle/input/spaceship-titanic"
train_df, test_df, sample_df = (
    pd.read_csv(f"{DATA_DIR}/{csv_name}")
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

* Checking dimensions of data:

In [None]:
# Report (rows, columns) for each loaded frame.
for label, frame in (("Training", train_df), ("Testing", test_df), ("Sample", sample_df)):
    print(f"{label} Dataset shape is: ", frame.shape)

* Checking the first few rows of the dataset:

In [None]:
# Show the first 15 rows to see what the raw values look like in each column.
train_df.head(15)

* getting a quick overview of the features

In [None]:
# Print the per-column summary (non-null counts and dtypes) of the training frame.
train_df.info()

* Checking the data types of each column:

In [None]:
# dtype of every training column, as a Series indexed by column name.
train_df.dtypes

* Checking for missing values

In [None]:
# Number of missing entries in each training column.
train_df.isnull().sum()

* Checking for unique values

In [None]:
# Number of distinct values in each training column.
train_df.nunique()

* Exploring numerical features using summary statistics:

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the training frame.
train_df.describe()

* Visualizing distributions of numerical features using histograms:

In [None]:
# Numeric columns whose raw distributions are inspected below.
numerical_features = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

for feature in numerical_features:
    # The raw frame has not been imputed yet, so drop missing entries explicitly:
    # the bin range is then always derived from finite values only.
    observed_values = train_df[feature].dropna()

    fig, ax = plt.subplots(figsize=(6, 3))
    ax.hist(observed_values, bins=20, color='skyblue', edgecolor='black')
    ax.set(title=f'Distribution of {feature}', xlabel=feature, ylabel='Frequency')
    ax.grid(True)
    plt.show()

-> The distributions indicate that directly incorporating these features into the model might compromise its performance. To address this, we may consider using only the algorithms that are unaffected by outliers

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# One box plot per numeric feature, split by the target, laid out side by side.
box_features = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
fig, axes = plt.subplots(nrows=1, ncols=6, figsize=(15, 5))

for ax, feature in zip(axes, box_features):
    sns.boxplot(x='Transported', y=feature, data=train_df, ax=ax)
    ax.set_title(f'{feature} Distribution by Transported')
    ax.set_xlabel('Transported')
    ax.set_ylabel(feature)

plt.tight_layout()
plt.show()

-> Indeed the continuous features exhibit skewness and contain outliers. Hence, we may explore techniques such as log transformations to address these issues.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# One violin plot per amenity, each on its own figure with its own palette.
violin_specs = [('Spa', 'Set1'), ('VRDeck', 'Set2'), ('RoomService', 'Set3')]

for amenity, palette_name in violin_specs:
    plt.figure(figsize=(8, 6))
    sns.violinplot(data=train_df, x='Transported', y=amenity, palette=palette_name)
    plt.title(f'Impact of {amenity} on Transported')

# tight_layout acts on the current figure, i.e. the last one created above.
plt.tight_layout()
plt.show()

In these violin plots, the width of the plot at each value indicates the probability density of the data, giving insight into how each feature relates to the target feature.


-> After reviewing the plots of spending on Spa, VRDeck, and RoomService, we see a distinct separation between the classes: passengers who spent less on these amenities are predominantly classified as Transported. This suggests creating a new feature representing total expenditure across these amenities.

* Visualizing categorical features distributions :

In [None]:
# Low-cardinality categorical columns to chart.
categorical_features = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']

for feature in categorical_features:
    # One figure per column: how many passengers fall in each category.
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.countplot(data=train_df, x=feature, palette='viridis', ax=ax)
    ax.set_title(f'Count Plot of {feature}')
    ax.set_xlabel(feature)
    ax.set_ylabel('Count')
    plt.xticks(rotation=45)
    plt.show()


* Now let's visualize the relationship between categorical features and the target feature "transported" using count plots :

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

categorical_features = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']

for feature in categorical_features:
    # Same counts as before, but each category is split by the target value.
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.countplot(data=train_df, x=feature, hue='Transported', palette='viridis', ax=ax)
    ax.set_title(f'Count Plot of {feature} vs Transported')
    ax.set_xlabel(feature)
    ax.set_ylabel('Count')
    plt.xticks(rotation=45)
    ax.legend(title='Transported', loc='upper right')
    plt.show()

-> CryoSleep shows a clear difference in proportions: people who were in CryoSleep during the voyage are more likely to be Transported.

In [None]:
# Non-numeric columns, including the identifier-like ones.
categorical_features = ['PassengerId','HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP','Name']

# nunique() on the sub-frame gives all cardinalities in one call, in list order.
for feature, cardinality in train_df[categorical_features].nunique().items():
    print(f"Cardinality of '{feature}': {cardinality}")


* Checking the distribution of the target variable :

In [None]:
# Class balance of the target.
fig, ax = plt.subplots(figsize=(4, 2))
sns.countplot(data=train_df, x='Transported', palette='Set3', ax=ax)
ax.set(title='Distribution of Transported', xlabel='Transported', ylabel='Count')
plt.show()

-> Since the classes are balanced, accuracy is a suitable evaluation metric,
because accuracy measures the overall correctness of the model by counting both true positives (TP) and true negatives (TN).

<h1>2. Data Preprocessing:</h1>

In [None]:
# For convenience, we apply the preprocessing to train_df and test_df at the same time.

* Missing values:

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Share of missing entries per column, in percent (mean of the boolean mask * 100).
missing_percentage = train_df.isnull().mean() * 100

# Horizontal bars: one per column, length = percent missing.
fig, ax = plt.subplots(figsize=(4, 3))
sns.barplot(x=missing_percentage.values, y=missing_percentage.index, palette='viridis', ax=ax)
ax.set(
    xlabel='Percentage of Missing Values',
    ylabel='Features',
    title='Percentage of Missing Values in Each Feature',
)
plt.show()


-> PassengerId and Transported don't have any missing values.

In [None]:
# Boolean mask of missing cells, shared by both heatmaps.
missing_mask = train_df.isnull()

# Where the gaps are: one row per passenger, one column per feature.
plt.figure(figsize=(6, 3))
sns.heatmap(missing_mask, cmap='viridis', cbar=False)
plt.title('Missing Value Distribution')
plt.show()

# Whether gaps in one column tend to coincide with gaps in another.
plt.figure(figsize=(6, 3))
sns.heatmap(missing_mask.corr(), annot=True, cmap='coolwarm')
plt.title('Correlation between Missing Values')
plt.show()

-> Missing values are independent of the target and for the most part are isolated. Even though only 2% of the data is missing, about 25% of all passengers have at least 1 missing value

* ***Handling missing values: imputing***

-> it is reasonable to attempt to fill in these missing values rather than simply discarding rows.

* Separating the numerical and nominal attributes 

In [None]:


# Split each frame by dtype: numeric columns on one side, everything else on the other.
numeric_kinds = [np.number]

numerical_features, nominal_features = (
    train_df.select_dtypes(include=numeric_kinds),
    train_df.select_dtypes(exclude=numeric_kinds),
)
test_numerical_features, test_nominal_features = (
    test_df.select_dtypes(include=numeric_kinds),
    test_df.select_dtypes(exclude=numeric_kinds),
)

# Column counts of the two training halves (messages kept as in the original output).
n_numeric_cols = numerical_features.shape[1]
n_nominal_cols = nominal_features.shape[1]
print("Nombre de colonnes numériques :", n_numeric_cols)
print("Nombre de colonnes non numériques (catégorielles) :", n_nominal_cols)

* applying KNN imputer because it can capture more complex patterns in the data compared to simple imputation methods like median or mean imputation :

In [None]:
from sklearn.impute import KNNImputer

# Fill each missing numeric value from the 5 most similar rows.
knn_imputer = KNNImputer(n_neighbors=5)

# Fit on the training rows only, then fill the training gaps.
numerical_features_imputed = pd.DataFrame(
    knn_imputer.fit_transform(numerical_features),
    columns=numerical_features.columns,
)

# Reuse the imputer fitted on train for the test set. Re-fitting on test would
# impute the two sets from different reference rows and let test statistics
# shape the preprocessing.
test_numerical_features_imputed = pd.DataFrame(
    knn_imputer.transform(test_numerical_features),
    columns=test_numerical_features.columns,
)

# Sanity check: every count should now be zero.
print(numerical_features_imputed.isnull().sum())

In [None]:
# Alternative numeric imputation with IterativeImputer, kept for reference only.
# It is written as comments rather than a bare triple-quoted string so that the
# cell does not echo the text as output; the test set is transformed with the
# imputer fitted on train.
#
# from sklearn.experimental import enable_iterative_imputer
# from sklearn.impute import IterativeImputer
#
# imputer = IterativeImputer(random_state=0)
#
# numerical_features_imputed = pd.DataFrame(imputer.fit_transform(numerical_features), columns=numerical_features.columns)
# test_numerical_features_imputed = pd.DataFrame(imputer.transform(test_numerical_features), columns=test_numerical_features.columns)
# print(numerical_features_imputed.isnull().sum())

In [None]:
from sklearn.impute import SimpleImputer

# Replace missing non-numeric entries with each column's most frequent value.
nom_imputer = SimpleImputer(strategy="most_frequent")
nom_imputer.fit(nominal_features)

filled_nominal = nom_imputer.transform(nominal_features)
nominal_features_imputed = pd.DataFrame(filled_nominal, columns=nominal_features.columns)

# Sanity check: every count should be zero.
nominal_features_imputed.isnull().sum()

In [None]:
 
# Impute the test categoricals with the most frequent values of the TRAINING data.
# A dedicated imputer is fitted on just the training columns that also exist in
# the test frame (train may carry extra non-numeric columns such as the target).
# `nom_imputer`, the train imputer, is deliberately left untouched.
test_nom_imputer = SimpleImputer(strategy="most_frequent")

test_nom_imputer.fit(nominal_features[test_nominal_features.columns])

test_nominal_features_imputed = pd.DataFrame(
    test_nom_imputer.transform(test_nominal_features),
    columns=test_nominal_features.columns,
)
 

In [None]:
# Stitch the imputed numeric and non-numeric halves back together, side by side.
train_df = pd.concat([numerical_features_imputed, nominal_features_imputed], axis="columns")
test_df = pd.concat([test_numerical_features_imputed, test_nominal_features_imputed], axis="columns")


* ***Feature engineering: creating new features***

-> creating family size feature using the last name from name feature :

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Family-size feature: passengers sharing a last name are counted as one family.
for frame in (train_df, test_df):
    # The last space-separated token of the full name is taken as the family name.
    frame['FamilyName'] = frame['Name'].str.split(' ').str[-1]
    # Each row receives the number of rows in the same frame with that family name.
    name_counts = frame['FamilyName'].value_counts()
    frame['FamilySize'] = frame['FamilyName'].map(name_counts).astype(float)

# How many distinct families appear in the training data.
num_family_names = train_df['FamilyName'].nunique()

# Number of passengers at each family size.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=train_df, x='FamilySize', ax=ax)
ax.set(xlabel='Family Size', ylabel='Count', title='Distribution of Family Size')
plt.show()

print("Number of unique family names:", num_family_names)



In [None]:
# Passenger counts per family size, split by the target.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=train_df, x='FamilySize', hue='Transported', ax=ax)

# countplot draws raw counts, so the y-axis is labelled as a count, not a proportion.
ax.set(
    xlabel='Family Size',
    ylabel='Count',
    title='Impact of Family Size on Transported Feature',
)
plt.show()

In [None]:
# FamilySize has been derived; the raw and helper name columns are no longer needed.
name_columns = ['FamilyName', 'Name']
train_df = train_df.drop(columns=name_columns)
test_df = test_df.drop(columns=name_columns)

-> Creating a new age-group feature from the Age feature:

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


def _age_group(age):
    """Return a Series of age-bracket labels aligned with the numeric Series ``age``.

    Brackets: <=12, (12, 18), [18, 25], (25, 30], (30, 50], >50.
    Rows that match no bracket (e.g. a NaN age) stay NaN.
    """
    # Start from an object-dtype column: writing string labels into a float
    # column of NaNs triggers pandas' incompatible-dtype warning.
    groups = pd.Series(np.nan, index=age.index, dtype=object)
    groups[age <= 12] = 'Age_0-12'
    groups[(age > 12) & (age < 18)] = 'Age_13-17'
    groups[(age >= 18) & (age <= 25)] = 'Age_18-25'
    groups[(age > 25) & (age <= 30)] = 'Age_26-30'
    groups[(age > 30) & (age <= 50)] = 'Age_31-50'
    groups[age > 50] = 'Age_51+'
    return groups


# Same bracket definition for both frames.
train_df['Age_group'] = _age_group(train_df['Age'])
test_df['Age_group'] = _age_group(test_df['Age'])

# Plot distribution of new features
age_group_order = ['Age_0-12', 'Age_13-17', 'Age_18-25', 'Age_26-30', 'Age_31-50', 'Age_51+']
plt.figure(figsize=(10, 4))
sns.countplot(data=train_df, x='Age_group', hue='Transported', order=age_group_order)
plt.title('Age group distribution')
plt.show()


1. -> Grouping ages helps simplify the data, handle outliers, improve interpretability, and potentially enhance model performance.
2. -> We can clearly see that passengers aged between 31 and 50 are the most transported.

In [None]:
# Total spend across the five amenities, plus a 0/1 flag for passengers who spent nothing.
exp_feats=['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

for frame in (train_df, test_df):
    total_spent = frame[exp_feats].sum(axis=1)
    frame['Total_Expenditure'] = total_spent
    frame['No_spending'] = total_spent.eq(0).astype(float)

# Spenders vs. non-spenders, split by the target.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=train_df, x='No_spending', hue='Transported', ax=ax)
ax.set_title('No spending indicator')
fig.tight_layout()

In [None]:
# Extract the travel-group id (the part of PassengerId before '_') and the size of
# each passenger's group, separately for each frame.
for frame in (train_df, test_df):
    frame['Group'] = frame['PassengerId'].str.split('_').str[0].astype(float)
    # transform('count') computes every group's size in a single pass; calling
    # value_counts() inside a per-row lambda would rebuild the whole table for
    # each passenger (quadratic in the number of rows).
    frame['Group_size'] = frame.groupby('Group')['Group'].transform('count').astype(float)

# Passenger counts per group size, split by the target.
fig = plt.figure(figsize=(6, 4))
sns.countplot(data=train_df, x='Group_size', hue='Transported')
plt.title('Group size')
fig.tight_layout()

-> people in smaller groups are more transported 

In [None]:
# Flag passengers who are the only member of their group (1.0) versus not (0.0).
for frame in (train_df, test_df):
    frame['Solo_Traveling'] = frame['Group_size'].eq(1).astype(float)

# Quick look at the new column on the test frame.
test_df['Solo_Traveling'].head()

Each Id takes the form gggg_pp, where gggg indicates the group the passenger is travelling with and pp is their number within the group.
We drop the group number, however, because of its high cardinality.


In [None]:
# The identifier and the raw group id are not used as model inputs.
id_columns = ['PassengerId', 'Group']
train_df = train_df.drop(columns=id_columns)
test_df = test_df.drop(columns=id_columns)

In [None]:
# Split each 'deck/number/side' cabin string into three separate features.
for frame in (train_df, test_df):
    cabin = frame['Cabin']
    frame['Cabin_deck'] = cabin.map(lambda value: value.split('/')[0])
    # Values without a '/' yield NaN for the number and side parts.
    frame['Cabin_number'] = cabin.map(
        lambda value: value.split('/')[1] if '/' in value else np.nan
    ).astype(float)
    frame['Cabin_side'] = cabin.map(
        lambda value: value.split('/')[2] if '/' in value else np.nan
    )

# The raw column is fully replaced by its three parts.
del train_df['Cabin']
del test_df['Cabin']

# Deck (top panel) and side (bottom panel), each split by the target.
fig, (deck_ax, side_ax) = plt.subplots(2, 1, figsize=(5, 5))
sns.countplot(
    data=train_df, x='Cabin_deck', hue='Transported',
    order=['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T'], ax=deck_ax,
)
deck_ax.set_title('Cabin deck')

sns.countplot(data=train_df, x='Cabin_side', hue='Transported', ax=side_ax)
side_ax.set_title('Cabin side')
fig.tight_layout()
plt.show()

In [None]:
# Bucket cabin numbers into seven regions (300 wide, open-ended at both extremes),
# stored as one 0/1 float column per region.
region_bounds = [
    (-np.inf, 300), (300, 600), (600, 900), (900, 1200),
    (1200, 1500), (1500, 1800), (1800, np.inf),
]
region_columns = [f'Cabin_region{k}' for k in range(1, len(region_bounds) + 1)]

for frame in (train_df, test_df):
    cabin_number = frame['Cabin_number']
    for column, (low, high) in zip(region_columns, region_bounds):
        # Lower bound inclusive, upper bound exclusive.
        frame[column] = ((cabin_number >= low) & (cabin_number < high)).astype(float)

# Collapse the indicators into a single region index (1-7) just for plotting.
plt.figure(figsize=(10, 4))
train_df['Cabin_regions_plot'] = sum(
    weight * train_df[column] for weight, column in enumerate(region_columns, start=1)
).astype(float)
sns.countplot(data=train_df, x='Cabin_regions_plot', hue='Transported')
plt.title('Cabin regions')
del train_df['Cabin_regions_plot']


The location of the individual cabins also had an impact on the transportation of people.


-> checking the features that we added:

In [None]:
# Column dtypes of the training frame after feature engineering.
train_df.dtypes

In [None]:
# Column dtypes of the test frame after feature engineering.
test_df.dtypes

<h1> Data type separation, Transformation, Encoding, and Scaling</h1>

In [None]:
# Re-split both frames by dtype now that feature engineering has added columns.
numeric_kinds = [np.number]

new_numerical_features, new_nominal_features = (
    train_df.select_dtypes(include=numeric_kinds),
    train_df.select_dtypes(exclude=numeric_kinds),
)
new_test_numerical_features, new_test_nominal_features = (
    test_df.select_dtypes(include=numeric_kinds),
    test_df.select_dtypes(exclude=numeric_kinds),
)

# Column counts of the two training halves (messages kept as in the original output).
n_numeric_cols = new_numerical_features.shape[1]
n_nominal_cols = new_nominal_features.shape[1]
print("numériques :", n_numeric_cols)
print("catégorielles :", n_nominal_cols)

In [None]:
# dtypes of the non-numeric training columns selected above.
new_nominal_features.dtypes

In [None]:
# First rows of the training frame before any transformation is applied.
train_df.head()

In [None]:
# Apply log1p (log(1 + x)) to every numeric training column in one assignment.
train_numeric_columns = list(new_numerical_features.columns)
train_df[train_numeric_columns] = np.log1p(train_df[train_numeric_columns].to_numpy())
train_df.head()

In [None]:
# Same log1p transform for the numeric test columns.
test_numeric_columns = list(new_test_numerical_features.columns)
test_df[test_numeric_columns] = np.log1p(test_df[test_numeric_columns].to_numpy())
test_df.head()

* Encoding categorical variables: Using label encoding to convert categorical variables into numerical format 

In [None]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

# Replace each non-numeric training column with integer codes.
# The single encoder is re-fitted for every column in turn.
for column_name in new_nominal_features.columns:
    encoded_values = label_encoder.fit_transform(train_df[column_name])
    train_df[column_name] = encoded_values
train_df.head()

In [None]:
# First rows of the test frame before its categorical columns are encoded.
test_df.head()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the test categoricals with the SAME label -> integer mapping as train.
# `new_nominal_features` was taken from train before train was encoded, so it
# still holds the original category values; an encoder fitted on it reproduces
# the training classes. Fitting on the test column instead could assign a
# different integer to the same category whenever the two sets of values differ.
# A category present in test but absent from train raises ValueError here.
for feature in new_test_nominal_features:
    test_label_encoder = LabelEncoder().fit(new_nominal_features[feature])
    test_df[feature] = test_label_encoder.transform(test_df[feature])
test_df.head()

-> Checking correlation after encoding categorical features:


* Splitting the data into features (X) and the target variable (y):

In [None]:
# Separate the target column from the predictors.
y = train_df['Transported']
X = train_df.drop(columns='Transported')

# Printing to verify the split
for label, part in (("X", X), ("y", y)):
    print(f"Shape of {label}:", part.shape)

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Remember the training column order before X is replaced by a plain array.
feature_columns = X.columns

# Learn mean and standard deviation from the training features only.
X = scaler.fit_transform(X)

# Apply the training statistics to the test set. Re-fitting on test would put
# the two sets on different scales, so a model trained on X would receive
# inputs that do not mean the same thing at prediction time.
test_df = scaler.transform(test_df[feature_columns])

In [None]:
from sklearn.decomposition import PCA

# A float n_components asks PCA for the fewest components that together explain
# that share of the variance (95% here).
VARIANCE_TO_KEEP = 0.95
pca = PCA(n_components=VARIANCE_TO_KEEP)
X_pca = pca.fit_transform(X)

# NOTE(review): the visible cells below split and train on X, not X_pca — confirm
# whether the projection is meant to be used.
print("Shape of X after PCA:", X_pca.shape)


<h1>3. Model Selection:</h1>

In [None]:
# Hold out 20% of the labelled rows for evaluation. random_state fixes the shuffle
# so the split is reproducible. X here is the standardized feature array.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

<h2> 1) Logistic-Regression. </h2>

In [None]:
# Fit a logistic-regression baseline; max_iter is raised so the solver has room to converge.
logistic_regression_model = LogisticRegression(max_iter=10000)
logistic_regression_model.fit(X_train, y_train)

# Score on the hold-out split.
y_pred = logistic_regression_model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

<h2> 2) Support Vector Machine. </h2>

In [None]:
from sklearn.svm import SVC

# Support-vector classifier with scikit-learn's default settings.
svm_model = SVC()
svm_model.fit(X_train, y_train)

# Score on the hold-out split.
y_pred = svm_model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

# Per-class precision, recall and F1.
print(classification_report(y_test, y_pred))

<h2> 3) Naive Bayes. </h2>

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings.
naive_bayes_model = GaussianNB()
naive_bayes_model.fit(X_train, y_train)

# Score on the hold-out split.
y_pred = naive_bayes_model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

# Per-class precision, recall and F1.
print(classification_report(y_test, y_pred))

# Tree based ML models :

<h2> 4) Decision trees </h2>

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree. The seed is fixed (same value as the train/test split)
# so the reported accuracy is identical on every Restart & Run All.
decision_tree_model = DecisionTreeClassifier(random_state=42)
decision_tree_model.fit(X_train, y_train)

# Score on the hold-out split.
y_pred = decision_tree_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Per-class precision, recall and F1.
print(classification_report(y_test, y_pred))

<h2> 5) ADA Boost</h2>

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost with default settings. The seed is fixed (same value as the
# train/test split) so the reported accuracy is reproducible across runs.
adaboost_model = AdaBoostClassifier(random_state=42)
adaboost_model.fit(X_train, y_train)

# Score on the hold-out split.
y_pred = adaboost_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Per-class precision, recall and F1.
print(classification_report(y_test, y_pred))


<h2> 6) gradient Boost</h2>

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with default settings. The seed is fixed (same value as the
# train/test split) so the reported accuracy is reproducible across runs.
gradientboost_model = GradientBoostingClassifier(random_state=42)
gradientboost_model.fit(X_train, y_train)

# Score on the hold-out split.
y_pred = gradientboost_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Per-class precision, recall and F1.
print(classification_report(y_test, y_pred))


<h2> 7) Random Forest</h2>

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default settings. The seed is fixed (same value as the
# train/test split) so the reported accuracy is reproducible across runs.
randomforest_model = RandomForestClassifier(random_state=42)
randomforest_model.fit(X_train, y_train)

# Score on the hold-out split.
y_pred = randomforest_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Per-class precision, recall and F1.
print(classification_report(y_test, y_pred))


<h2>8) XGBoost</h2>

In [None]:
import xgboost as xgb

# XGBoost classifier with library defaults.
xgboost_model = xgb.XGBClassifier()
xgboost_model.fit(X_train, y_train)

# Score on the hold-out split.
y_pred = xgboost_model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)



<h2>9) LightGBM:</h2>

In [None]:
import lightgbm as lgb

# LightGBM classifier with library defaults, verbosity set to 0.
lightgbm_model = lgb.LGBMClassifier(verbose=0)
lightgbm_model.fit(X_train, y_train)

# Score on the hold-out split.
y_pred = lightgbm_model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

<h2>10) CatBoost:</h2>

In [None]:
from catboost import CatBoostClassifier

# CatBoost classifier with library defaults and training output switched off.
catboost_model = CatBoostClassifier(verbose=False)
catboost_model.fit(X_train, y_train)

# Score on the hold-out split.
y_pred = catboost_model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

In [None]:
from sklearn.metrics import accuracy_score

# Every fitted baseline, keyed by the label shown in the comparison table.
models = {
    'Logistic Regression': logistic_regression_model,
    'Support Vector Machine': svm_model,
    'Naive Bayes': naive_bayes_model,
    'Decision Trees': decision_tree_model,
    'AdaBoost': adaboost_model,
    'Gradient Boost': gradientboost_model,
    'Random Forest': randomforest_model,
    'XGBoost': xgboost_model,
    'LightGBM': lightgbm_model,
    'CatBoost': catboost_model
}

# Hold-out accuracy of each model.
accuracies = {
    name: accuracy_score(y_test, model.predict(X_test))
    for name, model in models.items()
}

# One row per model, best accuracy first.
accuracy_df = (
    pd.DataFrame(list(accuracies.items()), columns=['Model', 'Accuracy'])
    .sort_values(by='Accuracy', ascending=False)
)

accuracy_df

In [None]:
# XGBoost tuning
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn.metrics import accuracy_score

# Fresh, unfitted estimator for the search.
xgboost_model = xgb.XGBClassifier()

# Grid explored by the cross-validated search below.
parameters3 = {
    "n_estimators": [50, 100, 150],
    "random_state": [0, 42, 50],
    "learning_rate": [0.1, 0.3, 0.5, 1.0]
}

# Hand-set configuration used for the final tuned model.
# NOTE(review): these values do not come from the grid search in this cell —
# presumably from a separate search; confirm their provenance.
params_XGB_best ={'lambda': 3.0610042624477543,
             'alpha': 4.581902571574289,
             'colsample_bytree': 0.9241969052729379,
             'subsample': 0.9527591724824661,
             'learning_rate': 0.06672065863100594,
             'n_estimators': 725, #initial value is 651
             'max_depth': 5,
             'min_child_weight': 1,
             'num_parallel_tree': 1}

# Search on the training split only: the hold-out rows must stay unseen during
# tuning, otherwise the hold-out accuracy computed below is optimistic.
grid_search3 = GridSearchCV(xgboost_model, parameters3, cv=5, n_jobs=-1)
grid_search3.fit(X_train, y_train)

# Report the search outcome so it can be compared with the hand-set configuration.
best_score = grid_search3.best_score_
best_parameters = grid_search3.best_params_
print("Grid-search best CV score:", best_score)
print("Grid-search best parameters:", best_parameters)

# Train the final model with the hand-set configuration.
xgboost_model_tuned = xgb.XGBClassifier(**params_XGB_best)
xgboost_model_tuned.fit(X_train, y_train)

# Score on the hold-out split.
y_pred3 = xgboost_model_tuned.predict(X_test)
accuracy = accuracy_score(y_test, y_pred3)

In [None]:
# Display the most recently computed accuracy value.
accuracy

In [None]:

from sklearn.model_selection import GridSearchCV

# Grid for the LightGBM search: tree size, step size and number of trees.
param_grid = {
    'num_leaves': [20, 30, 40],
    'learning_rate': [0.05, 0.1, 0.2],
    'n_estimators': [50, 100, 200]
}

# 3-fold cross-validated search on the training split, scored by accuracy.
grid_search = GridSearchCV(
    estimator=lightgbm_model,
    param_grid=param_grid,
    cv=3,
    scoring='accuracy',
)
grid_search.fit(X_train, y_train)

# best_estimator_ is the winning configuration refitted on the whole training split.
lightgbm_model_tuned = grid_search.best_estimator_
best_params = grid_search.best_params_

test_score = lightgbm_model_tuned.score(X_test, y_test)
print("Best hyperparameters:", best_params)
print("Test set accuracy:", test_score)




In [None]:
# CatBoost tuning
parameters2 = {"learning_rate":[0.1,0.3,0.5,0.6,0.7],
              "random_state":[0,42,48,50],
               "depth":[8,9,10],
               "iterations":[35,40,50]}

# Search the CatBoost grid defined above (`parameters2`), not the XGBoost grid.
# The result names are the same ones the XGBoost tuning cell assigns.
grid_search3 = GridSearchCV(catboost_model, parameters2, cv=5, n_jobs=-1)
grid_search3.fit(X_train, y_train)

# Best mean cross-validated score and the parameters that achieved it.
best_score = grid_search3.best_score_
best_parameters = grid_search3.best_params_

# Create a new CatBoost model with the best parameters.
catboost_model_tuned = CatBoostClassifier(verbose=False, **best_parameters)
catboost_model_tuned.fit(X_train, y_train)

# Score on the hold-out split.
y_predict = catboost_model_tuned.predict(X_test)
accuracy = accuracy_score(y_test, y_predict)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

# Grid for the gradient-boosting search: step size, number of trees and tree depth.
param_grid = {
    'learning_rate': [0.05, 0.1, 0.2],
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 4, 5]
}

# 3-fold cross-validated search on the training split, scored by accuracy.
grid_search = GridSearchCV(
    estimator=gradientboost_model,
    param_grid=param_grid,
    cv=3,
    scoring='accuracy',
)
grid_search.fit(X_train, y_train)

# best_estimator_ is the winning configuration refitted on the whole training split.
gradientboost_model_tuned = grid_search.best_estimator_
best_params = grid_search.best_params_

# Hold-out accuracy of the tuned model.
test_score = gradientboost_model_tuned.score(X_test, y_test)

print("Best hyperparameters:", best_params)
print("Test set accuracy:", test_score)


In [None]:
from sklearn.ensemble import StackingClassifier

# Base learners of the stack (Random Forest is not included).
base_learners = [
    ('LightGBM', lightgbm_model_tuned),
    ('CatBoost', catboost_model_tuned),
    ("XGBoost", xgboost_model_tuned),
    ('Gradient Boost', gradientboost_model_tuned),
    ('AdaBoost', adaboost_model),
]

# Stack the base learners under scikit-learn's default final estimator.
stacking_model = StackingClassifier(estimators=base_learners)
stacking_model.fit(X_train, y_train)

# Score on the hold-out split.
y_pred = stacking_model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print("Accuracy:", accuracy)


<h1>4. Model Tuning:</h1>

* Using techniques like grid search or random search to find the optimal hyperparameters for the selected models.

In [None]:
# Most recently computed accuracy, shown as a percentage.
accuracy*100

<h1>5. Model Evaluation:</h1>

* Evaluating the tuned models on the validation set using appropriate metrics (e.g., accuracy, precision, recall, F1-score).
* Choosing the best-performing model based on the evaluation metrics.

In [None]:
from sklearn.model_selection import cross_val_predict

# Out-of-fold predictions over the training split: each row is predicted by a
# fresh copy of the stack fitted on the other four folds. Cross-validating on
# the hold-out split instead would fit every copy on only a fraction of the
# 20% hold-out and give a noisy, pessimistic estimate.
cross_val_preds = cross_val_predict(stacking_model, X_train, y_train, cv=5)

# Accuracy of the out-of-fold predictions against the training labels.
accuracy = accuracy_score(y_train, cross_val_preds)

print("Accuracy:", accuracy)

<h1>6. Predictions:</h1>

* Making predictions using the selected model on the test data.


In [None]:
# Predict the competition test set with the tuned XGBoost model.
test_pred = xgboost_model_tuned.predict(test_df)

* Preparing the submission file in the specified format (PassengerId,Transported) with predictions for the test set.

In [None]:
# Sample submission (to get right format)
sub = pd.read_csv('/kaggle/input/spaceship-titanic/sample_submission.csv')

# Store the predictions directly as booleans. Converting only this column avoids
# a frame-wide replace of 0/1, which would also rewrite any other column that
# happened to contain those values.
sub['Transported'] = np.asarray(test_pred).astype(bool)

# Prediction distribution
plt.figure(figsize=(6, 6))
pie_ax = sub['Transported'].value_counts().plot.pie(
    explode=[0.1, 0.1], autopct='%1.1f%%', shadow=True, textprops={'fontsize': 16}
)
pie_ax.set_title("Prediction distribution")

In [None]:
# Write the submission file; index=False keeps the row index out of the CSV.
sub.to_csv('submission.csv', index=False)