[dataset](https://docs.google.com/spreadsheets/d/1-GNlfbNK7rrtsXste4tzi1BIldM5AbI656sfym7upbE/edit?usp=sharing)

In [None]:

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Read the passenger CSV from its hard-coded location into a DataFrame and display it.
csv_path = "/content/train (1).csv"
df = pd.read_csv(csv_path)
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [None]:
# Summarize column dtypes and non-null counts to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [None]:
# Remove the columns that are not used as model features.
# NOTE: this mutates df in place, so re-running the cell raises KeyError once the columns are gone.
cols_to_drop = ['PassengerId', 'Name', 'Ticket', 'Cabin']
df.drop(columns=cols_to_drop, inplace=True)

In [None]:
# Display the frame after the column drop in the previous cell.
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.2500,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.9250,S
3,1,1,female,35.0,1,0,53.1000,S
4,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S
887,1,1,female,19.0,0,0,30.0000,S
888,0,3,female,,1,2,23.4500,S
889,1,1,male,26.0,0,0,30.0000,C


In [None]:
# Count the missing values in every remaining column.
df.isna().sum()

Unnamed: 0,0
Survived,0
Pclass,0
Sex,0
Age,177
SibSp,0
Parch,0
Fare,0
Embarked,2


In [None]:
# Fill missing values: Age gets the column mean, Embarked gets its most frequent value.
# NOTE(review): both imputers are fitted on the full frame, before any train/test split,
# so rows that later end up in the test set contribute to the fill values; consider
# fitting them on the training rows only.
si_age = SimpleImputer(strategy="mean")
si_embarked = SimpleImputer(strategy='most_frequent')  # mode

# fit_transform returns a 2-D (n_rows, 1) array; flatten it to 1-D before assigning it
# to a single column (done the same way for both columns).
df["Age"] = si_age.fit_transform(df[['Age']]).ravel()
df["Embarked"] = si_embarked.fit_transform(df[['Embarked']]).ravel()

# Confirm that no missing values remain.
df.isnull().sum()

Unnamed: 0,0
Survived,0
Pclass,0
Sex,0
Age,0
SibSp,0
Parch,0
Fare,0
Embarked,0


In [None]:
# List the distinct labels present in the target column.
df["Survived"].unique()

array([0, 1])

In [None]:
# Count rows per target label to see how balanced the classes are.
df["Survived"].value_counts()

Unnamed: 0_level_0,count
Survived,Unnamed: 1_level_1
0,549
1,342


In [None]:
# One-hot encode the two categorical columns. drop='first' omits the first category of
# each column, and sparse_output=False makes the encoder return a dense array.
cat_cols = ['Sex', 'Embarked']
enc = OneHotEncoder(sparse_output=False, drop='first')

# Learn the categories and encode in a single call.
encoded = enc.fit_transform(df[cat_cols])

# Wrap the encoded array in a DataFrame labelled with the generated feature names.
encoded_df = pd.DataFrame(encoded, columns=enc.get_feature_names_out(cat_cols))
encoded_df


Unnamed: 0,Sex_male,Embarked_Q,Embarked_S
0,1.0,0.0,1.0
1,0.0,0.0,0.0
2,0.0,0.0,1.0
3,0.0,0.0,1.0
4,1.0,0.0,1.0
...,...,...,...
886,1.0,0.0,1.0
887,0.0,0.0,1.0
888,0.0,0.0,1.0
889,1.0,0.0,0.0


In [None]:
# Append the one-hot columns to the right of the existing frame (rows are matched by index).
# NOTE: df is reassigned here, so re-running this cell appends the encoded columns again.
frames_to_join = [df, encoded_df]
df = pd.concat(frames_to_join, axis="columns")

df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Sex_male,Embarked_Q,Embarked_S
0,0,3,male,22.000000,1,0,7.2500,S,1.0,0.0,1.0
1,1,1,female,38.000000,1,0,71.2833,C,0.0,0.0,0.0
2,1,3,female,26.000000,0,0,7.9250,S,0.0,0.0,1.0
3,1,1,female,35.000000,1,0,53.1000,S,0.0,0.0,1.0
4,0,3,male,35.000000,0,0,8.0500,S,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.000000,0,0,13.0000,S,1.0,0.0,1.0
887,1,1,female,19.000000,0,0,30.0000,S,0.0,0.0,1.0
888,0,3,female,29.699118,1,2,23.4500,S,0.0,0.0,1.0
889,1,1,male,26.000000,0,0,30.0000,C,1.0,0.0,0.0


In [None]:
# The one-hot columns now stand in for the original string columns, so remove the originals.
df.drop(['Sex', 'Embarked'], axis=1, inplace=True)

# Show the resulting frame.
df

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,0,3,22.000000,1,0,7.2500,1.0,0.0,1.0
1,1,1,38.000000,1,0,71.2833,0.0,0.0,0.0
2,1,3,26.000000,0,0,7.9250,0.0,0.0,1.0
3,1,1,35.000000,1,0,53.1000,0.0,0.0,1.0
4,0,3,35.000000,0,0,8.0500,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...
886,0,2,27.000000,0,0,13.0000,1.0,0.0,1.0
887,1,1,19.000000,0,0,30.0000,0.0,0.0,1.0
888,0,3,29.699118,1,2,23.4500,0.0,0.0,1.0
889,1,1,26.000000,0,0,30.0000,1.0,0.0,0.0


In [None]:
# Re-display the encoded frame (same content as the previous cell's output).
df


Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,0,3,22.000000,1,0,7.2500,1.0,0.0,1.0
1,1,1,38.000000,1,0,71.2833,0.0,0.0,0.0
2,1,3,26.000000,0,0,7.9250,0.0,0.0,1.0
3,1,1,35.000000,1,0,53.1000,0.0,0.0,1.0
4,0,3,35.000000,0,0,8.0500,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...
886,0,2,27.000000,0,0,13.0000,1.0,0.0,1.0
887,1,1,19.000000,0,0,30.0000,0.0,0.0,1.0
888,0,3,29.699118,1,2,23.4500,0.0,0.0,1.0
889,1,1,26.000000,0,0,30.0000,1.0,0.0,0.0


In [None]:
# Class counts before resampling: the two labels are not equally represented.
df["Survived"].value_counts()

Unnamed: 0_level_0,count
Survived,Unnamed: 1_level_1
0,549
1,342


# How to Choose the Best Method for Balancing Classes
1. For minimal data loss with balanced classes:
SMOTE is a good first step as it only creates new data without removing any. It’s easy to use and works well in most scenarios.

2. For dealing with noisy datasets:
SMOTEENN is the best choice when you suspect that your majority class contains noise or misclassified samples. This method can clean up the dataset and improve model performance by removing ambiguous points.

3. For a complex distribution of minority class:
ADASYN would be ideal for datasets where the minority class is not evenly distributed and certain areas need more synthetic samples than others.

4. For overlapping classes:
SMOTETomek is effective when your classes overlap significantly and you need to clean the boundary without losing too much data.


Balancing Techniques using imblearn
#### 1. SMOTE (Synthetic Minority Oversampling Technique)

from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)

X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

#### 2. SMOTEENN (SMOTE + Edited Nearest Neighbors)

from imblearn.combine import SMOTEENN

smote_enn = SMOTEENN(random_state=42)

X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)

#### 3. ADASYN (Adaptive Synthetic Sampling)

from imblearn.over_sampling import ADASYN

adasyn = ADASYN(random_state=42)

X_resampled, y_resampled = adasyn.fit_resample(X_train, y_train)

#### 4. SMOTETomek (SMOTE + Tomek Links)

from imblearn.combine import SMOTETomek

smote_tomek = SMOTETomek(random_state=42)

X_resampled, y_resampled = smote_tomek.fit_resample(X_train, y_train)

In [None]:
from imblearn.over_sampling import SMOTE


In [None]:
# Split the encoded frame into the label vector and the feature matrix.
y = df['Survived']
X = df.drop(columns='Survived')

# Oversample the minority class with SMOTE (swap in another imblearn sampler here if desired).
# NOTE(review): resampling happens before the train/test split further down, so synthetic
# rows can end up in the test set; consider resampling only the training portion.
# NOTE(review): SMOTE interpolates the one-hot columns as well, which is why the resampled
# frame shows fractional values such as Sex_male = 0.554354.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Wrap the resampled features and target as DataFrames with the original labels.
X_resampled_df = pd.DataFrame(X_resampled, columns=X.columns)
y_resampled_df = pd.DataFrame(y_resampled, columns=['Survived'])

# Put features and target back side by side in one frame.
df_resampled = pd.concat([X_resampled_df, y_resampled_df], axis=1)

# Print the per-class row counts after resampling.
print(df_resampled['Survived'].value_counts())

Survived
0    549
1    549
Name: count, dtype: int64


In [None]:
# Display the resampled frame; the rows at the end are the ones SMOTE synthesized.
df_resampled

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S,Survived
0,3,22.000000,1,0,7.250000,1.000000,0.0000,1.000000,0
1,1,38.000000,1,0,71.283300,0.000000,0.0000,0.000000,1
2,3,26.000000,0,0,7.925000,0.000000,0.0000,1.000000,1
3,1,35.000000,1,0,53.100000,0.000000,0.0000,1.000000,1
4,3,35.000000,0,0,8.050000,1.000000,0.0000,1.000000,0
...,...,...,...,...,...,...,...,...,...
1093,1,52.336938,0,1,24.967957,0.554354,0.0000,1.000000,1
1094,1,25.835162,0,0,73.844596,0.611721,0.0000,0.000000,1
1095,3,31.614950,2,0,15.703140,0.000000,0.4196,0.580400,1
1096,2,27.752269,0,0,13.212628,0.000000,0.0000,0.752269,1


In [None]:
# Pull the feature matrix and label vector out of the resampled frame.
X = df_resampled.drop(columns='Survived')
y = df_resampled['Survived']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Report the shape of each piece of the split.
for label, part in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{label} shape: {part.shape}")

X_train shape: (878, 8)
X_test shape: (220, 8)
y_train shape: (878,)
y_test shape: (220,)


In [None]:
std = StandardScaler()

# Learn the per-feature mean and standard deviation from the training split only,
# and standardize the training features with them.
X_train_transformed = std.fit_transform(X_train)

# Standardize the test split with the statistics learned from the training split.
# Calling fit_transform here would refit the scaler on the test rows, so train and test
# would be scaled with different parameters and test-set statistics would leak into `std`.
X_test_transformed = std.transform(X_test)

In [None]:
# Fit a decision tree on the standardized training features.
# random_state is pinned (42, the seed already used for SMOTE and the split in this
# notebook) so that re-running the notebook rebuilds the same tree.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train_transformed, y_train)

In [None]:
# Predict labels for the standardized test features and display them.
y_pred = clf.predict(X_test_transformed)
y_pred

array([1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0,
       1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0,
       0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1])

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of test rows whose predicted label equals the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8045454545454546

In [None]:
import pickle

# Persist the fitted classifier to disk. The `with` block guarantees the file is flushed
# and closed even if pickling fails; the bare open() used before never closed the handle.
# NOTE(review): only `clf` is saved. The fitted encoder (`enc`) and scaler (`std`) are
# needed to prepare new rows the same way and are not persisted here.
with open('my foirtfqtg.pkl', 'wb') as file:
    pickle.dump(clf, file)

print("Classifier saved successfully.")



Classifier saved successfully.


In [None]:
# Build a one-row sample with the same raw columns the model's features were derived from.
unseen_data = dict(
    Pclass=[2],
    Sex=['male'],
    Age=[30],
    SibSp=[0],
    Parch=[0],
    Fare=[15.5],
    Embarked=['C'],
)

unseen_df = pd.DataFrame(unseen_data)
unseen_df

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,2,male,30,0,0,15.5,C


In [None]:


# Encode the categorical columns with the already-fitted encoder (transform only, no refit).
encoded_unseen = enc.transform(unseen_df[['Sex', 'Embarked']])

# Label the encoded values with the encoder's generated feature names.
encoded_unseen_df = pd.DataFrame(encoded_unseen, columns=enc.get_feature_names_out(['Sex', 'Embarked']))

# Attach the encoded columns, then remove the original string columns they replace.
unseen_df = pd.concat([unseen_df, encoded_unseen_df], axis=1)
unseen_df.drop(columns=['Sex', 'Embarked'], inplace=True)

# Print the prepared sample.
print("\nUnseen DataFrame:")
print(unseen_df)


Unseen DataFrame:
   Pclass  Age  SibSp  Parch  Fare  Sex_male  Embarked_Q  Embarked_S
0       2   30      0      0  15.5       1.0         0.0         0.0


In [None]:
# Scale the new row with the scaler fitted earlier in the notebook. Use transform, not
# fit_transform: refitting on a single row sets every feature's mean to the row itself,
# so every scaled value becomes 0 and the prediction no longer depends on the input.
scaled_unseen_df = pd.DataFrame(std.transform(unseen_df), columns=unseen_df.columns)


In [None]:
# Predict the label for the scaled one-row sample with the in-memory classifier.
clf.predict(scaled_unseen_df)




array([1])

# Or: predict with the model reloaded from disk

In [None]:
# Load the classifier back from the pickle file this notebook writes in its save cell.
# The earlier path ('/content/classifier.joblib') is never created by this notebook, so
# the load failed on a fresh run unless that file was left over from another session.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
model_path = 'my foirtfqtg.pkl'
with open(model_path, 'rb') as file:
    loaded_clf = pickle.load(file)

In [None]:
# Predict the label for the scaled one-row sample using the classifier loaded from disk.
predictions = loaded_clf.predict(scaled_unseen_df)

# Print the predicted label(s).
print("Predictions for unseen data:", predictions)

Predictions for unseen data: [1]


