In [1]:
import numpy as np 
import pandas as pd 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, Normalizer, PowerTransformer, QuantileTransformer

In [2]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [3]:
data = pd.read_csv("../input/titanic/train.csv")

In [4]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [6]:
data.dropna(axis=0, inplace=True)

In [7]:
data['Cabin'].value_counts()

Cabin
G6             4
B96 B98        4
C23 C25 C27    4
F33            3
D              3
              ..
C91            1
C124           1
C32            1
E34            1
C148           1
Name: count, Length: 133, dtype: int64

In [8]:
data['Embarked'].value_counts()

Embarked
S    116
C     65
Q      2
Name: count, dtype: int64

In [9]:
data['Ticket'].value_counts()

Ticket
113760      4
19950       4
PC 17582    3
35273       3
24160       3
           ..
113784      1
113043      1
28551       1
36928       1
111369      1
Name: count, Length: 127, dtype: int64

In [10]:
label_encoder = LabelEncoder()

data['Sex'] = label_encoder.fit_transform(data['Sex'])
data['Cabin'] = label_encoder.fit_transform(data['Cabin'])
data['Embarked'] = label_encoder.fit_transform(data['Embarked'])
data['Ticket'] = label_encoder.fit_transform(data['Ticket'])

In [11]:
data.drop('Name', axis=1, inplace=True)
#data.drop('Ticket', axis=1, inplace=True)

In [12]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,0,38.0,1,0,109,71.2833,72,0
3,4,1,1,0,35.0,1,0,31,53.1,48,2
6,7,0,1,1,54.0,0,0,55,51.8625,117,2
10,11,1,3,0,4.0,1,1,120,16.7,131,2
11,12,1,1,0,58.0,0,0,26,26.55,43,2


In [13]:
features = data.drop(['Survived','PassengerId'], axis=1) 
target = data['Survived']

features.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,1,0,38.0,1,0,109,71.2833,72,0
3,1,0,35.0,1,0,31,53.1,48,2
6,1,1,54.0,0,0,55,51.8625,117,2
10,3,0,4.0,1,1,120,16.7,131,2
11,1,0,58.0,0,0,26,26.55,43,2


In [14]:


rf = RandomForestClassifier()
rf.fit(features,target)

feature_importances = rf.feature_importances_
feature_importance_df = pd.DataFrame({
    'Feature': [f'Feature_{i}' for i in range(features.shape[1])],  # Adjust feature names as needed
    'Importance': feature_importances
})

feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
top_5_features = feature_importance_df.head(5)

print(top_5_features)

     Feature  Importance
1  Feature_1    0.242123
2  Feature_2    0.194346
7  Feature_7    0.173866
6  Feature_6    0.161798
5  Feature_5    0.145452


In [15]:
#error

In [16]:
#x = data[['Fare','Sex','Age','Cabin']]
x = data[['Fare','Sex','Age','Cabin','Ticket']]
y = data['Survived']
x.head()

Unnamed: 0,Fare,Sex,Age,Cabin,Ticket
1,71.2833,0,38.0,72,109
3,53.1,0,35.0,48,31
6,51.8625,1,54.0,117,55
10,16.7,0,4.0,131,120
11,26.55,0,58.0,43,26


In [17]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

rf_classifier = RandomForestClassifier(criterion = 'gini',n_estimators=160,random_state=0)
rf_classifier.fit(X_train_scaled, y_train)

y_pred = rf_classifier.predict(X_test_scaled)

In [18]:
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)
print("Confusion Matrix:\n", confusion)

Accuracy: 0.8648648648648649
Classification Report:
               precision    recall  f1-score   support

           0       0.60      0.86      0.71         7
           1       0.96      0.87      0.91        30

    accuracy                           0.86        37
   macro avg       0.78      0.86      0.81        37
weighted avg       0.89      0.86      0.87        37

Confusion Matrix:
 [[ 6  1]
 [ 4 26]]


In [19]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Logistic Regression
log_reg = LogisticRegression(random_state=0)
log_reg.fit(X_train_scaled, y_train)
y_pred_log_reg = log_reg.predict(X_test_scaled)

print("\nLogistic Regression")
print(f"Accuracy: {accuracy_score(y_test, y_pred_log_reg)}")
print(classification_report(y_test, y_pred_log_reg))
print(confusion_matrix(y_test, y_pred_log_reg))

# Gradient Boosting Classifier
gb_classifier = GradientBoostingClassifier(random_state=0)
gb_classifier.fit(X_train_scaled, y_train)
y_pred_gb = gb_classifier.predict(X_test_scaled)

print("\nGradient Boosting Classifier")
print(f"Accuracy: {accuracy_score(y_test, y_pred_gb)}")
print(classification_report(y_test, y_pred_gb))
print(confusion_matrix(y_test, y_pred_gb))


Logistic Regression
Accuracy: 0.8108108108108109
              precision    recall  f1-score   support

           0       0.50      0.57      0.53         7
           1       0.90      0.87      0.88        30

    accuracy                           0.81        37
   macro avg       0.70      0.72      0.71        37
weighted avg       0.82      0.81      0.82        37

[[ 4  3]
 [ 4 26]]

Gradient Boosting Classifier
Accuracy: 0.8648648648648649
              precision    recall  f1-score   support

           0       0.62      0.71      0.67         7
           1       0.93      0.90      0.92        30

    accuracy                           0.86        37
   macro avg       0.78      0.81      0.79        37
weighted avg       0.87      0.86      0.87        37

[[ 5  2]
 [ 3 27]]
