In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
df_train = pd.read_csv(r".\train.csv",header=0)

In [3]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
df_train = df_train.drop(["Name","Ticket","Cabin"], axis = 1)

In [5]:
df_train.groupby(["Embarked"])["PassengerId"].count()

Embarked
C    168
Q     77
S    644
Name: PassengerId, dtype: int64

In [6]:
df_train["Embarked"] = df_train["Embarked"].fillna("S")

In [7]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
imputer = imputer.fit(df_train[['Age']])
df_train['Age'] = imputer.transform(df_train[['Age']])

In [8]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.25,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.925,S
3,4,1,1,female,35.0,1,0,53.1,S
4,5,0,3,male,35.0,0,0,8.05,S


In [9]:
gender = {'male' : 1, 'female' : 0}
df_train['Sex'] = df_train['Sex'].str.lower().map(gender)
Embarked = {'C' : 0, 'Q' : 1, 'S' : 2}
df_train['Embarked'] = df_train['Embarked'].map(Embarked)

In [10]:
df_train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,0.647587,29.699118,0.523008,0.381594,32.204208,1.536476
std,257.353842,0.486592,0.836071,0.47799,13.002015,1.102743,0.806057,49.693429,0.791503
min,1.0,0.0,1.0,0.0,0.42,0.0,0.0,0.0,0.0
25%,223.5,0.0,2.0,0.0,22.0,0.0,0.0,7.9104,1.0
50%,446.0,0.0,3.0,1.0,29.699118,0.0,0.0,14.4542,2.0
75%,668.5,1.0,3.0,1.0,35.0,1.0,0.0,31.0,2.0
max,891.0,1.0,3.0,1.0,80.0,8.0,6.0,512.3292,2.0


In [11]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler = scaler.fit(df_train[['Fare']])
df_train['Fare'] = scaler.transform(df_train[['Fare']])

In [12]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Sex          891 non-null    int64  
 4   Age          891 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Fare         891 non-null    float64
 8   Embarked     891 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 62.8 KB


In [13]:
from sklearn.model_selection import train_test_split,GridSearchCV
X_train, X_test, y_train, y_test = train_test_split(df_train.drop(["Survived"],axis=1), df_train["Survived"], test_size=0.33, random_state=42)

In [14]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators = 500, max_depth = 4, max_features = 3, bootstrap = True, random_state = 18).fit(X_train, y_train)

In [15]:

from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
confusion_matrix(y_train,clf.predict(X_train))

array([[358,  16],
       [ 67, 155]], dtype=int64)

In [16]:
accuracy_score(y_train,clf.predict(X_train))

0.860738255033557

In [17]:
f1_score(y_train,clf.predict(X_train))

0.7888040712468193

In [18]:
grid = { 
    'n_estimators': [200,300,400,500],
    'max_features': ['sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy'],
    'random_state' : [18]
}

In [19]:
rf_cv = GridSearchCV(estimator=RandomForestClassifier(), param_grid=grid, cv= 5)
rf_cv.fit(X_train, y_train)
rf_cv.best_params_

KeyboardInterrupt: 

In [None]:
clf_2 =  RandomForestClassifier(**rf_cv.best_params_).fit(X_train, y_train)


In [None]:
confusion_matrix(y_train,clf_2.predict(X_train))

array([[364,  10],
       [ 60, 162]], dtype=int64)

In [None]:
accuracy_score(y_train,clf_2.predict(X_train))

0.8825503355704698

In [None]:
f1_score(y_train,clf_2.predict(X_train))

0.8223350253807107