In [41]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

In [42]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [43]:
train.info()
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass  

In [44]:
train.head()
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [45]:
train.corr(numeric_only=True).style.background_gradient(cmap='BuGn')

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
PassengerId,1.0,-0.005007,-0.035144,0.036847,-0.057527,-0.001652,0.012658
Survived,-0.005007,1.0,-0.338481,-0.077221,-0.035322,0.081629,0.257307
Pclass,-0.035144,-0.338481,1.0,-0.369226,0.083081,0.018443,-0.5495
Age,0.036847,-0.077221,-0.369226,1.0,-0.308247,-0.189119,0.096067
SibSp,-0.057527,-0.035322,0.083081,-0.308247,1.0,0.414838,0.159651
Parch,-0.001652,0.081629,0.018443,-0.189119,0.414838,1.0,0.216225
Fare,0.012658,0.257307,-0.5495,0.096067,0.159651,0.216225,1.0


In [46]:
train.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1, inplace=True)
test.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1, inplace=True)

In [47]:
train.isna().sum()
test.isna().sum()

Pclass       0
Sex          0
Age         86
SibSp        0
Parch        0
Fare         1
Embarked     0
dtype: int64

In [48]:
train['Embarked'] = train.Embarked.fillna(train.Embarked.dropna().max())
test['Fare'] = test.Fare.fillna(test.Fare.dropna().mean())

In [49]:
guess_ages = np.zeros((2,3))

In [50]:
combine = [train , test]

In [51]:
for ds in combine:
	ds['Sex'] = ds['Sex'].map( {'female': 1, 'male': 0} ).astype(int)

In [52]:
for ds in combine:
	for i in range(0, 2):
		for j in range(0, 3):
			guess_df = ds[(ds['Sex'] == i) & \
			(ds['Pclass'] == j+1)]['Age'].dropna()
			age_guess = guess_df.median()
			# Convert random age float to nearest .5 age
			guess_ages[i,j] = int( age_guess/0.5 + 0.5 ) * 0.5
	for i in range(0, 2):
		for j in range(0, 3):
			ds.loc[ (ds.Age.isnull()) & (ds.Sex == i) & (ds.Pclass == j+1), \
			'Age'] = guess_ages[i,j]

	ds['Age'] = ds['Age'].astype(int)

In [53]:
train.head()
train.describe()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,0.352413,29.072952,0.523008,0.381594,32.204208
std,0.486592,0.836071,0.47799,13.326339,1.102743,0.806057,49.693429
min,0.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,2.0,0.0,21.0,0.0,0.0,7.9104
50%,0.0,3.0,0.0,26.0,0.0,0.0,14.4542
75%,1.0,3.0,1.0,36.0,1.0,0.0,31.0
max,1.0,3.0,1.0,80.0,8.0,6.0,512.3292


In [54]:
X_train = pd.get_dummies(train.drop(['Survived'], axis=1))
X_test = pd.get_dummies(test)
y_train = train['Survived']

In [55]:
def print_scores(model, X_train, Y_train, predictions, cv_splites=10):
	print("The mean accuracy score of the train data is %.5f" % model.score(X_train, Y_train))
	CV_scores = cross_val_score(model, X_train, Y_train, cv=cv_splites)
	print("The individual cross-validation scores are: \n",CV_scores)
	print("The minimum cross-validation score is %.3f" % min(CV_scores))
	print("The maximum cross-validation score is %.3f" % max(CV_scores))
	print("The mean cross-validation score is %.5f Ã‚Â± %0.2f" % (CV_scores.mean(), CV_scores.std() * 2))

In [56]:
model = RandomForestClassifier(n_estimators= 80 ,max_depth=5 , max_features=8, min_samples_split=3 ,random_state=7)
model.fit(X_train, y_train)
predictions = model.predict(X_test)
print_scores(model, X_train, y_train, predictions)

The mean accuracy score of the train data is 0.85859
The individual cross-validation scores are: 
 [0.76666667 0.85393258 0.75280899 0.91011236 0.88764045 0.80898876
 0.80898876 0.78651685 0.87640449 0.84269663]
The minimum cross-validation score is 0.753
The maximum cross-validation score is 0.910
The mean cross-validation score is 0.82948 Ã‚Â± 0.10
