Classification models for the Titanic dataset, built with scikit-learn estimators

Survived - target (the label to predict)
Pclass (класс пассажира) - numeric 3>2>1
Sex - nominal
Age - numeric
SibSp (братья/сестры) - numeric
Parch (родители/дети) - numeric
Fare (стоимость проезда) - numeric 
Cabin - nominal
Embarked - nominal 

In [15]:
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pandas as pd
# Random forest on the Titanic data: grid-search n_estimators / max_depth with
# 5-fold CV, take the best pipeline, score it against the reference labels and
# write a submission CSV.
params = {
    'clf__n_estimators': [75, 90, 100, 110, 125],
    'clf__max_depth': range(3, 13),
}

# Training data: the target column plus the eight feature columns.
columns = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']
df = pd.read_csv("../datasets/classification/titanic_train.csv", usecols=columns)
feature_cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']
test_cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']

# Test features and the reference labels live in two separate files.
test_df_x = pd.read_csv("../datasets/classification/titanic_test.csv", usecols=test_cols)
test_df_y = pd.read_csv("../datasets/classification/titanic_gender_submission.csv")
# NOTE(review): the file name suggests these labels may be a sample submission
# rather than ground truth -- confirm before trusting the test metrics below.
y_test = test_df_y.drop(columns=['PassengerId'])
x_test = test_df_x.copy()

# Split features into nominal (one-hot encoded) and numeric (scaled).
nom_col = ['Sex', 'Cabin', 'Embarked']
num_col = [col for col in feature_cols if col not in nom_col]
y_train = df['Survived']
x_train = df.drop(columns=['Survived'])

# Numeric columns: fill gaps with the median, then standardise.
num_imputer = SimpleImputer(strategy='median')
# Nominal columns: fill gaps with the literal 'missing' category.
cat_imputer = SimpleImputer(strategy='constant', fill_value='missing')
num_pipeline = Pipeline([
    ('imputer', num_imputer),
    ('scaler', StandardScaler()),
])
cat_pipeline = Pipeline([
    ('imputer', cat_imputer),
    # handle_unknown='ignore': categories unseen during fit must not raise
    # at predict time.
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])
preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_col),
    ('cat', cat_pipeline, nom_col),
])
pipeline = Pipeline([
    ('prep', preprocessor),
    # Fixed seed: without it the forest (and therefore the selected
    # hyper-parameters, metrics and submission) changes on every run.
    ('clf', RandomForestClassifier(random_state=42)),
])

grid = GridSearchCV(pipeline, params, cv=5, scoring='accuracy')
grid.fit(x_train, y_train)
# best_estimator_ is the winning pipeline refit on the whole training set
# (GridSearchCV's default refit=True).
model = grid.best_estimator_
y_pred = model.predict(x_test)

print(f"Лучшие параметры: {grid.best_params_}\nЛучшие показатели: {grid.best_score_}")
print(f"accuracy_score: {accuracy_score(y_test,y_pred)}")
print(f"classification_report: {classification_report(y_test,y_pred)}")
print(f"confusion_matrix: {confusion_matrix(y_test,y_pred)}")

# Pair each prediction with its PassengerId for the submission file.
submission = pd.DataFrame({
    'PassengerId': test_df_y['PassengerId'],
    'Survived': y_pred,
})
submission.to_csv('submissions/submission_rforest.csv', index=False)

Лучшие параметры: {'clf__max_depth': 11, 'clf__n_estimators': 90}
Лучшие показатели: 0.8294080723118448
accuracy_score: 0.9066985645933014
classification_report:               precision    recall  f1-score   support

           0       0.90      0.96      0.93       266
           1       0.92      0.81      0.86       152

    accuracy                           0.91       418
   macro avg       0.91      0.89      0.90       418
weighted avg       0.91      0.91      0.91       418

confusion_matrix: [[256  10]
 [ 29 123]]


In [None]:
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pandas as pd
# Gradient boosting on the Titanic data: grid-search n_estimators / max_depth /
# learning_rate with 5-fold CV, take the best pipeline, score it against the
# reference labels and write a submission CSV.
params = {
    'clf__n_estimators': [125, 150, 175, 200],
    'clf__max_depth': range(1, 10),
    'clf__learning_rate': [0.1, 0.05],
}

# Training data: the target column plus the eight feature columns.
columns = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']
df = pd.read_csv("../datasets/classification/titanic_train.csv", usecols=columns)
feature_cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']
test_cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']

# Test features and the reference labels live in two separate files.
test_df_x = pd.read_csv("../datasets/classification/titanic_test.csv", usecols=test_cols)
test_df_y = pd.read_csv("../datasets/classification/titanic_gender_submission.csv")
# NOTE(review): the file name suggests these labels may be a sample submission
# rather than ground truth -- confirm before trusting the test metrics below.
y_test = test_df_y.drop(columns=['PassengerId'])
x_test = test_df_x.copy()

# Split features into nominal (one-hot encoded) and numeric (scaled).
nom_col = ['Sex', 'Cabin', 'Embarked']
num_col = [col for col in feature_cols if col not in nom_col]
y_train = df['Survived']
x_train = df.drop(columns=['Survived'])

# Numeric columns: fill gaps with the median, then standardise.
num_imputer = SimpleImputer(strategy='median')
# Nominal columns: fill gaps with the literal 'missing' category.
cat_imputer = SimpleImputer(strategy='constant', fill_value='missing')
num_pipeline = Pipeline([
    ('imputer', num_imputer),
    ('scaler', StandardScaler()),
])
cat_pipeline = Pipeline([
    ('imputer', cat_imputer),
    # handle_unknown='ignore': categories unseen during fit must not raise
    # at predict time.
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])
preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_col),
    ('cat', cat_pipeline, nom_col),
])
pipeline = Pipeline([
    ('prep', preprocessor),
    # Fixed seed so that repeated runs select the same hyper-parameters and
    # produce the same metrics and submission.
    ('clf', GradientBoostingClassifier(random_state=42)),
])

grid = GridSearchCV(pipeline, params, cv=5, scoring='accuracy')
grid.fit(x_train, y_train)
# best_estimator_ is the winning pipeline refit on the whole training set
# (GridSearchCV's default refit=True).
model = grid.best_estimator_
y_pred = model.predict(x_test)

print(f"Лучшие параметры: {grid.best_params_}\nЛучшие показатели: {grid.best_score_}")
print(f"accuracy_score: {accuracy_score(y_test,y_pred)}")
print(f"classification_report: {classification_report(y_test,y_pred)}")
print(f"confusion_matrix: {confusion_matrix(y_test,y_pred)}")

# Pair each prediction with its PassengerId for the submission file.
submission = pd.DataFrame({
    'PassengerId': test_df_y['PassengerId'],
    'Survived': y_pred,
})
submission.to_csv('submissions/submission_gradboost.csv', index=False)

In [14]:
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pandas as pd
# Logistic regression on the Titanic data: fit through GridSearchCV (5-fold CV)
# with a single max_iter setting, score the resulting pipeline against the
# reference labels and write a submission CSV.
params = {
    'clf__max_iter': [10_000],
}

# Training data: the target column plus the eight feature columns.
columns = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']
df = pd.read_csv("../datasets/classification/titanic_train.csv", usecols=columns)
feature_cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']
test_cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']

# Test features and the reference labels live in two separate files.
test_df_x = pd.read_csv("../datasets/classification/titanic_test.csv", usecols=test_cols)
test_df_y = pd.read_csv("../datasets/classification/titanic_gender_submission.csv")
# NOTE(review): the file name suggests these labels may be a sample submission
# rather than ground truth -- confirm before trusting the test metrics below.
y_test = test_df_y.drop(columns=['PassengerId'])
x_test = test_df_x.copy()

# Split features into nominal (one-hot encoded) and numeric (scaled).
nom_col = ['Sex', 'Cabin', 'Embarked']
num_col = [col for col in feature_cols if col not in nom_col]
y_train = df['Survived']
x_train = df.drop(columns=['Survived'])

# Numeric columns: fill gaps with the median, then standardise.
num_imputer = SimpleImputer(strategy='median')
# Nominal columns: fill gaps with the literal 'missing' category.
cat_imputer = SimpleImputer(strategy='constant', fill_value='missing')
num_pipeline = Pipeline([
    ('imputer', num_imputer),
    ('scaler', StandardScaler()),
])
cat_pipeline = Pipeline([
    ('imputer', cat_imputer),
    # handle_unknown='ignore': categories unseen during fit must not raise
    # at predict time.
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])
preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_col),
    ('cat', cat_pipeline, nom_col),
])
pipeline = Pipeline([
    ('prep', preprocessor),
    ('clf', LogisticRegression()),
])

grid = GridSearchCV(pipeline, params, cv=5, scoring='accuracy')
grid.fit(x_train, y_train)
# best_estimator_ is the winning pipeline refit on the whole training set
# (GridSearchCV's default refit=True).
model = grid.best_estimator_
y_pred = model.predict(x_test)

print(f"Лучшие параметры: {grid.best_params_}\nЛучшие показатели: {grid.best_score_}")
print(f"accuracy_score: {accuracy_score(y_test,y_pred)}")
print(f"classification_report: {classification_report(y_test,y_pred)}")
print(f"confusion_matrix: {confusion_matrix(y_test,y_pred)}")

# Pair each prediction with its PassengerId for the submission file.
submission = pd.DataFrame({
    'PassengerId': test_df_y['PassengerId'],
    'Survived': y_pred,
})
# Own file name for this model: the previous 'submission_dtree.csv' collided
# with the decision-tree cell's output, so one submission overwrote the other.
submission.to_csv('submissions/submission_logreg.csv', index=False)

Лучшие параметры: {'clf__max_iter': 10000}
Лучшие показатели: 0.7979787835038604
accuracy_score: 0.9354066985645934
classification_report:               precision    recall  f1-score   support

           0       0.96      0.94      0.95       266
           1       0.90      0.93      0.91       152

    accuracy                           0.94       418
   macro avg       0.93      0.93      0.93       418
weighted avg       0.94      0.94      0.94       418

confusion_matrix: [[250  16]
 [ 11 141]]


In [None]:
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pandas as pd
# Decision tree on the Titanic data: grid-search max_depth / criterion with
# 5-fold CV, take the best pipeline, score it against the reference labels and
# write a submission CSV.
params = {
    'clf__max_depth': range(1, 20),
    'clf__criterion': ["gini", "entropy"],
}

# Training data: the target column plus the eight feature columns.
columns = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']
df = pd.read_csv("../datasets/classification/titanic_train.csv", usecols=columns)
feature_cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']
test_cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']

# Test features and the reference labels live in two separate files.
test_df_x = pd.read_csv("../datasets/classification/titanic_test.csv", usecols=test_cols)
test_df_y = pd.read_csv("../datasets/classification/titanic_gender_submission.csv")
# NOTE(review): the file name suggests these labels may be a sample submission
# rather than ground truth -- confirm before trusting the test metrics below.
y_test = test_df_y.drop(columns=['PassengerId'])
x_test = test_df_x.copy()

# Split features into nominal (one-hot encoded) and numeric (scaled).
nom_col = ['Sex', 'Cabin', 'Embarked']
num_col = [col for col in feature_cols if col not in nom_col]
y_train = df['Survived']
x_train = df.drop(columns=['Survived'])

# Numeric columns: fill gaps with the median, then standardise.
num_imputer = SimpleImputer(strategy='median')
# Nominal columns: fill gaps with the literal 'missing' category.
cat_imputer = SimpleImputer(strategy='constant', fill_value='missing')
num_pipeline = Pipeline([
    ('imputer', num_imputer),
    ('scaler', StandardScaler()),
])
cat_pipeline = Pipeline([
    ('imputer', cat_imputer),
    # handle_unknown='ignore': categories unseen during fit must not raise
    # at predict time.
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])
preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_col),
    ('cat', cat_pipeline, nom_col),
])
pipeline = Pipeline([
    ('prep', preprocessor),
    # Fixed seed: the tree builder permutes features when searching for
    # splits, so an unseeded tree can differ between runs.
    ('clf', DecisionTreeClassifier(random_state=42)),
])

grid = GridSearchCV(pipeline, params, cv=5, scoring='accuracy')
grid.fit(x_train, y_train)
# best_estimator_ is the winning pipeline refit on the whole training set
# (GridSearchCV's default refit=True).
model = grid.best_estimator_
y_pred = model.predict(x_test)

print(f"Лучшие параметры: {grid.best_params_}\nЛучшие показатели: {grid.best_score_}")
print(f"accuracy_score: {accuracy_score(y_test,y_pred)}")
print(f"classification_report: {classification_report(y_test,y_pred)}")
print(f"confusion_matrix: {confusion_matrix(y_test,y_pred)}")

# Pair each prediction with its PassengerId for the submission file.
submission = pd.DataFrame({
    'PassengerId': test_df_y['PassengerId'],
    'Survived': y_pred,
})
submission.to_csv('submissions/submission_dtree.csv', index=False)

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pandas as pd
# k-nearest-neighbours on the Titanic data: grid-search n_neighbors with
# 5-fold CV, take the best pipeline, score it against the reference labels and
# write a submission CSV.
params = {
    'clf__n_neighbors': range(3, 50),
}

# Training data: the target column plus the eight feature columns.
columns = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']
df = pd.read_csv("../datasets/classification/titanic_train.csv", usecols=columns)
feature_cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']
test_cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']

# Test features and the reference labels live in two separate files.
test_df_x = pd.read_csv("../datasets/classification/titanic_test.csv", usecols=test_cols)
test_df_y = pd.read_csv("../datasets/classification/titanic_gender_submission.csv")
# NOTE(review): the file name suggests these labels may be a sample submission
# rather than ground truth -- confirm before trusting the test metrics below.
y_test = test_df_y.drop(columns=['PassengerId'])
x_test = test_df_x.copy()

# Split features into nominal (one-hot encoded) and numeric (scaled).
nom_col = ['Sex', 'Cabin', 'Embarked']
num_col = [col for col in feature_cols if col not in nom_col]
y_train = df['Survived']
x_train = df.drop(columns=['Survived'])

# Numeric columns: fill gaps with the median, then standardise.
num_imputer = SimpleImputer(strategy='median')
# Nominal columns: fill gaps with the literal 'missing' category.
cat_imputer = SimpleImputer(strategy='constant', fill_value='missing')
num_pipeline = Pipeline([
    ('imputer', num_imputer),
    ('scaler', StandardScaler()),
])
cat_pipeline = Pipeline([
    ('imputer', cat_imputer),
    # handle_unknown='ignore': categories unseen during fit must not raise
    # at predict time.
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])
preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_col),
    ('cat', cat_pipeline, nom_col),
])
pipeline = Pipeline([
    ('prep', preprocessor),
    ('clf', KNeighborsClassifier()),
])

grid = GridSearchCV(pipeline, params, cv=5, scoring='accuracy')
grid.fit(x_train, y_train)
# best_estimator_ is the winning pipeline refit on the whole training set
# (GridSearchCV's default refit=True).
model = grid.best_estimator_
y_pred = model.predict(x_test)

print(f"Лучшие параметры: {grid.best_params_}\nЛучшие показатели: {grid.best_score_}")
print(f"accuracy_score: {accuracy_score(y_test,y_pred)}")
print(f"classification_report: {classification_report(y_test,y_pred)}")
print(f"confusion_matrix: {confusion_matrix(y_test,y_pred)}")

# Pair each prediction with its PassengerId for the submission file.
submission = pd.DataFrame({
    'PassengerId': test_df_y['PassengerId'],
    'Survived': y_pred,
})
submission.to_csv('submissions/submission_neighbours.csv', index=False)