In [2]:
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score

In [95]:
def preprocessing(train_data):
    X = train_data.drop(columns=['Survived'])
    y = train_data['Survived']
    
    X['Age'] = X['Age'].fillna(X['Age'].mean())
    
    X = X.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])
    
    imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    X.values[:] = imp.fit_transform(X)
    
    X = pd.get_dummies(X)
    
    scaler = MinMaxScaler()
    X = pd.DataFrame(scaler.fit_transform(X), columns = X.columns)
    
    return X, y

In [4]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

df = pd.concat([train_data, test_data], sort=True).reset_index(drop=True)
df['Age'] = df.groupby(['Sex', 'Pclass'])['Age'].apply(lambda x: x.fillna(x.median()))
df['Embarked'] = df['Embarked'].fillna('S')
med_fare = df.groupby(['Pclass', 'Parch', 'SibSp']).Fare.median()[3][0][0]
df['Fare'] = df['Fare'].fillna(med_fare)
df['Deck'] = df['Deck'].replace(['A', 'B', 'C'], 'ABC')
df['Deck'] = df['Deck'].replace(['D', 'E'], 'DE')
df['Deck'] = df['Deck'].replace(['F', 'G'], 'FG')

df['Fare'] = pd.qcut(df['Fare'], 13)
df['Age'] = pd.qcut(df['Age'], 10)
df['Ticket_Frequency'] = df.groupby('Ticket')['Ticket'].transform('count')
df['Title'] = df['Name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0]
df['Is_Married'] = 0
df['Is_Married'].loc[df['Title'] == 'Mrs'] = 1
# X, y = preprocessing(train_data)

# sm = SMOTE(random_state=0)
# X_train, y_train = sm.fit_resample(X_train, y_train)

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
0,22.0,,S,7.2500,"Braund, Mr. Owen Harris",0,1,3,male,1,0.0,A/5 21171
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1.0,PC 17599
2,26.0,,S,7.9250,"Heikkinen, Miss. Laina",0,3,3,female,0,1.0,STON/O2. 3101282
3,35.0,C123,S,53.1000,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1.0,113803
4,35.0,,S,8.0500,"Allen, Mr. William Henry",0,5,3,male,0,0.0,373450
...,...,...,...,...,...,...,...,...,...,...,...,...
1304,,,S,8.0500,"Spector, Mr. Woolf",0,1305,3,male,0,,A.5. 3236
1305,39.0,C105,C,108.9000,"Oliva y Ocana, Dona. Fermina",0,1306,1,female,0,,PC 17758
1306,38.5,,S,7.2500,"Saether, Mr. Simon Sivertsen",0,1307,3,male,0,,SOTON/O.Q. 3101262
1307,,,S,8.0500,"Ware, Mr. Frederick",0,1308,3,male,0,,359309


In [97]:
xgboost = GradientBoostingClassifier(n_estimators=50, learning_rate=1.0, max_depth=3, random_state=0)

In [98]:
cross_val_score(xgboost, X, y, cv=5)

array([0.78212291, 0.80337079, 0.84269663, 0.79213483, 0.81460674])

In [99]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(max_iter=4000, random_state=0).fit(X, y)

cross_val_score(logreg, X, y, cv=5)

array([0.7877095 , 0.80898876, 0.78651685, 0.76404494, 0.80898876])