In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found,
# in the same order os.walk yields them.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the training split of the Titanic data and preview its first rows.
train_csv_path = "/kaggle/input/titanic/train.csv"
train_data = pd.read_csv(train_csv_path)
train_data.head(n=5)

In [None]:
# Load the test split of the Titanic data and preview its first rows.
test_csv_path = "/kaggle/input/titanic/test.csv"
test_data = pd.read_csv(test_csv_path)
test_data.head(n=5)

In [None]:
# Count missing entries per training column, then report only the columns
# that actually contain at least one missing value.
missing_val_count_by_column = train_data.isna().sum()
has_missing = missing_val_count_by_column.gt(0)
print(missing_val_count_by_column.loc[has_missing])

In [None]:
# Summary statistics for the test set; include='all' makes describe() cover
# non-numeric columns as well as numeric ones.
test_data.describe(include='all')

In [None]:
# Target column the classifier is trained to predict.
y = train_data['Survived']

# Predictor columns; the same list is reused later to slice the test set.
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
X = train_data[features]

# Split predictors by dtype so each group gets its own preprocessing.
# select_dtypes is used instead of matching dtype names literally: a literal
# check against "object" / 'int64' / 'float64' silently omits any column stored
# with another dtype (e.g. int32, or a dedicated string dtype), and a column
# missing from both lists is dropped by the ColumnTransformer without warning.
numerical_cols = X.select_dtypes(include='number').columns.tolist()
categorical_cols = X.select_dtypes(exclude='number').columns.tolist()

# Kept as the final expression of the cell so the notebook actually renders the
# summary; placed mid-cell its result was computed and then discarded.
X.describe(include="all")


In [None]:
# #split data for training and validation
# from sklearn.model_selection import train_test_split
# X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
#                                                       random_state=0)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from xgboost import XGBClassifier

# Preprocessing for numerical data: fill gaps with each column's most frequent value.
# NOTE(review): mode imputation is applied to every numeric column; if any of
# them is continuous, strategy='median' may be what was intended. Left
# unchanged here so existing results do not shift.
numerical_transformer = SimpleImputer(strategy='most_frequent')

# Preprocessing for categorical data: fill gaps with the most frequent category,
# then map each category to an integer code.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # A category that was not present when the encoder was fitted (for example,
    # one missing from a cross-validation training fold or appearing only in the
    # test set) is encoded as -1 instead of raising an error at transform time.
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
])

# Bundle preprocessing for numerical and categorical data. Columns that appear
# in neither list are not passed through to the model.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ])

# Gradient-boosted tree classifier; random_state is fixed for reproducibility.
model = XGBClassifier(n_estimators=500, learning_rate=0.05, random_state=0)

# Full pipeline: preprocessing followed by the classifier, so the imputers and
# encoder are re-fitted on whatever data the pipeline is fitted on.
my_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model),
])





# # Preprocessing of validation data, get predictions
# preds = my_pipeline.predict(X_valid)


In [None]:
from sklearn.model_selection import cross_val_score

# sklearn scorers follow a "higher is better" convention, so MAE comes back
# negated; flip the sign to obtain ordinary (positive) MAE per fold.
# NOTE(review): presumably the target is a 0/1 label, in which case MAE on the
# predicted labels equals the misclassification rate — confirm if accuracy was
# the intended metric.
neg_mae_per_fold = cross_val_score(
    my_pipeline,
    X,
    y,
    cv=5,
    scoring='neg_mean_absolute_error',
)
scores = -neg_mae_per_fold

print("MAE scores:\n", scores)

In [None]:
# Report the mean of the per-fold scores, with the label on its own line.
average_score = scores.mean()
print("Average MAE score (across experiments):", average_score, sep="\n")

In [None]:



# #handle non numeric data
# from sklearn.preprocessing import OrdinalEncoder

# # Get list of categorical variables
# s = (X_train.dtypes == 'object')
# object_cols = list(s[s].index)

# print("Categorical variables:")
# print(object_cols)

# # Make copy to avoid changing original data 
# label_X_train = X_train.copy()
# label_X_valid = X_valid.copy()

# # Apply ordinal encoder to each column with categorical data
# ordinal_encoder = OrdinalEncoder()
# label_X_train[object_cols] = ordinal_encoder.fit_transform(X_train[object_cols])
# label_X_valid[object_cols] = ordinal_encoder.transform(X_valid[object_cols])

# print(label_X_train)

In [None]:
# #handle missing values
# from sklearn.impute import SimpleImputer

# # Imputation
# my_imputer = SimpleImputer()
# imputed_X_train = pd.DataFrame(my_imputer.fit_transform(label_X_train))
# imputed_X_valid = pd.DataFrame(my_imputer.transform(label_X_valid))

# # Imputation removed column names; put them back
# imputed_X_train.columns = label_X_train.columns
# imputed_X_valid.columns = label_X_train.columns

In [None]:
# #train Model
# from sklearn.ensemble import RandomForestClassifier

# my_model = RandomForestClassifier(random_state=1)
# my_model.fit(imputed_X_train,y_train)
# predictions = my_model.predict(imputed_X_valid)

In [None]:
# from sklearn.metrics import mean_absolute_error

# # Evaluate the model
# score = mean_absolute_error(y_valid, preds)
# print('MAE:', score)

In [None]:
# check MAE

# mean_absolute_error(predictions,y_valid)

In [None]:
# Select the same predictor columns from the test set; the actual preprocessing
# (imputation and encoding) is carried out inside my_pipeline at predict time.
X_test = test_data.loc[:, features]
# #handle non numeric data
# from sklearn.preprocessing import OrdinalEncoder

# # Get list of categorical variables
# s = (X_test .dtypes == 'object')
# object_cols = list(s[s].index)


# # Make copy to avoid changing original data 
# label_X_test  = X_test.copy()


# # Apply ordinal encoder to each column with categorical data
# ordinal_encoder = OrdinalEncoder()
# label_X_test[object_cols] = ordinal_encoder.fit_transform(X_test[object_cols])


In [None]:
# #handle missing values
# from sklearn.impute import SimpleImputer

# # Imputation
# my_imputer = SimpleImputer()
# imputed_X_test= pd.DataFrame(my_imputer.fit_transform(label_X_test))


# # Imputation removed column names; put them back
# imputed_X_test.columns = label_X_test.columns

In [None]:
# Fit the pipeline on all training rows, then predict labels for the test rows.
my_pipeline.fit(X, y)
final_predictions = my_pipeline.predict(X_test)

# Build the submission table: one row per test passenger with its predicted label.
submission_columns = {
    'PassengerId': test_data['PassengerId'],
    'Survived': final_predictions,
}
output = pd.DataFrame(submission_columns)

# Write the file without the DataFrame index and confirm on stdout.
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")