In [1]:
# Mount Google Drive at /content/gdrive so that files stored on the Drive
# (the kaggle.json token copied in the next cell) are reachable from this runtime
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [2]:
# Create a kaggle folder
!mkdir ~/.kaggle
# Copy json file into the kaggle folder
!cp /content/gdrive/MyDrive/YouTube/kaggle.json ~/.kaggle/
# Give full read & write permission only to the owner
!chmod 600 ~/.kaggle/kaggle.json
# Download the titanic dataset
!kaggle competitions download -c titanic
# Unzip the downloaded dataset
!unzip titanic
# Delete the zip file
!rm -rf titanic

# Unmount the gdrive
drive.flush_and_unmount()

Downloading titanic.zip to /content
  0% 0.00/34.1k [00:00<?, ?B/s]
100% 34.1k/34.1k [00:00<00:00, 19.8MB/s]
Archive:  titanic.zip
  inflating: gender_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


In [3]:
# Download lazypredict package
!pip install -q lazypredict
# Import the modules
from sklearn.model_selection import train_test_split
from lazypredict.Supervised import LazyClassifier
from pandas import read_csv

In [4]:
# Read the training set and discard the PassengerId, Name, Ticket and Cabin columns
train = read_csv("/content/train.csv").drop(
    columns=["PassengerId", "Name", "Ticket", "Cabin"]
)
# Features: every remaining column except the target
X_train = train.drop(columns="Survived")
# Target: the "Survived" column
y_train = train["Survived"]
# Trace: shapes of the feature frame and the target series
X_train.shape, y_train.shape

((891, 7), (891,))

In [5]:
# Train / validation splitting (30% of the rows are held out for validation).
# The split is taken straight from `train` instead of from X_train / y_train:
# those two names are overwritten by this very statement, so re-running the cell
# would otherwise split the already-reduced training subset a second time.
X_train, X_val, y_train, y_val = train_test_split(
    train.drop(columns="Survived"),
    train["Survived"],
    test_size=.3,
    random_state=43,
)

In [6]:
# Build the LazyClassifier; with ignore_warnings=False, models that fail to
# execute are reported in the cell output
reg = LazyClassifier(
    verbose=0,
    ignore_warnings=False,
    custom_metric=None,
    random_state=43,
)
# Fit all candidate models on the training split and evaluate them on the
# validation split. `predictions` is displayed in the next cell as a
# per-model metrics table.
models, predictions = reg.fit(X_train, X_val, y_train, y_val)

 24%|██▍       | 7/29 [00:00<00:02, 10.12it/s]

CategoricalNB model failed to execute
Negative values in data passed to CategoricalNB (input X)


 86%|████████▌ | 25/29 [00:02<00:00,  8.34it/s]

StackingClassifier model failed to execute
__init__() missing 1 required positional argument: 'estimators'


100%|██████████| 29/29 [00:03<00:00,  8.46it/s]


In [7]:
# Display the per-model metrics table returned by the fit call
predictions

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LGBMClassifier,0.81,0.79,0.79,0.81,0.08
XGBClassifier,0.79,0.77,0.77,0.79,0.32
SVC,0.8,0.77,0.77,0.79,0.07
AdaBoostClassifier,0.78,0.76,0.76,0.78,0.23
LabelPropagation,0.78,0.76,0.76,0.78,0.1
LabelSpreading,0.78,0.76,0.76,0.78,0.13
NuSVC,0.79,0.76,0.76,0.78,0.07
RandomForestClassifier,0.78,0.76,0.76,0.78,0.58
KNeighborsClassifier,0.78,0.76,0.76,0.78,0.06
ExtraTreesClassifier,0.78,0.76,0.76,0.78,0.48


In [8]:
# Retrieve the pipelines built by the LazyClassifier, keyed by model name
fitted_pipelines = reg.provide_models(X_train, X_val, y_train, y_val)
# Keep the pipeline of the best model (LGBMClassifier tops the metrics table)
pipe = fitted_pipelines["LGBMClassifier"]

In [9]:
# Inspect the pre-processing step of the selected pipeline
pipe.named_steps["preprocessor"]

ColumnTransformer(transformers=[('numeric',
                                 Pipeline(steps=[('imputer', SimpleImputer()),
                                                 ('scaler', StandardScaler())]),
                                 Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object')),
                                ('categorical_low',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(fill_value='missing',
                                                                strategy='constant')),
                                                 ('encoding',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False))]),
                                 Index(['Sex', 'Embarked'], dtype='object')),
                                ('categorical_high',
                                 Pipeline(st

In [10]:
# Inspect the hyper-parameters of the classifier step of the selected pipeline
pipe.named_steps["classifier"].get_params()

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': -1,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'num_leaves': 31,
 'objective': None,
 'random_state': 43,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'silent': True,
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 0}

In [11]:
# Load the test data; no column is dropped here, so PassengerId stays available
# for the submission file written further down
test = read_csv("/content/test.csv")

In [12]:
# Inspect the column names of the test set
test.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [13]:
# Predict by applying the pipeline to exactly the feature columns it was trained
# on. Passing the whole frame would also hand over PassengerId, Name, Ticket and
# Cabin and, when this cell is re-run, the "Survived" column written below.
test["Survived"] = pipe.predict(test[X_train.columns])
# Only take "PassengerId", and "Survived" columns for the submission file
test[["PassengerId", "Survived"]].to_csv("/content/lazy-submission.csv",
                                         index=False)

In [14]:
# Trace: five reproducibly sampled rows of the submission columns
test[["PassengerId", "Survived"]].sample(5, random_state=43)

Unnamed: 0,PassengerId,Survived
345,1237,1
207,1099,0
165,1057,0
20,912,0
396,1288,0


In [15]:
# !kaggle competitions submit -c titanic -f /content/lazy-submission.csv -m wai-lazy-submission