In [None]:
! pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip

**Read Data**

In [163]:
import pandas as pd
from pandas_profiling import ProfileReport

In [None]:
# Load the training split into `df` and print its structural summary.
df = pd.read_csv(filepath_or_buffer='/content/train.csv')
df.info()

In [166]:

# Configure the profiling report for the training frame `df`; it is rendered
# in a separate cell.
profile = ProfileReport(
    df,
    title="Pandas Profiling Report",
    explorative=True,
    plot=dict(dpi=200, image_format="png"),
)

In [None]:
# Dig into the data: display the profiling report inside the notebook
profile.to_notebook_iframe()

**Data Preprocessing**

In [167]:
def _split_cabin(cabin):
  """Split a cabin value into its three '/'-separated parts.

  Assumes cabins look like 'deck/num/side' -- TODO confirm against the data.
  Anything that does not split into exactly three parts (including a missing
  value) yields three 'Missing' placeholders, so the first and third entries
  are always safe to read.
  """
  parts = str(cabin).split('/')
  if len(parts) != 3:
    return ['Missing', 'Missing', 'Missing']
  return parts


def preprocessing(df):
  """Clean a passenger frame in place so it is ready for one-hot encoding.

  The frame passed in is modified directly and nothing is returned.

  Steps:
    * fill missing HomePlanet / CryoSleep / Destination / VIP with 'Missing'
    * replace Cabin with two new columns, Deck (first part) and Side (third part)
    * fill missing Age with the mean Age of this frame
    * fill missing spending amounts with 0
    * drop Name (high cardinality)
    * drop any row that still contains a missing value

  Columns are assigned back (`df[col] = df[col].fillna(...)`) rather than
  filled with `inplace=True` on a column selection: the latter acts on a
  temporary object and is not guaranteed to update `df`.
  """
  # Categorical columns: keep "unknown" as its own category
  for column in ('HomePlanet', 'CryoSleep', 'Destination', 'VIP'):
    df[column] = df[column].fillna('Missing')

  # Cabin preprocessing - extract Deck and Side, then discard the raw column
  cabin_parts = [_split_cabin(cabin) for cabin in df['Cabin']]
  df['Deck'] = [parts[0] for parts in cabin_parts]
  df['Side'] = [parts[2] for parts in cabin_parts]

  # Age: impute with the mean of the frame being processed
  df['Age'] = df['Age'].fillna(df['Age'].mean())

  # Monetary spending columns: a missing amount is treated as no spending
  for column in ('RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'):
    df[column] = df[column].fillna(0)

  # Drop the raw Cabin column, and Name due to high cardinality
  df.drop(columns=['Cabin', 'Name'], inplace=True)

  # Drop any rows that still contain missing values
  df.dropna(inplace=True)




In [None]:
# Work on a copy so the raw training frame `df` stays untouched
abt = df.copy()
# `preprocessing` modifies the frame it is given and returns nothing
preprocessing(abt)
abt.info()

In [169]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from matplotlib import pyplot as plt
import seaborn as sns

In [170]:
# Target: the `Transported` column. Features: every other column except the
# passenger ID, passed through `pd.get_dummies`.
y = abt['Transported']
X = pd.get_dummies(abt.drop(columns=['Transported', 'PassengerId']))


In [171]:
# Hold out 30% of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1234,
)

In [None]:
# Class balance of the target, counted on the raw training frame `df`
sns.countplot(x = 'Transported', data = df)

In [None]:
# Preview the first rows of the feature matrix
X.head()

**Setup ML Pipeline**

In [173]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

In [174]:
# One pipeline per candidate algorithm: a StandardScaler followed by the
# classifier, each classifier created with the same fixed random seed.
pipelines = {
    name: make_pipeline(StandardScaler(), estimator_cls(random_state=1234))
    for name, estimator_cls in (
        ('rf', RandomForestClassifier),
        ('gb', GradientBoostingClassifier),
    )
}

In [175]:
# Hyperparameter search space per algorithm, using the same 'rf' / 'gb' keys
# as `pipelines`. Each entry lists the `n_estimators` values to try.
grid = dict(
    rf={'randomforestclassifier__n_estimators': [100, 200, 300]},
    gb={'gradientboostingclassifier__n_estimators': [100, 200, 300]},
)

**Train ML Models**

In [176]:
# Run a 10-fold cross-validated grid search for every pipeline and store the
# fitted search object under the algorithm's key.
fit_models = {}
for algo, pipeline in pipelines.items():
  search = GridSearchCV(estimator=pipeline, param_grid=grid[algo], n_jobs=1, cv=10)
  # `fit` returns the fitted search object itself
  fit_models[algo] = search.fit(X_train, y_train)

In [178]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [None]:
# Score every fitted search on the held-out validation split.
for alg, model in fit_models.items():
  yhat = model.predict(X_val)
  accuracy = accuracy_score(y_val, yhat)
  precision = precision_score(y_val, yhat)
  recall = recall_score(y_val, yhat)

  # Label the line with this loop's own key (`alg`) so each model's metrics
  # are reported under the correct name.
  print(f"Metrics for {alg} : accuracy - {accuracy}, precision - {precision}, recall - {recall}")

**Test Model**

In [180]:
import pickle

In [181]:
# Persist the fitted gradient-boosting grid search to disk.
with open('gradientboostingmodel.pk1', 'wb') as model_file:
  pickle.dump(fit_models['gb'], model_file)



In [182]:
# Read the pickled gradient-boosting search back into `loaded_model`.
# Only unpickle files you created yourself: loading a pickle can run arbitrary code.
with open('gradientboostingmodel.pk1', 'rb') as model_file:
  loaded_model = pickle.load(model_file)


In [183]:
# Persist the fitted random-forest grid search to disk.
with open('randomforestclassifier.pk1', 'wb') as model_file:
  pickle.dump(fit_models['rf'], model_file)


In [None]:
# Load the test split and run it through the same cleaning and encoding as
# the training data.
test_df = pd.read_csv('/content/test.csv')
abt_test = test_df.copy()
preprocessing(abt_test)
abt_test = pd.get_dummies(abt_test.drop('PassengerId', axis = 1))
# Encoding the test split on its own can produce a different set or order of
# dummy columns than the training matrix `X`. Align to the training columns:
# categories absent from the test split become all-zero columns, and columns
# the model never saw are dropped.
abt_test = abt_test.reindex(columns=X.columns, fill_value=0)
abt_test.head()

In [185]:
# Predict the test split with the fitted gradient-boosting grid search
yhat_test = fit_models['gb'].predict(abt_test)

In [186]:
# Every test passenger needs exactly one prediction; fail loudly instead of
# writing a misaligned submission.
assert len(yhat_test) == len(test_df), "prediction count does not match the number of test passengers"
# Build the frame column-wise so each column keeps its own dtype (stacking the
# two as rows and transposing would coerce both columns to object).
submission = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Transported': yhat_test,
})

In [188]:
# Write the submission file without the index column.
# NOTE(review): the file name is spelled 'submition' -- confirm this is intended.
submission.to_csv('kaggle_submition.csv', index = False)