<a href="https://colab.research.google.com/github/thedataninja1786/Data-Science/blob/main/Copy_of_Titanic_%7C_Top_1_w_Simple_Feature_Engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive (Colab only); the CSV files read later live under
# /content/drive.
from google.colab import drive 
drive.mount('/content/drive')
import pandas as pd 
import numpy as np 
import itertools
import random 
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including pandas deprecation / chained-assignment warnings that would point
# at real problems. Consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")
# Seed Python's `random` module so the random draws made during preprocessing
# are repeatable from a fresh kernel.
random.seed(0)

In [None]:
train_df = pd.read_csv('/content/drive/MyDrive/titanic/train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/titanic/test.csv')

# Join the train and test dataframes so the data preprocessing
# is done simultaneously on both datasets. The training rows come first, so
# the first len(train_df) rows of `full` are the labelled ones.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the supported
# equivalent and produces the same frame.
full = pd.concat([train_df, test_df], ignore_index=True)

In [None]:
def _cabin_class_cdf(observed_classes):
  """Build the empirical cumulative distribution of the observed cabin classes.

  Returns a pair ``(labels, cumulative)``: ``labels`` lists each distinct class
  in order of first appearance and ``cumulative[k]`` is the share of
  observations that belong to ``labels[0]`` .. ``labels[k]``. Both lists are
  empty when ``observed_classes`` is empty.
  """
  counts = {}
  for label in observed_classes:
    counts[label] = counts.get(label, 0) + 1
  total = sum(counts.values())
  labels = list(counts)
  cumulative = list(itertools.accumulate(counts[label] / total for label in labels))
  return labels, cumulative


def data_preprocessing(df):
  """Turn the raw passenger frame into a purely numeric feature frame.

  Steps, in order:
    * encode ``Sex`` as 0 (male) / 1 (female);
    * add ``alone`` (1 when ``SibSp + Parch == 0``, else 0), computed before
      missing values are filled;
    * fill missing ``SibSp``, ``Parch``, ``Fare`` and ``Age`` with the column median;
    * derive the cabin class (first character of ``Cabin``) and, for passengers
      without a cabin, draw one at random following the distribution of the
      observed classes (uses the global ``random`` state, one draw per
      missing row);
    * extract the title from ``Name`` (text between the first ',' and the
      following '.');
    * one-hot encode ``Embarked``, the title and the cabin class;
    * drop ``Name``, ``PassengerId``, ``Ticket``, ``Cabin``, ``Embarked`` and the
      intermediate ``title`` / ``cabin_class`` columns.

  Args:
    df: DataFrame with at least the columns ``Sex``, ``SibSp``, ``Parch``,
      ``Fare``, ``Age``, ``Cabin``, ``Name``, ``Embarked``, ``PassengerId`` and
      ``Ticket``. It is not modified.

  Returns:
    A new DataFrame with the engineered columns. Any other input column
    (e.g. ``Survived``) is passed through unchanged.

  Raises:
    KeyError: if a required column is missing.
    IndexError: if a name does not contain a ','.
  """
  import bisect  # only needed here; the file does not import it at top level

  # Work on a copy so the caller's frame is never mutated as a side effect.
  df = df.copy()

  # Values other than 'male' / 'female' become NaN.
  df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})

  # Identify if a passenger is alone on the ship. A missing SibSp / Parch makes
  # the sum NaN, which compares unequal to 0 and therefore yields 0.
  df['alone'] = ((df['SibSp'] + df['Parch']) == 0).astype(int)

  # Handle missing values. Assign the result back instead of calling
  # fillna(inplace=True) on a column selection, which may act on a temporary.
  for col in ['SibSp', 'Parch', 'Fare', 'Age']:
    df[col] = df[col].fillna(df[col].median())

  # Feature-engineer the cabin: keep only its first character and mark
  # passengers without a cabin with None.
  cabin_classes = [None if pd.isna(cabin) else str(cabin)[:1] for cabin in df['Cabin']]

  # Assign cabin classes at random to passengers that are missing the cabin
  # field, following the distribution of the classes that are known. A draw r
  # selects the first class whose cumulative probability exceeds r; the index
  # is clamped so floating-point round-off in the last cumulative value can
  # never run past the end. When no cabin is known at all there is nothing to
  # sample from, so the values stay missing and produce no dummy columns.
  labels, cumulative = _cabin_class_cdf(c for c in cabin_classes if c is not None)
  if labels:
    last = len(labels) - 1
    cabin_classes = [
        labels[min(bisect.bisect_right(cumulative, random.random()), last)]
        if c is None else c
        for c in cabin_classes
    ]
  df['cabin_class'] = cabin_classes

  # Get each person's title. The leading space after the comma is kept on
  # purpose so the generated column names (e.g. 'title_ Mr') stay the same.
  df['title'] = [name.split(',')[1].split('.')[0] for name in df['Name']]

  # One-hot encode the port of embarkation, the title and the cabin class.
  embarked_dummies = pd.get_dummies(df['Embarked'], prefix='Embarked')
  title_dummies = pd.get_dummies(df['title'], prefix='title')
  cabin_class_dummies = pd.get_dummies(df['cabin_class'], prefix='cabin_class')
  df = pd.concat([df, embarked_dummies, title_dummies, cabin_class_dummies], axis=1)

  # Remove unnecessary columns
  df = df.drop(columns=['Name', 'PassengerId', 'title', 'Embarked',
                        'Cabin', 'Ticket', 'cabin_class'])

  return df

In [None]:
# Preprocess the data and create the train / test sets
full = data_preprocessing(full)

# The combined frame holds the training rows first, followed by the test rows,
# so the split point is the number of training rows rather than a hard-coded
# count.
n_train = len(train_df)

# Separate the target before slicing so no column is deleted from a slice of
# `full` afterwards. The test rows have no 'Survived' value.
y_train = full['Survived'].iloc[:n_train]
features = full.drop(columns=['Survived'])
X_train = features.iloc[:n_train]
X_test = features.iloc[n_train:]

print(X_test.shape)
print(X_train.shape)


In [None]:
# Train two models; their test-set predictions are combined afterwards
from xgboost import XGBRegressor
from sklearn.linear_model import LogisticRegression

LR = LogisticRegression(max_iter=1000,C=0.175,random_state=42)
LR.fit(X_train, y_train)
lr_training_accuracy = LR.score(X_train, y_train)
predictions = LR.predict(X_test)
lr_predictions = [int(x) for x in predictions]

# NOTE(review): both `seed` and `random_state` are passed; which one takes
# effect presumably depends on the installed xgboost version - confirm and
# keep only one.
xgboost = XGBRegressor(learning_rate=0.005,
                       n_estimators=6000,
                       max_depth=4,
                       min_child_weight=0,
                       gamma=0.6,
                       subsample=0.7,
                       colsample_bytree=0.7,
                       objective='reg:squarederror',
                       nthread=-1,
                       scale_pos_weight=1,
                       seed=27,
                       reg_alpha=0.00006,
                       random_state=42)

xgb = xgboost.fit(X_train,y_train)

# A regressor's .score() is the R^2 coefficient, not an accuracy. Turn the
# continuous outputs into 0/1 labels (round, then clip so values outside
# [0, 1] cannot become -1 or 2) and measure the share of correct labels.
xgb_train_labels = np.clip(np.rint(xgb.predict(X_train)), 0, 1)
xgb_training_accuracy = float(np.mean(xgb_train_labels == np.asarray(y_train)))

xgb_predictions = [int(x) for x in np.clip(np.rint(xgb.predict(X_test)), 0, 1)]


print("Logistic Regression training accuracy: %.2f%%" % (lr_training_accuracy * 100.0))
print("\nXGB training accuracy: %.2f%%" % (xgb_training_accuracy * 100.0))

In [None]:
# Combine the results from both models: a passenger is predicted to survive
# only when both models predict survival. For 0/1 inputs this is what
# round((lr + xgb) / 2) evaluates to, because Python rounds the 0.5 tie to the
# even value 0; spelling the rule out removes the reliance on that rounding
# mode and guarantees a 0/1 label even if a regressor-derived label falls
# outside {0, 1}.
predictions = [int(lr_pred >= 1 and xgb_pred >= 1)
               for lr_pred, xgb_pred in zip(lr_predictions, xgb_predictions)]
# Create submission file
submission = pd.DataFrame({'PassengerId':test_df['PassengerId'],'Survived':predictions})
submission.to_csv('submission.csv',index = False)